<h1 align=center> Ensemble of models </h1>
<h3 align=center> Random forest, SVM, K-nearest neighbors</h3>

# Import libraries

In [4]:
import os
import skimage
from skimage import io
import scipy
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib


  from numpy.core.umath_tests import inner1d


# Load the training, testing, and validation datasets

In [2]:
# All three splits live side by side under datasets/ASL; only the leaf folder
# name differs. The space inside "training _data" is reproduced verbatim.
asl_root = os.path.join("datasets", "ASL")
train_path = os.path.join(asl_root, "training _data")
test_path = os.path.join(asl_root, "test_data")
valid_path = os.path.join(asl_root, "validation_data")

def load_data(data_dir):
    """Load every JPEG image under ``data_dir`` together with its class label.

    ``data_dir`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label of every image found inside it.
    Files placed directly in ``data_dir`` and files without a ``.jpg``
    extension (matched case-insensitively) are ignored.

    Parameters
    ----------
    data_dir : str
        Directory whose immediate sub-directories are the classes.

    Returns
    -------
    images : list
        One entry per image file, as returned by ``skimage.io.imread``.
    labels : list of str
        ``labels[i]`` is the name of the sub-directory ``images[i]`` came from.
    """
    # Sort so the sample order does not depend on the arbitrary order in which
    # ``os.listdir`` returns entries; the seeded shuffle applied later is then
    # reproducible across machines and file systems.
    directories = sorted(
        d for d in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, d))
    )

    images = []
    labels = []
    for d in directories:
        label_dir = os.path.join(data_dir, d)
        file_names = sorted(
            os.path.join(label_dir, f)
            for f in os.listdir(label_dir)
            # Case-insensitive match covers ".jpg", ".JPG" and mixed case.
            if f.lower().endswith(".jpg")
        )
        for f in file_names:
            # ``skimage.data.imread`` was deprecated and later removed from
            # scikit-image; ``skimage.io.imread`` is the supported reader.
            images.append(skimage.io.imread(f))
            labels.append(d)
    return images, labels


# Only the training folder is loaded here; the train/test split used further
# down is carved out of these images.
images, labels = load_data(train_path)
# Last expression of the cell: displays how many images were loaded.
len(images)

5831

In [9]:
# def load_data(data_dir, Class):
#     Random='Random'
    
#     images=[]
#     labels=[]
#     label_dir = os.path.join(data_dir, Class,Random)
#     file_names = [os.path.join(label_dir, f) 
#                 for f in os.listdir(label_dir) 
#                 if (f.endswith(".jpg")) | (f.endswith(".JPG")) ]
#     for f in file_names:
#         images.append(skimage.data.imread(f))
#         labels.append((Class))
#     return images, labels
# images,labels =load_data("datasets/new_arabic_sign_language","al")
# print(len(images))
# print(len(labels))

71
71


# Preparing the training and testing datasets

In [3]:
# Stack the per-image arrays into one array. This only yields a proper
# (n_samples, ...) array when every image has the same shape.
X = np.array(images)
y = np.array(labels)
assert len(X) == len(y), "images and labels must have the same length"
assert X.ndim >= 3, "expected a non-empty set of equally-shaped images"

SEED = 42
TRAIN_FRACTION = 0.8  # 80/20 split; gives 4664 training samples for 5831 images

# Shuffle with a fixed seed so the split below is reproducible. The sizes are
# derived from the data instead of hard-coding 5831 / 4664, so the cell keeps
# working when images are added to or removed from the dataset.
np.random.seed(SEED)
n_samples = len(X)
rnd_idx = np.random.permutation(n_samples)
X = X[rnd_idx]
y = y[rnd_idx]

n_train = int(TRAIN_FRACTION * n_samples)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Flatten each image into a 1-D feature vector. Passing -1 lets NumPy infer the
# feature count, so grayscale (H, W) and colour (H, W, C) images both work.
X_train_2d = X_train.reshape(len(X_train), -1)
X_test_2d = X_test.reshape(len(X_test), -1)

# Standardizing the dataset

In [11]:
from sklearn.preprocessing import StandardScaler

# Cast both splits to float32 before scaling.
train_features = X_train_2d.astype(np.float32)
test_features = X_test_2d.astype(np.float32)

# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split, so the test images never influence the preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_features)
X_test_scaled = scaler.transform(test_features)

# Creating an ensemble of 3 different models:
## Random forest, SVM and K-nearest neighbors

In [7]:
# Load the three previously saved models from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load .pkl files that come from a trusted source.
# NOTE(review): judging by the file names these are grid-search objects rather
# than bare classifiers — confirm; if so, fitting them again repeats the search.
Random_forest = joblib.load('Random_Forest_with_Random_data_and_Grid_search.pkl')
SVM = joblib.load('SVM_with_Random_data_and_Grid_search.pkl')
KNeighhbors = joblib.load('KNeighborsClassifier_with_Random_data_and_Grid_search.pkl')


In [8]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs that are handed to VotingClassifier; each name
# identifies one member model inside the ensemble.
named_estimators = [
    ("random_forest_clf", Random_forest),
    ("K_Neighhbors", KNeighhbors),
    ("svm_clf", SVM),
]


In [9]:
# Build the ensemble with VotingClassifier's default settings; no voting mode
# or weights are passed explicitly.
voting_clf = VotingClassifier(named_estimators)

# Training the ensemble

In [12]:
# Fit every member model on the scaled training split.
# NOTE(review): the captured output below shows cross-validation folds being
# fitted (e.g. "Fitting 5 folds for each of 10 candidates"), i.e. the grid
# searches are run again here — expect this cell to take tens of minutes.
voting_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ......... n_neighbors=3, weights=uniform, score=0.958199 -  30.2s
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ......... n_neighbors=3, weights=uniform, score=0.973205 -  30.7s
[CV] n_neighbors=3, weights=distance .................................
[CV] ......... n_neighbors=3, weights=uniform, score=0.968917 -  31.5s
[CV] n_neighbors=3, weights=distance .................................
[CV] ......... n_neighbors=3, weights=uniform, score=0.966774 -  31.3s
[CV] n_neighbors=3, weights=distance .................................
[CV] ......... n_neighbors=3, weights=uniform, score=0.973176 -  33.3s
[CV] n_neighbors

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 16.8min


[CV] ......... n_neighbors=5, weights=uniform, score=0.965665 -  35.5s
[CV] n_neighbors=5, weights=distance .................................
[CV] ........ n_neighbors=5, weights=distance, score=0.958199 -  33.5s
[CV] n_neighbors=5, weights=distance .................................
[CV] ........ n_neighbors=5, weights=distance, score=0.971061 -  34.4s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] ........ n_neighbors=5, weights=distance, score=0.969989 -  34.3s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] ........ n_neighbors=5, weights=distance, score=0.961415 -  34.7s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] ........ n_neighbors=5, weights=distance, score=0.968884 -  35.4s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] ......... n_neighbors=8, weights=uniform, score=0.948553 -  35.3s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] .

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 37.5min finished


VotingClassifier(estimators=[('random_forest_clf', GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
        ...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         n_jobs=1, voting='hard', weights=None)

# Evaluating results on the training dataset

In [13]:
# Score the ensemble on the data it was trained on; this is an optimistic
# figure and mainly useful for comparison with the test score.
voting_clf.score(X_train_scaled, y_train)

  if diff:


1.0

# Evaluating results on the testing dataset

In [14]:
# Score the ensemble on the held-out split that was not used for fitting.
voting_clf.score(X_test_scaled, y_test)

  if diff:


0.9777206512425022

In [17]:
# NOTE(review): this looks like a no-op — the repr printed after fitting
# already reports voting='hard'. That repr also shows probability=False on the
# SVM, so switching to "soft" voting would presumably fail — confirm before
# changing this.
voting_clf.voting = "hard"

In [18]:
# Re-score the held-out split after explicitly setting the voting mode.
voting_clf.score(X_test_scaled, y_test)

  if diff:


0.9777206512425022

# Saving the model

In [20]:
from sklearn.externals import joblib

# Persist the fitted ensemble as a pickle file in the working directory.
ensemble_file = 'Ensemble.pkl'
joblib.dump(voting_clf, ensemble_file)

['Ensemble.pkl']