In [1]:
import cv2
import os
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from ast import literal_eval
from mpl_toolkits.mplot3d import Axes3D
from timeit import default_timer as timer
import itertools

from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from scikitplot.metrics import plot_roc, auc
import scikitplot as skplt

from sklearn.externals import joblib
from sklearn import datasets
from sklearn.model_selection import GridSearchCV

**Hiper Parametrización**

Hiper Parametrización: para cada modelo svm, dummy, kneighbords, etc., debo aplicar gridserachsv,
se trata de un metodo que agarra cada modelo y le va cambiando los parametros automaticamente, 
y asi nos dice que parametros son los mejores para ese modelo.
Por ejemplo, nosotros usamos svm polinomico de grado 2, pero capaz que el mejor modelo es un svm polinomico de grado 5,
o uno lineal....
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

**Levantamos el dataframe**

In [2]:
df = pd.read_csv('data_frame_imagenes', sep=',')

In [3]:
## Divido datos de entrenamiento y datos de prueba ###
df_datos = df
#X = df_datos[df_datos.columns[:-1]].values
X = df_datos[df_datos.columns[1:-1]].values # Desde la columna 1 hasta la columna anteultima.
y = df_datos[df_datos.columns[len(df_datos.columns)-1]].values # Solamente la ultima columna

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42, 
)

**SVM**

Fuente: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [5]:
param_grid = { 
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [1, 10],
    'degree' : [2,3]
}

In [6]:
svc = svm.SVC(gamma="scale")
clf = GridSearchCV(svc, param_grid=param_grid, cv=5)
clf.fit(X_train, y_train) 

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf', 'poly'), 'C': [1, 10], 'degree': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
print(clf.best_estimator_)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [8]:
print(clf.best_index_)

7


In [9]:
print(clf.best_score_)

0.5647222222222222


In [10]:
print(clf.best_params_)

{'C': 10, 'degree': 2, 'kernel': 'rbf'}


In [11]:
pred=clf.predict(X_test)

In [12]:
print("Accuracy de SVM: ",accuracy_score(y_test,pred))

Accuracy de SVM:  0.6166666666666667


**RandomForestClassifier**

Fuente: 

https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [13]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [14]:
rfc=RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
print(CV_rfc.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [16]:
print(CV_rfc.best_index_)

55


In [17]:
print(CV_rfc.best_score_)

0.5573611111111111


In [18]:
print(CV_rfc.best_params_)

{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}


In [19]:
pred=CV_rfc.predict(X_test)

In [20]:
print("Accuracy de RandomForestClassifier: ",accuracy_score(y_test,pred))

Accuracy de RandomForestClassifier:  0.5844444444444444


**KNeighborsClassifier**

Fuente: https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/

In [21]:
param_grid = { 
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [24]:
knn = KNeighborsClassifier(n_neighbors=5)
grid = GridSearchCV(knn, param_grid=param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
print(grid.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')


In [26]:
print(grid.best_index_)

0


In [27]:
print(grid.best_score_)

0.5968055555555556


In [28]:
print(grid.best_params_)

{'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'}


In [29]:
pred=grid.predict(X_test)

In [30]:
print("Accuracy de KNeighborsClassifier: ",accuracy_score(y_test,pred))

Accuracy de KNeighborsClassifier:  0.6316666666666667
