In [1]:
from graphviz import Digraph

# Pipeline overview diagram: one filled cluster per stage, laid out left to right.
g = Digraph('G')
g.attr(rankdir='LR')
g.node_attr.update(style='filled', fillcolor='white', shape='box')
colors = ['green','red','blue','orange']  # not referenced in this cell; kept so the name stays defined

# Per-node attribute overrides (nodes without overrides use g.node_attr).
transform_attrs = {'style': 'filled', 'fillcolor': 'thistle'}
model_attrs = {'style': 'filled', 'fillcolor': 'peachpuff'}
compare_attrs = {'shape': 'record', 'label': 'comparison |{ test_acc weight 10 | fit_time weight 3 }'}
report_attrs = {'shape': 'record', 'label': 'REPORT | { ARRAY RESULT | CLOUD DOT  }'}

# (cluster name, cluster label, cluster fill colour, [(node name, node attrs), ...])
# Every node is listed in exactly one stage. In DOT an edge statement written
# inside a subgraph also makes both endpoints members of that subgraph, so the
# edges are added on the top-level graph further down instead of inside the
# clusters; otherwise nodes on a stage boundary are claimed by two clusters.
stages = [
	('cluster_0', 'Data pre-processing', 'antiquewhite1', [
		('../input/digit-recognizer/train.csv', {}),
		('train', {}),
		('../input/digit-recognizer/test.csv', {}),
		('test', {}),
		('rmNull', {}),
		('rmOutliers', {}),
		('PreProssTrain', {}),
		('PreProssTest', {}),
	]),
	('cluster_1', 'Data Transformation', 'palegreen3', [
		('t1', transform_attrs),
		('t1_minMax01', {}),
		('t1_pca01', {}),
		('t2', transform_attrs),
		('t2_minMax01', {}),
		('t2_pca02', {}),
	]),
	('cluster_2', 'Training', 'paleturquoise', [
		('gaussian1', model_attrs),
		('knn1', model_attrs),
		('knn2', model_attrs),
	]),
	('cluster_3', 'Validation', 'antiquewhite3', [
		('cross_validate', {}),
	]),
	('cluster_4', 'Comparison', 'antiquewhite2', [
		('compare', compare_attrs),
	]),
	('cluster_5', 'Display Report', 'azure2', [
		('REPORT', report_attrs),
	]),
]

for cluster_name, cluster_label, cluster_fill, stage_nodes in stages:
	with g.subgraph(name=cluster_name) as stage:
		stage.attr(style='filled', fillcolor=cluster_fill)
		stage.attr(rank='same')
		for node_name, node_attrs in stage_nodes:
			stage.node(node_name, **node_attrs)
		stage.attr(label=cluster_label)

# Data flow between the nodes, in the same order as the original diagram.
pipeline_edges = [
	('../input/digit-recognizer/train.csv', 'train'),
	('../input/digit-recognizer/test.csv', 'test'),
	('train', 'rmNull'),
	('test', 'rmNull'),
	('rmNull', 'rmOutliers'),
	('rmOutliers', 'PreProssTrain'),
	('rmOutliers', 'PreProssTest'),
	('PreProssTrain', 't1'),
	('PreProssTest', 't1'),
	('t1', 't1_minMax01'),
	('t1_minMax01', 't1_pca01'),
	('PreProssTrain', 't2'),
	('PreProssTest', 't2'),
	('t2', 't2_minMax01'),
	('t2_minMax01', 't2_pca02'),
	('t1_pca01', 'gaussian1'),
	('t2_pca02', 'knn1'),
	('t2_pca02', 'knn2'),
	('gaussian1', 'cross_validate'),
	('knn1', 'cross_validate'),
	('cross_validate', 'compare'),
	('compare', 'REPORT'),
]
for tail_name, head_name in pipeline_edges:
	g.edge(tail_name, head_name)

# Last expression of the cell: lets the notebook render the graph.
g

IMPORTS

In [2]:
%pylab inline
#%pylab
import numpy as np
from sklearn import datasets, svm, metrics
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score,precision_score, f1_score, roc_curve, auc, make_scorer,roc_auc_score 
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import operator


PREPROCESSING

DATASET IMPORT

In [3]:
## DATASET IMPORT
# Locations of the two CSV files, named once so they are easy to retarget.
TRAIN_CSV_PATH = '../input/digit-recognizer/train.csv'
TEST_CSV_PATH = '../input/digit-recognizer/test.csv'

try:
	dataTrainSet = pd.read_csv(TRAIN_CSV_PATH)
	dataTestSet = pd.read_csv(TEST_CSV_PATH)
except FileNotFoundError:
	print('The path of the dataset is invalid')
	# Re-raise: without the frames every later cell would fail with a NameError
	# that hides the real cause.
	raise

PREPROCESSING

In [4]:
###### ---- PREPROCESSING PHASE ---- ######
def flag_outlier_rows(frame, lower_q=0.01, upper_q=0.8, whisker=1.5):
	"""Mark the rows of ``frame`` that hold at least one out-of-range value.

	For every column the ``lower_q`` and ``upper_q`` quantiles are computed; a
	value is out of range when it lies more than ``whisker`` times the distance
	between those two quantiles below the lower one or above the upper one.

	Parameters:
		frame: DataFrame to inspect (not modified).
		lower_q: lower quantile, defaults to the 0.01 used by this notebook.
		upper_q: upper quantile, defaults to the 0.8 used by this notebook.
		whisker: multiplier applied to the inter-quantile distance.

	Returns:
		Boolean Series indexed like ``frame``; True means the row has at least
		one out-of-range value.
	"""
	low = frame.quantile(lower_q)
	high = frame.quantile(upper_q)
	spread = high - low
	out_of_range = (frame < (low - whisker * spread)) | (frame > (high + whisker * spread))
	return out_of_range.any(axis=1)


## PREPROCESS : RMNULL
# dropna() returns a new frame; the result has to be bound back to take effect.
dataTrainSet = dataTrainSet.dropna()
dataTestSet = dataTestSet.dropna()

## PREPROCESS : RMOUTLIERS
# The target column is excluded so the rule only looks at features.
trainOutlierRows = flag_outlier_rows(dataTrainSet.drop(columns=['label'], errors='ignore'))
testOutlierRows = flag_outlier_rows(dataTestSet)

# NOTE(review): the flagged rows are reported, not dropped, so the frames used by
# the later cells keep the rows they had before (the previous code built the
# filtered frames but never assigned them). When a column's two quantiles are
# equal the allowed range collapses to a single value and every other value in
# that column is flagged; on mostly-constant columns this may flag a large share
# of the rows. Check the counts below before filtering with
# dataTrainSet[~trainOutlierRows].
pd.Series(
	{'train': int(trainOutlierRows.sum()), 'test': int(testOutlierRows.sum())},
	name='rows flagged as outliers',
)



SPLITTING

In [5]:
## DATASET SPLIT
# Target vector: the 'label' column. Feature matrix: every other column.
y_train = dataTrainSet['label']
X_train = dataTrainSet.drop(columns=['label'])


TRANSFORMATION

In [6]:
###### ---- TRANSFORMATION PHASE ---- ######
# MinMax TRANSFORMATION: rescales every feature to the [0, 1] range.
minMax01 = MinMaxScaler(feature_range=(0,1),clip=False,copy=True)

# PCA TRANSFORMATION: two projections that differ only in n_components.
pca01 = PCA(n_components=0.62)
pca02 = PCA(n_components=0.41)

# STANDARDSCALER TRANSFORMATION (instantiated here, not applied in this cell)
standardScaler1 = StandardScaler(copy=True, with_mean=True, with_std=True)

## ---- TRANSFORMATION PROCESSING ---- ##
# Both transformations start from the same min-max scaled matrix, so the scaler
# is fitted and applied once and its output is shared by the two projections.
# NOTE(review): scaler and PCA are fitted on the whole of X_train before any
# cross-validation split is made - confirm this is intended.
X_train_minMax01 = minMax01.fit_transform(X_train)
### Transformation : 1
X_train_t1 = pca01.fit_transform(X_train_minMax01)
### Transformation : 2
X_train_t2 = pca02.fit_transform(X_train_minMax01)


TRAINING

KNN CLASSIFIER

In [7]:
# KNN search #1: randomized search over the neighbour count with 2 stratified folds.
# Both the fold shuffling and the parameter sampling are seeded so that a
# Restart & Run All reproduces the same search.
kfold_knn1 = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
pipe_knn1 = Pipeline([('clf_knn', KNeighborsClassifier())])
# sp_randint(1, 11) draws integers from 1 to 10 (the upper bound is exclusive).
distribution_knn1_param = {
	'clf_knn__n_neighbors': sp_randint(1, 11),
	'clf_knn__algorithm': ['auto'],
}
rs_knn1 = RandomizedSearchCV(
	estimator=pipe_knn1,
	param_distributions=distribution_knn1_param,
	cv=kfold_knn1,
	verbose=2,
	n_jobs=-1,
	n_iter=5,
	random_state=42,
)


In [8]:
# KNN search #2: same setup as the first KNN search with a narrower neighbour range.
# Seeded fold shuffling and parameter sampling keep the search reproducible.
kfold_knn2 = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
pipe_knn2 = Pipeline([('clf_knn', KNeighborsClassifier())])
# sp_randint(1, 10) draws integers from 1 to 9 (the upper bound is exclusive).
distribution_knn2_param = {
	'clf_knn__n_neighbors': sp_randint(1, 10),
	'clf_knn__algorithm': ['auto'],
}
rs_knn2 = RandomizedSearchCV(
	estimator=pipe_knn2,
	param_distributions=distribution_knn2_param,
	cv=kfold_knn2,
	verbose=2,
	n_jobs=-1,
	n_iter=5,
	random_state=42,
)


GAUSSIAN CLASSIFIER

In [9]:
# Gaussian naive Bayes search: randomized search over var_smoothing with 5 stratified folds.
# Seeded fold shuffling and parameter sampling keep the search reproducible.
kfold_gaussian1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe_gaussian1 = Pipeline([('clf_nb', GaussianNB())])
# Five candidate values, logarithmically spaced between 1e-5 and 1.
distribution_gaussian1_param = {'clf_nb__var_smoothing': np.logspace(-5, 0, 5)}
rs_gaussian1 = RandomizedSearchCV(
	estimator=pipe_gaussian1,
	param_distributions=distribution_gaussian1_param,
	cv=kfold_gaussian1,
	verbose=2,
	n_jobs=-1,
	n_iter=5,
	random_state=42,
)


RANDOMFOREST CLASSIFIER

COMPARISON

In [10]:
## # validation + comparison
scoring = {'acc' : 'accuracy'}
test_acc_coef = 10
fit_time_coef = 3

# One entry per compared model: the search estimator to validate, the
# transformed training features it is evaluated on, and its colour in the chart.
# Adding a model to the comparison only requires a new entry here.
candidates = {
	'gaussian1': {'search': rs_gaussian1, 'features': X_train_t1, 'color': '#c7a23d'},
	'knn1': {'search': rs_knn1, 'features': X_train_t2, 'color': '#71fdcf'},
}

# scores[name]        -> per-fold arrays returned by cross_validate
# models_scores[name] -> single global score used to pick the winner
scores = {}
models_scores = {}
for model_name, candidate in candidates.items():
	cv_result = cross_validate(candidate['search'], candidate['features'], y_train, cv=5, scoring=scoring)
	scores[model_name] = {
		'test_acc': cv_result['test_acc'],
		'fit_time': cv_result['fit_time'],
	}
	mean_test_acc = np.mean(cv_result['test_acc'])
	mean_fit_time = np.mean(cv_result['fit_time'])
	# COMPUTE GLOBAL SCORE: grows with accuracy, shrinks with fit time.
	# NOTE(review): both coefficients multiply every model's score by the same
	# constant, so they scale the totals but cannot change which model wins.
	models_scores[model_name] = (mean_test_acc * test_acc_coef) * (fit_time_coef / (1 + mean_fit_time))

# WINNER MODEL: the name with the highest global score.
winner_model = max(models_scores, key=models_scores.get)
print('winner model :',winner_model)


# COMPARISON CHART: one point per cross-validation fold.
chart_fig, chart_ax = plt.subplots()
for model_name, candidate in candidates.items():
	chart_ax.scatter(
		scores[model_name]['test_acc'],
		scores[model_name]['fit_time'],
		marker='x',
		color=candidate['color'],
		label=model_name,
	)
chart_ax.set_title('MODELS COMPARISON')
chart_ax.set_xlabel('accuracy')
chart_ax.set_ylabel('fit_time(sec)')
chart_ax.legend(loc='lower right', bbox_to_anchor=(1.3,0))
plt.show()


# COMPARISON TABLE: rounded means per model. The rounded values go into their
# own frame so the raw per-fold results in `scores` stay available afterwards.
df = pd.DataFrame({
	model_name: [
		np.round(np.mean(scores[model_name]['test_acc']), 3),
		np.round(np.mean(scores[model_name]['fit_time']), 3),
		np.round(models_scores[model_name], 3),
	]
	for model_name in candidates
})
df.insert(0, 'metric', ['test_acc', 'fit_time', 'total'])

fig, ax = plt.subplots()
fig.patch.set_visible(False)
ax.axis('off')
ax.axis('tight')
ax.table(cellText=df.values, colLabels=df.columns, loc='center')
fig.tight_layout()
plt.show()
