<a href="https://colab.research.google.com/github/aleksanderprofic/Machine-Learning/blob/master/Classification/ModelSelection/social_network_ads_model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification model selection

### Selecting the best model for particular problem out of all learned classification models:
* Logistic Regression, 
* K-Nearest Neighbors, 
* Support Vector Machines,
* Naive Bayes
* Decision Trees,
* Random Forests


## Data preprocessing

In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('Social_Network_Ads.csv')
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


### Extracting dependent and independent variables

In [2]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

### Feature Scaling

In [3]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

### Splitting dataset into the Training Set and the Test Set 

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## Training and predictions

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score

log_regressor = LogisticRegression()
log_regressor.fit(X_train, y_train)
log_y_pred = log_regressor.predict(X_test)

cm = confusion_matrix(y_test, log_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, log_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, log_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 91.00%
Recall score: 82.35%
Confusion matrix: 
[[63  3]
 [ 6 28]]


#### Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=log_regressor, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 82.33%
Standard deviation: 5.78%


### K-Nearest Neighbors

#### Performing Grid Search to find the best hyper parameter

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

parameters = [{'n_neighbors': [3,4,5,6,7,10]}]
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best mean accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best parameter: {grid_search.best_params_}')

Best mean accuracy: 89.00%
Standard deviation: 3.96%
Best parameter: {'n_neighbors': 3}


In [17]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)

cm = confusion_matrix(y_test, knn_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, knn_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, knn_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 94.00%
Recall score: 91.18%
Confusion matrix: 
[[63  3]
 [ 3 31]]


### Support Vector Machines

#### Performing Grid Search to find the best hyper parameters

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [0, 0.1, 0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 'scale']}]
grid_search = GridSearchCV(estimator=SVC(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best mean accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best parameter: {grid_search.best_params_}')

Best mean accuracy: 89.67%
Standard deviation: 3.79%
Best parameter: {'C': 1, 'gamma': 0.4, 'kernel': 'rbf'}


In [20]:
svc = SVC(C=1.0, kernel='rbf', gamma=0.4)
svc.fit(X_train, y_train)
svc_y_pred = svc.predict(X_test)

cm = confusion_matrix(y_test, svc_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, svc_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, svc_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 96.00%
Recall score: 94.12%
Confusion matrix: 
[[64  2]
 [ 2 32]]


### Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)

cm = confusion_matrix(y_test, nb_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, nb_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, nb_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 94.00%
Recall score: 91.18%
Confusion matrix: 
[[63  3]
 [ 3 31]]


#### Applying k-fold Cross Validation

In [23]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=nb, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 87.00%
Standard deviation: 5.26%


### Decision Tree

#### Performing Grid Search to find the best hyper parameter

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

parameters = [{'criterion': ['gini', 'entropy']}]
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best mean accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best parameter: {grid_search.best_params_}')

Best mean accuracy: 86.00%
Standard deviation: 5.33%
Best parameter: {'criterion': 'entropy'}


In [26]:
tree = DecisionTreeClassifier(criterion='entropy')
tree.fit(X_train, y_train)
tree_y_pred = tree.predict(X_test)

cm = confusion_matrix(y_test, tree_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, tree_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, tree_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 89.00%
Recall score: 73.53%
Confusion matrix: 
[[64  2]
 [ 9 25]]


### Random Forest

#### Performing Grid Search to find the best hyper parameters

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

parameters = [{'n_estimators': [5, 10, 20, 25, 50, 75, 100, 125, 150, 175, 200], 'criterion': ['gini', 'entropy']}]
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best mean accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best parameter: {grid_search.best_params_}')

Best mean accuracy: 89.33%
Standard deviation: 4.42%
Best parameter: {'criterion': 'entropy', 'n_estimators': 25}


In [30]:
forest = RandomForestClassifier(criterion='entropy', n_estimators=25)
forest.fit(X_train, y_train)
forest_y_pred = forest.predict(X_test)

cm = confusion_matrix(y_test, forest_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, forest_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, forest_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 92.00%
Recall score: 85.29%
Confusion matrix: 
[[63  3]
 [ 5 29]]


The best models for this problem are:
1. Support Vector Machines
  - Accuracy: 96%
  - Recall: 94.12%
2. KNearestNeighbors
  - Accuracy: 94%
  - Recall: 91.18%
  - Better training set results than 3rd model
3. Naive Bayes
  - Accuracy: 94%
  - Recall: 91.18%
