<a href="https://colab.research.google.com/github/aleksanderprofic/Machine-Learning/blob/master/Classification/ModelSelection/social_network_ads_model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification model selection

### Selecting the best model for particular problem out of all learned classification models:
* Logistic Regression, 
* KNN, 
* SVM,
* Naive Bayes
* Decision Trees,
* Random Forests


## Data preprocessing

In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('Social_Network_Ads.csv')
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


### Extracting dependent and independent variables

In [2]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

### Feature Scaling

In [3]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

### Splitting dataset into the Training Set and the Test Set 

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## Training and predictions

### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score

log_regressor = LogisticRegression()
log_regressor.fit(X_train, y_train)
log_y_pred = log_regressor.predict(X_test)

cm = confusion_matrix(y_test, log_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, log_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, log_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 84.00%
Recall score: 68.42%
Confusion matrix: 
[[58  4]
 [12 26]]


#### Applying k-fold Cross Validation

In [6]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=log_regressor, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 84.67%
Standard deviation: 4.99%


### KNN

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)

cm = confusion_matrix(y_test, knn_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, knn_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, knn_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 89.00%
Recall score: 84.21%
Confusion matrix: 
[[57  5]
 [ 6 32]]


#### Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 92.00%
Standard deviation: 3.40%


### SVM

In [9]:
from sklearn.svm import SVC

svc = SVC(degree=4)
svc.fit(X_train, y_train)
svc_y_pred = svc.predict(X_test)

cm = confusion_matrix(y_test, svc_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, svc_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, svc_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 91.00%
Recall score: 89.47%
Confusion matrix: 
[[57  5]
 [ 4 34]]


#### Applying k-fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 90.67%
Standard deviation: 4.16%


### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)

cm = confusion_matrix(y_test, nb_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, nb_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, nb_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 88.00%
Recall score: 78.95%
Confusion matrix: 
[[58  4]
 [ 8 30]]


#### Applying k-fold Cross Validation

In [12]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=nb, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 88.33%
Standard deviation: 5.22%


### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_y_pred = tree.predict(X_test)

cm = confusion_matrix(y_test, tree_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, tree_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, tree_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 90.00%
Recall score: 81.58%
Confusion matrix: 
[[59  3]
 [ 7 31]]


#### Applying k-fold Cross Validation

In [14]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 87.00%
Standard deviation: 4.33%


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=10)
forest.fit(X_train, y_train)
forest_y_pred = forest.predict(X_test)

cm = confusion_matrix(y_test, forest_y_pred)

print('Accuracy score: {:.2f}%'.format(accuracy_score(y_test, forest_y_pred) * 100))
print('Recall score: {:.2f}%'.format(recall_score(y_test, forest_y_pred) * 100))
print(f'Confusion matrix: \n{cm}')

Accuracy score: 89.00%
Recall score: 81.58%
Confusion matrix: 
[[58  4]
 [ 7 31]]


#### Applying k-fold Cross Validation

In [16]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=10)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 88.67%
Standard deviation: 3.06%


##### Running above code several times showed that KNearestNeighbors and Support Vector Machines seem to be the best classification models for this problem because they are consistantly reaching the highest accuracy and recall scores.