# Classification Model Selection

# For Large Datasets

In [1]:
# Required Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [2]:
# importing the dataset
ds = pd.read_csv("Social_Network_Ads.csv", nrows=200)


# Data Preprocessing

In [None]:
# preprocessing if required


In [3]:
# splitting into dependent and independent variables
X = ds.iloc[:, 2:4].values
y = ds.iloc[:, -1].values
X

array([[    19,  19000],
       [    35,  20000],
       [    26,  43000],
       [    27,  57000],
       [    19,  76000],
       [    27,  58000],
       [    27,  84000],
       [    32, 150000],
       [    25,  33000],
       [    35,  65000],
       [    26,  80000],
       [    26,  52000],
       [    20,  86000],
       [    32,  18000],
       [    18,  82000],
       [    29,  80000],
       [    47,  25000],
       [    45,  26000],
       [    46,  28000],
       [    48,  29000],
       [    45,  22000],
       [    47,  49000],
       [    48,  41000],
       [    45,  22000],
       [    46,  23000],
       [    47,  20000],
       [    49,  28000],
       [    47,  30000],
       [    29,  43000],
       [    31,  18000],
       [    31,  74000],
       [    27, 137000],
       [    21,  16000],
       [    28,  44000],
       [    27,  90000],
       [    35,  27000],
       [    33,  28000],
       [    30,  49000],
       [    26,  72000],
       [    27,  31000],


In [4]:
# y dataset
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [5]:
# Feature scaling
sc = StandardScaler()
SX = sc.fit_transform(X)
SX

array([[-1.52370992, -1.34479807],
       [ 0.62804284, -1.31234638],
       [-0.58231809, -0.56595749],
       [-0.44783354, -0.11163382],
       [-1.52370992,  0.50494831],
       [-0.44783354, -0.07918213],
       [-0.44783354,  0.76456184],
       [ 0.22458919,  2.90637344],
       [-0.71680264, -0.8904744 ],
       [ 0.62804284,  0.14797971],
       [-0.58231809,  0.63475507],
       [-0.58231809, -0.27389227],
       [-1.38922537,  0.82946522],
       [ 0.22458919, -1.37724976],
       [-1.65819447,  0.69965846],
       [-0.17886445,  0.63475507],
       [ 2.2418574 , -1.15008793],
       [ 1.97288831, -1.11763623],
       [ 2.10737285, -1.05273285],
       [ 2.37634195, -1.02028116],
       [ 1.97288831, -1.247443  ],
       [ 2.2418574 , -0.37124734],
       [ 2.37634195, -0.63086087],
       [ 1.97288831, -1.247443  ],
       [ 2.10737285, -1.21499131],
       [ 2.2418574 , -1.31234638],
       [ 2.5108265 , -1.05273285],
       [ 2.2418574 , -0.98782947],
       [-0.17886445,

# Without Feature Scaling

In [6]:
# Model Tuner
model_params = {
        "Gaussian_NB":{"model":GaussianNB(),
                        "params":{}
                        },
         "Multinomial_NB":{"model":MultinomialNB(),
                        "params":{}
                        },
         "Bernoulli_NB":{"model":BernoulliNB(),
                        "params":{}
                        },
         "DecisionTree_Classifier":{"model":DecisionTreeClassifier(),
                        "params":{"criterion":["gini", "entropy"]}
                        }
                } 
               
model_params_loop = {
        "svc":{"model":SVC(),
                "params":{"C":[1.5, 2.0 , 2.5, 3.5, 4],
                          "kernel":["rbf","poly","linear"]},
        },
        "Random_Forest":{"model":RandomForestClassifier(),
                        "params":{"n_estimators":[10, 25, 50, 75, 100, 150],
                                  "criterion":["gini","entropy"]}
                        },
        "Logistic_Regression":{"model":LogisticRegression(),
                               "params":{"C":[1.5, 2.0 , 2.5, 3.5, 4]}
                               },
        "K_NN":{"model":KNeighborsClassifier(),
                        "params":{"n_neighbors":[5, 7, 9, 11]}
                        }}

In [7]:
# To run model_params we need for loop.
score = []
for model_name, mp in model_params.items():
    classifier = RandomizedSearchCV(mp["model"], mp["params"],
                              cv = 5, return_train_score=False, n_iter = 3)
    classifier.fit(X, y)
    score.append({"model":model_name,
                  "best_score":classifier.best_score_,
                  "best_params" : classifier.best_params_})

for _ in range(5):
    for model_name, mp in model_params_loop.items():
        classifier = RandomizedSearchCV(mp["model"], mp["params"],
                                cv = 5, return_train_score=False, n_iter = 3)
        classifier.fit(X, y)
        score.append({"model":model_name,
                    "best_score":classifier.best_score_,
                    "best_params" : classifier.best_params_})
    

df = pd.DataFrame(score, columns = ["model", "best_score", "best_params"])



#The Best Model Is:(Without Feature Scaling)

In [8]:

df[["model", "best_params","best_score"]][df.best_score == df.best_score.max()]


Unnamed: 0,model,best_params,best_score
17,Random_Forest,"{'n_estimators': 10, 'criterion': 'entropy'}",0.945


# Other model Performances

In [12]:
df.sort_values("best_score", ascending=False)

Unnamed: 0,model,best_score,best_params
17,Random_Forest,0.945,"{'n_estimators': 10, 'criterion': 'entropy'}"
9,Random_Forest,0.935,"{'n_estimators': 50, 'criterion': 'entropy'}"
21,Random_Forest,0.935,"{'n_estimators': 50, 'criterion': 'entropy'}"
5,Random_Forest,0.935,"{'n_estimators': 50, 'criterion': 'gini'}"
13,Random_Forest,0.935,"{'n_estimators': 75, 'criterion': 'entropy'}"
3,DecisionTree_Classifier,0.93,{'criterion': 'entropy'}
0,Gaussian_NB,0.905,{}
16,svc,0.895,"{'kernel': 'poly', 'C': 2.5}"
12,svc,0.895,"{'kernel': 'poly', 'C': 1.5}"
8,svc,0.895,"{'kernel': 'poly', 'C': 2.5}"


# self increasing model efficiency by tuning

In [None]:
#cross_val_score()

# With Feature Scaling


In [15]:
model_params_upd = {
        "Gaussian_NB":{"model":GaussianNB(),
                        "params":{}
                        },
        "Bernoulli_NB":{"model":BernoulliNB(),
                        "params":{}
                        },
        "DecisionTree_Classifier":{"model":DecisionTreeClassifier(),
                        "params":{"criterion":["gini", "entropy"]}
                        }
                } 

In [20]:
# To run model_params we need for loop.
score1 = []
for model_name, mp in model_params_upd.items():
    classifier1 = RandomizedSearchCV(mp["model"], mp["params"],
                              cv = 5, return_train_score=False, n_iter = 3)
    classifier1.fit(SX, y)
    score1.append({"model":model_name,
                  "best_score":classifier1.best_score_,
                  "best_params" : classifier1.best_params_})
for _ in range(5):
    for model_name, mp in model_params_loop.items():
        classifier1 = RandomizedSearchCV(mp["model"], mp["params"],
                                cv = 5, return_train_score=False, n_iter = 3)
        classifier1.fit(SX, y)
        score1.append({"model":model_name,
                    "best_score":classifier1.best_score_,
                    "best_params" : classifier1.best_params_})


df1 = pd.DataFrame(score1, columns = ["model", "best_score", "best_params"])



# The Best Model is: (With Feature Scaling)

In [21]:

df1[["model", "best_params","best_score"]][df1.best_score == df1.best_score.max()]


Unnamed: 0,model,best_params,best_score
3,svc,"{'kernel': 'poly', 'C': 4}",0.945
7,svc,"{'kernel': 'poly', 'C': 2.0}",0.945
11,svc,"{'kernel': 'poly', 'C': 2.5}",0.945
15,svc,"{'kernel': 'poly', 'C': 4}",0.945
27,svc,"{'kernel': 'poly', 'C': 2.0}",0.945
31,svc,"{'kernel': 'poly', 'C': 1.5}",0.945
35,svc,"{'kernel': 'poly', 'C': 2.5}",0.945
39,svc,"{'kernel': 'poly', 'C': 2.5}",0.945


# Other Model performances!

In [19]:
df1.sort_values("best_score", ascending = False)

Unnamed: 0,model,best_score,best_params
3,svc,0.945,"{'kernel': 'poly', 'C': 1.5}"
15,svc,0.945,"{'kernel': 'poly', 'C': 3.5}"
7,svc,0.945,"{'kernel': 'poly', 'C': 4}"
22,K_NN,0.94,{'n_neighbors': 11}
6,K_NN,0.94,{'n_neighbors': 11}
10,K_NN,0.935,{'n_neighbors': 9}
18,K_NN,0.935,{'n_neighbors': 9}
14,K_NN,0.935,{'n_neighbors': 9}
11,svc,0.935,"{'kernel': 'rbf', 'C': 4}"
4,Random_Forest,0.935,"{'n_estimators': 50, 'criterion': 'entropy'}"


# self increasing model efficiency by tuning


In [None]:
# cross_val_score()