# Classification Model Selection

# For Small Datasets

In [7]:
# Required Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [8]:
# importing the dataset
ds = pd.read_csv(".csv", nrows=15)


# Data Preprocessing

In [9]:
# preprocessing if required


In [10]:
# splitting into dependent and independent variables
X = ds.iloc[:, :-1].values
y = ds.iloc[:, -1].values
X

array([[    19,  19000],
       [    35,  20000],
       [    26,  43000],
       [    27,  57000],
       [    19,  76000],
       [    27,  58000],
       [    27,  84000],
       [    32, 150000],
       [    25,  33000],
       [    35,  65000],
       [    26,  80000],
       [    26,  52000],
       [    20,  86000],
       [    32,  18000],
       [    18,  82000]])

In [None]:
# y dataset
y

In [12]:
# Feature scaling
sc = StandardScaler()
SX = sc.fit_transform(X)
SX

array([[-1.34948954, -1.27772353],
       [ 1.62186357, -1.24768301],
       [-0.04952255, -0.55675101],
       [ 0.13618702, -0.1361837 ],
       [-1.34948954,  0.43458622],
       [ 0.13618702, -0.10614318],
       [ 0.13618702,  0.67491039],
       [ 1.06473486,  2.65758484],
       [-0.23523212, -0.85715623],
       [ 1.62186357,  0.10414048],
       [-0.04952255,  0.55474831],
       [-0.04952255, -0.28638631],
       [-1.16377997,  0.73499144],
       [ 1.06473486, -1.30776406],
       [-1.53519911,  0.61482935]])

# Without Feature Scaling

In [13]:
# Model Tuner
model_params = {
        "svc":{"model":SVC(),
                "params":{"C":[1, 5, 10],
                          "kernel":["rbf","poly","linear"]},
                          
            },
        "Random_Forest":{"model":RandomForestClassifier(),
                        "params":{"n_estimators":[10,50,100],
                                  "criterion":["gini","entropy"]}
                        },
        "Logistic_Regression":{"model":LogisticRegression(),
                               "params":{"C":[1,5,10]}
                               },
        "Gaussian_NB":{"model":GaussianNB(),
                        "params":{}
                        },
         "Multinomial_NB":{"model":MultinomialNB(),
                        "params":{}
                        },
         "Bernoulli_NB":{"model":BernoulliNB(),
                        "params":{}
                        },
         "K_NN":{"model":KNeighborsClassifier(),
                        "params":{"n_neighbors":[5, 7, 9]}
                        },
         "DecisionTree_Classifier":{"model":DecisionTreeClassifier(),
                        "params":{"criterion":["gini", "entropy"]}
                        }
                } 
               


In [14]:
# To run model_params we need for loop.
score = []
for model_name, mp in model_params.items():
    classifier = GridSearchCV(mp["model"], mp["params"],
                              cv = 5, return_train_score=False)
    classifier.fit(X, y)
    score.append({"model":model_name,
                  "best_score":classifier.best_score_,
                  "best_params" : classifier.best_params_})
    
df = pd.DataFrame(score, columns = ["model", "best_score", "best_params"])

#The Best Model Is:(Without Feature Scaling)

In [15]:

df[["model", "best_params","best_score"]][df.best_score == df.best_score.max()]


Unnamed: 0,model,best_params,best_score
0,svc,"{'C': 1, 'kernel': 'rbf'}",0.666667
5,Bernoulli_NB,{},0.666667
6,K_NN,{'n_neighbors': 7},0.666667


# Other model Performances

In [16]:
df

Unnamed: 0,model,best_score,best_params
0,svc,0.666667,"{'C': 1, 'kernel': 'rbf'}"
1,Random_Forest,0.6,"{'criterion': 'gini', 'n_estimators': 10}"
2,Logistic_Regression,0.6,{'C': 1}
3,Gaussian_NB,0.533333,{}
4,Multinomial_NB,0.466667,{}
5,Bernoulli_NB,0.666667,{}
6,K_NN,0.666667,{'n_neighbors': 7}
7,DecisionTree_Classifier,0.4,{'criterion': 'gini'}


# With Feature Scaling


In [17]:
# Model Tuner
model_params1 = {
        "svc":{"model":SVC(),
                "params":{"C":[1, 10, 20],
                          "kernel":["rbf","poly","linear"]},
                          
            },
        "Random_Forest":{"model":RandomForestClassifier(),
                        "params":{"n_estimators":[10,20,30],
                                  "criterion":["gini","entropy"]}
                        },
        "Logistic_Regression":{"model":LogisticRegression(),
                               "params":{"C":[1,5,10]}
                               },
        "Gaussian_NB":{"model":GaussianNB(),
                        "params":{}
                        },
         "Bernoulli_NB":{"model":BernoulliNB(),
                        "params":{}
                        },
         "K_NN":{"model":KNeighborsClassifier(),
                        "params":{"n_neighbors":[5,7]}
                        },
         "DecisionTree_Classifier":{"model":DecisionTreeClassifier(),
                        "params":{"criterion":["gini", "entropy"]}
                        }
                } 
               


In [27]:
# To run model_params we need for loop.
score1 = []
for model_name, mp in model_params1.items():
    classifier1 = GridSearchCV(mp["model"], mp["params"],
                              cv = 5, return_train_score=False)
    classifier1.fit(SX, y)
    score1.append({"model":model_name,
                  "best_score":classifier1.best_score_,
                  "best_params" : classifier1.best_params_})
    
df1 = pd.DataFrame(score1, columns = ["model", "best_score", "best_params"])

# The Best Model is: (With Feature Scaling)

In [28]:

df1[["model", "best_params","best_score"]][df1.best_score == df1.best_score.max()]


Unnamed: 0,model,best_params,best_score
0,svc,"{'C': 1, 'kernel': 'rbf'}",0.666667
4,Bernoulli_NB,{},0.666667
5,K_NN,{'n_neighbors': 7},0.666667


# Other Model performances!

In [29]:
df1

Unnamed: 0,model,best_score,best_params
0,svc,0.666667,"{'C': 1, 'kernel': 'rbf'}"
1,Random_Forest,0.533333,"{'criterion': 'gini', 'n_estimators': 10}"
2,Logistic_Regression,0.6,{'C': 1}
3,Gaussian_NB,0.533333,{}
4,Bernoulli_NB,0.666667,{}
5,K_NN,0.666667,{'n_neighbors': 7}
6,DecisionTree_Classifier,0.4,{'criterion': 'gini'}
