In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw SUV purchase records from the CSV next to the notebook.
data = pd.read_csv(filepath_or_buffer="suv_data.csv")

In [3]:
# Preview the first five rows of the raw frame.
data.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
# Remove the 'User ID' column before encoding.
# `columns=` replaces the positional axis argument (`drop('User ID', 1)`),
# which was deprecated in pandas 1.3 and raises TypeError on pandas >= 2.0.
data = data.drop(columns='User ID')

# One-hot encode 'Gender'. drop_first=True keeps a single indicator column
# (Gender_Male) instead of two perfectly correlated ones.
gender = pd.get_dummies(data, columns=['Gender'], drop_first=True)

In [5]:
# Preview the encoded frame (Gender replaced by its dummy column).
gender.head(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [6]:
# Count missing values per column (isnull is an alias of isna).
data.isnull().sum()

Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler

# Separate the target ('Purchased') from the feature columns.
# `columns=` replaces the positional axis argument (`drop('Purchased', 1)`),
# which was deprecated in pandas 1.3 and raises TypeError on pandas >= 2.0.
scale = StandardScaler()
X = gender.drop(columns='Purchased')
y = gender.Purchased

# Standardise every feature to zero mean / unit variance; the result is a
# NumPy array, not a DataFrame.
# NOTE(review): the scaler is fit on the full dataset and the scaled array is
# then passed to cross_val_score / GridSearchCV, so each validation fold has
# influenced the scaling statistics. Consider wrapping scaler + estimator in a
# sklearn Pipeline if unbiased CV estimates are needed.
X = scale.fit_transform(X)
X

array([[-1.78179743, -1.49004624,  1.02020406],
       [-0.25358736, -1.46068138,  1.02020406],
       [-1.11320552, -0.78528968, -0.98019606],
       ...,
       [ 1.17910958, -1.46068138, -0.98019606],
       [-0.15807423, -1.07893824,  1.02020406],
       [ 1.08359645, -0.99084367, -0.98019606]])

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Per-fold accuracy of a default logistic regression (default CV splitting).
cross_val_score(estimator=LogisticRegression(), X=X, y=y)

array([0.7   , 0.95  , 0.9375, 0.8125, 0.7   ])

In [10]:
# Mean cross-validated accuracy for logistic regression.
print(
    'Logistic AVerage Score = ',
    cross_val_score(estimator=LogisticRegression(), X=X, y=y).mean(),
)

Logistic AVerage Score =  0.82


In [11]:
# Per-fold accuracy of a linear-kernel SVC with C = 1.
cross_val_score(estimator=SVC(C=1, kernel='linear'), X=X, y=y)

array([0.6875, 0.9625, 0.9   , 0.775 , 0.7125])

In [12]:
# Mean cross-validated accuracy for the linear-kernel SVC.
print(
    'SVC AVerage Score = ',
    cross_val_score(estimator=SVC(C=1, kernel='linear'), X=X, y=y).mean(),
)

SVC AVerage Score =  0.8074999999999999


In [13]:
# Cross-validate a random forest.
# random_state pins the bootstrap sampling and per-split feature sampling so
# that re-running the notebook reproduces the same fold scores; without it the
# numbers change on every run. (Scores recorded before seeding may differ.)
r = cross_val_score(RandomForestClassifier(random_state=42), X, y)
print(r)
print('Random forest Average Score =', np.mean(r))

[0.8    0.925  0.875  0.8375 0.8   ]
Random forest Average Score = 0.8474999999999999


In [14]:
# Cross-validate a 3-nearest-neighbours classifier and report per-fold and
# mean accuracy.
k = cross_val_score(estimator=KNeighborsClassifier(n_neighbors=3), X=X, y=y)
print(k)
print('KNN  Average Score =', k.mean())

[0.9125 0.9125 0.8875 0.8    0.8125]
KNN  Average Score = 0.865


In [15]:
# Cross-validate a decision tree.
# random_state fixes the feature permutation the tree uses when choosing
# between candidate splits, so repeated runs give the same fold scores.
# (Scores recorded before seeding may differ.)
d = cross_val_score(DecisionTreeClassifier(random_state=42), X, y)
print(d)
print('Decision Tree Average Score =', np.mean(d))

[0.75   0.9    0.825  0.775  0.7875]
Decision Tree Average Score = 0.8074999999999999


In [16]:
from sklearn.model_selection import GridSearchCV

# Using GridSearchCV

In [17]:
# Grid search for SVC: try every (C, kernel) combination with 5-fold CV.
model = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={'C': [1, 10, 20], "kernel": ['rbf', 'linear']},
    cv=5,
    return_train_score=False,
)

model.fit(X, y)
# One row per parameter combination, with per-split and mean test scores.
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003801,0.000749,0.002001,0.0006332397,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.9125,0.9375,0.9125,0.8125,0.925,0.9,0.044721,1
1,0.003401,0.00049,0.0008,0.0004000905,1,linear,"{'C': 1, 'kernel': 'linear'}",0.6875,0.9625,0.9,0.775,0.7125,0.8075,0.106829,4
2,0.003601,0.00049,0.0014,0.0004902129,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.8875,0.9375,0.9125,0.825,0.8875,0.89,0.037417,2
3,0.004802,0.0004,0.001001,5.560829e-07,10,linear,"{'C': 10, 'kernel': 'linear'}",0.6875,0.9625,0.9,0.775,0.7125,0.8075,0.106829,4
4,0.004201,0.0004,0.001401,0.0008000374,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.875,0.9375,0.9,0.825,0.8875,0.885,0.036572,3
5,0.009403,0.002578,0.001401,0.0004902323,20,linear,"{'C': 20, 'kernel': 'linear'}",0.6875,0.9625,0.9,0.775,0.7125,0.8075,0.106829,4


In [18]:
# Winning parameter combination and its mean cross-validated score.
(model.best_params_, model.best_score_)

({'C': 1, 'kernel': 'rbf'}, 0.9)

## Selecting the best model and its best parameters

In [19]:
# Search space for the model comparison: one entry per candidate estimator,
# each with the estimator instance ('model') and the hyper-parameter grid
# ('parameter') that GridSearchCV should explore.
# The random forest and decision tree are seeded with random_state so that the
# comparison is reproducible across notebook runs; the other estimators are
# used with their existing settings.
classifier = {
    'SVC': {
        'model': SVC(gamma='auto'),
        'parameter': {'C': [1, 5, 8],
                      'kernel': ['linear', 'rbf']},
    },
    'Logmodel': {
        'model': LogisticRegression(),
        'parameter': {'C': [1, 2, 5]},
    },
    "RFc": {
        'model': RandomForestClassifier(random_state=42),
        'parameter': {'n_estimators': [5, 10, 15]},
    },
    "KNN": {
        'model': KNeighborsClassifier(),
        'parameter': {'n_neighbors': [2, 3, 5, 10]},
    },
    'DTC': {
        'model': DecisionTreeClassifier(random_state=42),
        'parameter': {'criterion': ['gini', 'entropy']},
    },
}

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Run a 5-fold grid search for every candidate in `classifier` and collect the
# best parameters and best mean CV score of each into one summary table.
score = []
for model_name, param in classifier.items():
    # fit() returns the fitted search object, so construction and fitting chain.
    clf = GridSearchCV(
        estimator=param['model'],
        param_grid=param['parameter'],
        cv=5,
    ).fit(X, y)
    score.append(dict([
        ("model", model_name),
        ('parameter', clf.best_params_),
        ('best result', clf.best_score_),
    ]))
pd.DataFrame(score)

Unnamed: 0,model,parameter,best result
0,SVC,"{'C': 1, 'kernel': 'rbf'}",0.9
1,Logmodel,{'C': 1},0.82
2,RFc,{'n_estimators': 15},0.87
3,KNN,{'n_neighbors': 10},0.9075
4,DTC,{'criterion': 'gini'},0.81


In [22]:
# List the hyper-parameter names a decision tree accepts (candidates for a grid).
DecisionTreeClassifier().get_params(deep=True).keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

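# The best parameters are SVC (`'C': 1, 'kernel': 'rbf'`) and KNN with K = 10.
SVC will be the better choice: KNN scores only marginally higher (0.9075 vs 0.90), and its best value of K is quite large (K = 10 was the largest value tried).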