In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Load the churn dataset, using the first CSV column as the row index.
data = pd.read_csv("dummy_churn.csv", index_col=0)
# Keep an independent backup of the frame as loaded. `data[:]` only slices the
# frame and may share memory with `data`, so a later in-place edit of `data`
# could leak into the backup; `.copy()` guarantees a separate object.
data2 = data.copy()
# Preview the first five rows.
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1,1,0,1,0,1,0,0,0,1,0,...,29.85,29.85,0,0,0,0,0,0,1,0
2,0,0,0,0,34,1,0,1,0,1,...,56.95,1889.5,0,0,0,1,0,0,0,1
3,0,0,0,0,2,1,0,1,1,0,...,53.85,108.15,1,0,0,0,0,0,0,1
4,0,0,0,0,45,0,0,1,0,1,...,42.3,1840.75,0,0,0,1,0,0,0,0
5,1,0,0,0,2,1,0,0,0,0,...,70.7,151.65,1,1,0,0,0,0,1,0


In [3]:
# Inspect column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 1 to 7043
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   int64  
 5   PhoneService                           7043 non-null   int64  
 6   MultipleLines                          7043 non-null   int64  
 7   OnlineSecurity                         7043 non-null   int64  
 8   OnlineBackup                           7043 non-null   int64  
 9   DeviceProtection                       7043 non-null   int64  
 10  TechSupport                            7043 non-null   int64  
 11  Streaming

In [4]:
# Class balance of the target column before any resampling.
data["Churn"].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [5]:
# Continuous columns that are rescaled; every other column is left as it is.
cols_to_scale = ["tenure", "MonthlyCharges", "TotalCharges"]

# Learn each column's min/max first, then overwrite those columns in `data`
# with the rescaled values.
scaler = MinMaxScaler()
scaler.fit(data[cols_to_scale])
data[cols_to_scale] = scaler.transform(data[cols_to_scale])
# Preview the first five rows after scaling.
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1,1,0,1,0,0.013889,0,0,0,1,0,...,0.115423,0.001275,0,0,0,0,0,0,1,0
2,0,0,0,0,0.472222,1,0,1,0,1,...,0.385075,0.215867,0,0,0,1,0,0,0,1
3,0,0,0,0,0.027778,1,0,1,1,0,...,0.354229,0.01031,1,0,0,0,0,0,0,1
4,0,0,0,0,0.625,0,0,1,0,1,...,0.239303,0.210241,0,0,0,1,0,0,0,0
5,1,0,0,0,0.027778,1,0,0,0,0,...,0.521891,0.01533,1,1,0,0,0,0,1,0


In [6]:
# Separate the target column from the feature matrix.
y = data["Churn"]
X = data.drop(columns=["Churn"])

In [7]:
# Balance the classes by oversampling with SMOTE.
# The seed is fixed so the resampled data, and every score computed from it,
# is repeatable across runs.
# NOTE(review): resampling happens before cross-validation, so synthetic rows
# derived from a validation fold can end up in the training folds and inflate
# the reported scores; consider moving SMOTE into an imblearn Pipeline so it is
# fitted on the training folds only.
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)

In [8]:
# Confirm both classes have the same number of rows after resampling.
y_res.value_counts()

Churn
0    5174
1    5174
Name: count, dtype: int64

# Cross-validation with `cross_val_score`

In [9]:
# Mean 5-fold cross-validation score of a linear-kernel SVM (C=10) on the
# resampled data.
scores = cross_val_score(
    estimator=SVC(kernel="linear", C=10, gamma="auto"),
    X=X_res,
    y=y_res,
    cv=5,
)
np.mean(scores)

0.7933944144409188

In [10]:
# Mean 5-fold cross-validation score of an RBF-kernel SVM (C=10) on the
# resampled data.
scores = cross_val_score(
    estimator=SVC(kernel="rbf", C=10, gamma="auto"),
    X=X_res,
    y=y_res,
    cv=5,
)
np.mean(scores)

0.7954235400424485

In [11]:
# Mean 5-fold cross-validation score of an RBF-kernel SVM (C=20) on the
# resampled data.
scores = cross_val_score(
    estimator=SVC(kernel="rbf", C=20, gamma="auto"),
    X=X_res,
    y=y_res,
    cv=5,
)
np.mean(scores)

0.8020917944443277

# Grid Search CV

In [12]:
# Exhaustively evaluate every C/kernel combination with 5-fold CV.
svc_param_grid = {
    "C": [20, 30, 50, 100],
    "kernel": ["rbf", "linear"],
}
clf = GridSearchCV(
    estimator=SVC(gamma="auto"),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(X_res, y_res)
# Raw per-candidate results as a dict of arrays.
clf.cv_results_

{'mean_fit_time': array([ 2.59738784,  4.8953156 ,  2.85267324,  6.24430075,  3.11823745,
         8.5296092 ,  3.80556045, 15.31815825]),
 'std_fit_time': array([0.10328711, 0.36725531, 0.17457983, 0.4247025 , 0.22365311,
        0.51677472, 0.31374877, 2.29015808]),
 'mean_score_time': array([0.78134532, 0.19394374, 0.78913312, 0.18637981, 0.78618073,
        0.18243165, 0.74484477, 0.21480432]),
 'std_score_time': array([0.01574719, 0.01203489, 0.03423474, 0.01743643, 0.04125413,
        0.00952118, 0.03700705, 0.03325214]),
 'param_C': masked_array(data=[20, 20, 30, 30, 50, 50, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C':

In [13]:
# Tabulate the raw grid-search results: one row per parameter combination.
df= pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.597388,0.103287,0.781345,0.015747,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.769565,0.7657,0.825121,0.823586,0.826486,0.802092,0.028177,4
1,4.895316,0.367255,0.193944,0.012035,20,linear,"{'C': 20, 'kernel': 'linear'}",0.75314,0.758937,0.822705,0.81827,0.81392,0.793394,0.030682,8
2,2.852673,0.17458,0.789133,0.034235,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.764251,0.762319,0.830435,0.82552,0.833736,0.803252,0.032743,3
3,6.244301,0.424702,0.18638,0.017436,30,linear,"{'C': 30, 'kernel': 'linear'}",0.75314,0.757971,0.824155,0.81972,0.814403,0.793878,0.031479,7
4,3.118237,0.223653,0.786181,0.041254,50,rbf,"{'C': 50, 'kernel': 'rbf'}",0.764251,0.76715,0.833816,0.829386,0.836636,0.806248,0.0332,2
5,8.529609,0.516775,0.182432,0.009521,50,linear,"{'C': 50, 'kernel': 'linear'}",0.751208,0.757971,0.824155,0.82262,0.814886,0.794168,0.032538,6
6,3.80556,0.313749,0.744845,0.037007,100,rbf,"{'C': 100, 'kernel': 'rbf'}",0.763285,0.768116,0.835749,0.836153,0.842436,0.809148,0.035586,1
7,15.318158,2.290158,0.214804,0.033252,100,linear,"{'C': 100, 'kernel': 'linear'}",0.750725,0.756039,0.824638,0.82262,0.817786,0.794361,0.033576,5


In [14]:
# Keep only the searched parameters and the mean score of each combination.
df.loc[:, ["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.802092
1,20,linear,0.793394
2,30,rbf,0.803252
3,30,linear,0.793878
4,50,rbf,0.806248
5,50,linear,0.794168
6,100,rbf,0.809148
7,100,linear,0.794361


In [15]:
# Leftover exploration helper (disabled): uncomment to list the attributes of
# the fitted search object.
#dir(clf)

In [16]:
# Best mean cross-validated score found by the grid search.
clf.best_score_

0.8091476897285206

In [17]:
# Parameter combination with the best mean cross-validated score in the grid search.
clf.best_params_

{'C': 100, 'kernel': 'rbf'}

In [18]:
# Randomized search: score only `n_iter` randomly drawn combinations out of
# the 14 possible ones (7 values of C x 2 kernels) instead of the full grid.
# random_state pins which combinations are drawn, so the results table and the
# reported best parameters are repeatable across runs.
rs = RandomizedSearchCV(
    SVC(gamma="auto"),
    {
        "C": [10, 50, 100, 150, 200, 250, 300],
        "kernel": ["rbf", "linear"],
    },
    cv=5,
    return_train_score=False,
    n_iter=3,
    random_state=42,
)

In [19]:
# Run the randomized search on the resampled data and tabulate its results,
# one row per sampled parameter combination.
rs.fit(X_res, y_res)
df2= pd.DataFrame(rs.cv_results_)
df2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,28.35512,2.472302,0.230494,0.014126,linear,200,"{'kernel': 'linear', 'C': 200}",0.750242,0.756039,0.824638,0.823103,0.817786,0.794361,0.033784,2
1,10.583682,0.972888,0.273923,0.033289,linear,50,"{'kernel': 'linear', 'C': 50}",0.751208,0.757971,0.824155,0.82262,0.814886,0.794168,0.032538,3
2,6.397333,0.373748,1.029291,0.094923,rbf,200,"{'kernel': 'rbf', 'C': 200}",0.764251,0.764251,0.835749,0.838569,0.843403,0.809245,0.036818,1


In [20]:
# Keep only the sampled parameters and the mean score of each combination.
df2.loc[:, ["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,200,linear,0.794361
1,50,linear,0.794168
2,200,rbf,0.809245


In [21]:
# Best mean cross-validated score among the sampled combinations.
rs.best_score_

0.80924463497267

In [22]:
# Sampled parameter combination with the best mean cross-validated score.
rs.best_params_

{'kernel': 'rbf', 'C': 200}

In [23]:
# Candidate estimators and the hyper-parameter grid to search for each one.
# Keys are display names; each entry holds an unfitted "model" and its
# "params" grid.
model_params = {
    "svm": {
        "model": SVC(gamma="auto"),
        "params": {"C": [50, 100, 150, 200], "kernel": ["rbf", "linear"]},
    },
    "random_forest": {
        # Seeded so repeated runs build the same forests and report the same scores.
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": [1, 5, 10]},
    },
    "logistic_regression": {
        # `multi_class` is omitted: "auto" was already the default, and the
        # argument is deprecated in recent scikit-learn releases.
        # liblinear shuffles the data internally, so it is seeded as well.
        "model": LogisticRegression(solver="liblinear", random_state=42),
        "params": {"C": [1, 5, 10]},
    },
}

In [24]:
# Grid-search each candidate model with 5-fold CV and record its best result
# as one dict per model.
scores = []

for model_name in model_params:
    mp = model_params[model_name]
    grid = GridSearchCV(
        estimator=mp["model"],
        param_grid=mp["params"],
        cv=5,
        return_train_score=False,
    )
    grid.fit(X_res, y_res)
    scores.append(
        dict(
            model=model_name,
            best_score=grid.best_score_,
            best_params=grid.best_params_,
        )
    )

In [25]:
# Summary table: one row per model with its best CV score and parameters.
summary_columns = ["model", "best_score", "best_params"]
pd.DataFrame(scores, columns=summary_columns)

Unnamed: 0,model,best_score,best_params
0,svm,0.809245,"{'C': 150, 'kernel': 'rbf'}"
1,random_forest,0.822772,{'n_estimators': 10}
2,logistic_regression,0.794168,{'C': 10}


In [26]:
# Mean 5-fold CV score for a 50-tree forest with max_depth=50.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=50, max_depth=50, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8344655753321986

In [27]:
# Mean 5-fold CV score for a 100-tree forest with max_depth=20.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8372689553402773

In [28]:
# Mean 5-fold CV score for a 150-tree forest with max_depth=30 and the gini
# split criterion.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(
        n_estimators=150,
        max_depth=30,
        criterion="gini",
        random_state=42,
    ),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8380416687097083

In [29]:
# Mean 10-fold CV score for a 150-tree forest with max_depth=30.
# Note this run uses cv=10, so each model trains on a larger share of the data
# than in a 5-fold run; the mean is not directly comparable to cv=5 results.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=150, max_depth=30, random_state=42),
    X_res,
    y_res,
    cv=10,
)
score.mean()

0.8426814864650203

In [30]:
# Mean 5-fold CV score for a 200-tree forest with max_depth=10.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8115614675343172

In [31]:
# Mean 5-fold CV score for a 50-tree forest with no depth limit.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=50, max_depth=None, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8352388490787632

In [32]:
# Mean 5-fold CV score for a 300-tree forest with max_depth=50.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=300, max_depth=50, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8381380535767239

In [33]:
# Mean 5-fold CV score for a 100-tree forest with max_depth=50.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=50, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8379439762960473

In [34]:
# Mean 5-fold CV score for a 100-tree forest with max_depth=10.
# random_state is fixed so this score is repeatable across runs rather than
# varying with random tree construction.
score = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    X_res,
    y_res,
    cv=5,
)
score.mean()

0.8124317332231259