In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import tree

In [2]:
# Load the training data from a CSV file located relative to the working directory.
df=pd.read_csv("churn_train.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,st,acclen,arcode,phnum,intplan,voice,nummailmes,tdmin,tdcal,tdchar,...,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc,label
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [4]:
# Encode the target as 0/1. The builtin `int` is used because the `np.int`
# alias was removed in NumPy 1.24.
df["label"] = (df["label"] == "True.").astype(int)

# Encode the two yes/no flags as 1/0 and assign the result back to the frame.
# Calling replace(..., inplace=True) on `df[col]` acts on the object returned by
# the selection and is not guaranteed to write back to `df` (it does not under
# pandas Copy-on-Write).
# Note: `map` turns any value other than "yes"/"no" into NaN instead of leaving
# it unchanged, so unexpected values become visible rather than passing through.
yes_no_codes = {"yes": 1, "no": 0}
df["intplan"] = df["intplan"].map(yes_no_codes)
df["voice"] = df["voice"].map(yes_no_codes)

In [6]:
# Work on an independent copy so later column assignments on df1 cannot affect df.
# pd.DataFrame(df) does not copy the underlying data by default.
df1 = df.copy()

In [7]:
# Preview the first two rows after encoding.
df1.head(2)

Unnamed: 0,st,acclen,arcode,phnum,intplan,voice,nummailmes,tdmin,tdcal,tdchar,...,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc,label
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0


In [8]:
type(df1)

pandas.core.frame.DataFrame

In [9]:
# Replace the values in "st" with integer codes, then list every column's dtype.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df1["st"])
df1["st"] = label_encoder.transform(df1["st"])
print(df1.dtypes)

st              int32
acclen          int64
arcode          int64
phnum          object
intplan         int64
voice           int64
nummailmes      int64
tdmin         float64
tdcal           int64
tdchar        float64
temin         float64
tecal           int64
tecahr        float64
tnmin         float64
tncal           int64
tnchar        float64
timin         float64
tical           int64
tichar        float64
ncsc            int64
label           int32
dtype: object


In [10]:
# Feature matrix: every column except the phone number and the target.
X = df1.drop(columns=["phnum", "label"])

In [11]:
# Target vector: the `label` column, which an earlier cell encoded as 0/1.
y=df1["label"]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)
Xtrain.shape

(2666, 19)

In [13]:
# Preview the first five training rows.
Xtrain.head()

Unnamed: 0,st,acclen,arcode,intplan,voice,nummailmes,tdmin,tdcal,tdchar,temin,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc
817,44,243,510,0,0,0,95.5,92,16.24,163.7,63,13.91,264.2,118,11.89,6.6,6,1.78,2
1373,40,108,415,0,0,0,112.0,105,19.04,193.7,110,16.46,208.9,93,9.4,4.1,4,1.11,4
679,43,75,415,1,0,0,222.4,78,37.81,327.0,111,27.8,208.0,104,9.36,8.7,9,2.35,1
56,5,141,415,0,0,0,126.9,98,21.57,180.0,62,15.3,140.8,128,6.34,8.0,2,2.16,1
1993,15,86,510,0,0,0,216.3,96,36.77,266.3,77,22.64,214.0,110,9.63,4.5,3,1.22,0


In [14]:
def printscores(actual, pred, scores=None):
    """Print AUC, accuracy, recall, precision and F1 for one set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels; used for accuracy, recall, precision and F1.
    scores : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When given, AUC is computed from
        these instead of ``pred``. The default ``None`` keeps the previous
        behaviour of computing AUC from ``pred``.

    Returns
    -------
    None
        The five metrics are printed, one per line.
    """
    # With hard 0/1 labels the ROC curve has a single operating point, so the
    # AUC printed from `pred` is much coarser than one computed from scores.
    auc_input = pred if scores is None else scores
    print("AUC:", metrics.roc_auc_score(actual, auc_input))
    print("accuracy:", metrics.accuracy_score(actual, pred))
    print("recall  :", metrics.recall_score(actual, pred))
    print("precision :", metrics.precision_score(actual, pred))
    print("f1-score : ", metrics.f1_score(actual, pred))

In [15]:
# Baseline: a decision tree with no depth or leaf-size limits, fitted on every feature.
# `tree` is imported with the other sklearn modules in the first cell.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(Xtrain, ytrain)

DecisionTreeClassifier(random_state=42)

In [16]:
# Hard-label predictions of the baseline tree on both splits.
trainp, testp = model.predict(Xtrain), model.predict(Xtest)

In [17]:
# Training-set scores for the baseline tree.
printscores(ytrain,trainp) # the unconstrained tree scores perfectly on the data it was fitted on

AUC: 1.0
accuracy: 1.0
recall  : 1.0
precision : 1.0
f1-score :  1.0


In [18]:
printscores(ytest,testp)

AUC: 0.8546863520274289
accuracy: 0.9190404797601199
recall  : 0.7623762376237624
precision : 0.719626168224299
f1-score :  0.7403846153846154


In [19]:
Xtrain.shape

(2666, 19)

In [20]:
# Recursive feature elimination with an unconstrained tree: keep 15 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=15,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['acclen', 'intplan', 'voice', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecal', 'tecahr', 'tnmin', 'tncal', 'tnchar', 'tical', 'tichar',
       'ncsc'],
      dtype='object')

In [21]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [22]:
obj.best_params_

{'max_depth': 20}

In [23]:
obj.best_score_

0.7093643198906356

In [24]:
# Refit on the 15 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=20 in the recorded run)
# avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=20, random_state=42)

In [25]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [26]:
printscores(ytrain,trainp)

AUC: 0.9908376963350785
accuracy: 0.9973743435858965
recall  : 0.981675392670157
precision : 1.0
f1-score :  0.9907529722589168


In [27]:
printscores(ytest,testp)

AUC: 0.8509691075114579
accuracy: 0.9265367316341829
recall  : 0.7425742574257426
precision : 0.7653061224489796
f1-score :  0.7537688442211056


In [28]:
# Recursive feature elimination with an unconstrained tree: keep 14 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=14,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['acclen', 'intplan', 'voice', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecal', 'tecahr', 'tnmin', 'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [29]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [30]:
impcols

Index(['acclen', 'intplan', 'voice', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecal', 'tecahr', 'tnmin', 'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [31]:
obj.best_params_

{'max_depth': 8, 'min_samples_split': 11}

In [32]:
obj.best_score_

0.7197197539302802

In [33]:
# Refit on the 14 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=8, min_samples_split=11 in
# the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=8, min_samples_split=11, random_state=42)

In [34]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [35]:
printscores(ytrain,trainp)

AUC: 0.8954977031202721
accuracy: 0.9666166541635409
recall  : 0.7958115183246073
precision : 0.9650793650793651
f1-score :  0.8723098995695839


In [36]:
printscores(ytest,testp)

AUC: 0.8525522163523773
accuracy: 0.9430284857571214
recall  : 0.7227722772277227
precision : 0.8795180722891566
f1-score :  0.7934782608695651


In [37]:
# Recursive feature elimination with an unconstrained tree: keep 13 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=13,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['acclen', 'intplan', 'voice', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecahr', 'tnmin', 'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [38]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [39]:
obj.best_params_

{'max_depth': 13}

In [40]:
obj.best_score_

0.7302118933697881

In [41]:
# Refit on the 13 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=13 in the recorded run)
# avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=13, random_state=42)

In [42]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [43]:
printscores(ytrain,trainp)

AUC: 0.9528795811518325
accuracy: 0.986496624156039
recall  : 0.9057591623036649
precision : 1.0
f1-score :  0.9505494505494505


In [44]:
printscores(ytest,testp)

AUC: 0.8530857502711402
accuracy: 0.9370314842578711
recall  : 0.7326732673267327
precision : 0.8314606741573034
f1-score :  0.7789473684210527


In [45]:
# Recursive feature elimination with an unconstrained tree: keep 12 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=12,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdcal', 'tdchar', 'temin', 'tecahr',
       'tnmin', 'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [46]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [47]:
obj.best_params_

{'max_depth': 14, 'min_samples_split': 7}

In [48]:
obj.best_score_

0.738004101161996

In [49]:
# Refit on the 12 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=14, min_samples_split=7 in
# the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=14, min_samples_split=7, random_state=42)

In [50]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [51]:
printscores(ytrain,trainp)

AUC: 0.9170968540541532
accuracy: 0.9737434358589647
recall  : 0.837696335078534
precision : 0.975609756097561
f1-score :  0.9014084507042254


In [52]:
printscores(ytest,testp)

AUC: 0.8674037015008921
accuracy: 0.9475262368815592
recall  : 0.7524752475247525
precision : 0.8837209302325582
f1-score :  0.8128342245989305


In [53]:
# Recursive feature elimination with an unconstrained tree: keep 11 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=11,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [54]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [55]:
obj.best_params_

{'max_depth': 22, 'min_samples_split': 7}

In [56]:
obj.best_score_

0.7379699248120302

In [57]:
# Refit on the 11 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=22, min_samples_split=7 in
# the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=22, min_samples_split=7, random_state=42)

In [58]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [59]:
printscores(ytrain,trainp)

AUC: 0.9301858592897553
accuracy: 0.9774943735933983
recall  : 0.8638743455497382
precision : 0.9763313609467456
f1-score :  0.9166666666666667


In [60]:
printscores(ytest,testp)

AUC: 0.8656369170485954
accuracy: 0.9445277361319341
recall  : 0.7524752475247525
precision : 0.8636363636363636
f1-score :  0.8042328042328043


In [61]:
# Recursive feature elimination with an unconstrained tree: keep 10 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=10,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [62]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Three alternative grids: depth only, depth x split size, leaf size x split size.
grid1 = {"max_depth": list(range(5, 30))}
grid2 = {
    "max_depth": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 30)),
    "min_samples_split": list(range(5, 30)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14, 15, 16, 17, 18, 19, 20, 21,
                                        

In [63]:
obj.best_params_

{'max_depth': 22, 'min_samples_split': 6}

In [64]:
obj.best_score_

0.7405673274094328

In [65]:
# Refit on the 10 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=22, min_samples_split=6 in
# the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=22, min_samples_split=6, random_state=42)

In [66]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [67]:
printscores(ytrain,trainp)

AUC: 0.9622516298218428
accuracy: 0.9857464366091523
recall  : 0.9293193717277487
precision : 0.9699453551912568
f1-score :  0.9491978609625669


In [68]:
printscores(ytest,testp)

AUC: 0.8580362453206452
accuracy: 0.9385307346326837
recall  : 0.7425742574257426
precision : 0.8333333333333334
f1-score :  0.7853403141361257


In [69]:
# Recursive feature elimination with an unconstrained tree: keep 9 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=9,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tnmin', 'tical',
       'tichar', 'ncsc'],
      dtype='object')

In [70]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Same three grids as the earlier searches, with every range widened to 5..39.
grid1 = {"max_depth": list(range(5, 40))}
grid2 = {
    "max_depth": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28, 29,
                                                30, 31, 32, 33, 34, ...]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                            

In [71]:
obj.best_params_

{'max_depth': 25, 'min_samples_split': 5}

In [72]:
obj.best_score_

0.7380382775119617

In [73]:
# Refit on the 9 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=25, min_samples_split=5 in
# the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=25, min_samples_split=5, random_state=42)

In [74]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [75]:
printscores(ytrain,trainp)

AUC: 0.9729417481959638
accuracy: 0.9891222805701425
recall  : 0.9502617801047121
precision : 0.9731903485254692
f1-score :  0.9615894039735098


In [76]:
printscores(ytest,testp)

AUC: 0.8638701325962985
accuracy: 0.9415292353823088
recall  : 0.7524752475247525
precision : 0.8444444444444444
f1-score :  0.7958115183246073


In [77]:
# Recursive feature elimination with an unconstrained tree: keep 8 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=8,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tnmin', 'tical', 'tichar',
       'ncsc'],
      dtype='object')

In [78]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Same three grids as the earlier searches, with every range widened to 5..39.
grid1 = {"max_depth": list(range(5, 40))}
grid2 = {
    "max_depth": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28, 29,
                                                30, 31, 32, 33, 34, ...]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                            

In [79]:
obj.best_params_

{'min_samples_leaf': 5, 'min_samples_split': 5}

In [80]:
obj.best_score_

0.6934723171565278

In [81]:
# Refit on the 8 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (min_samples_leaf=5, min_samples_split=5
# in the recorded run) avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=5, random_state=42)

In [82]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [83]:
printscores(ytrain,trainp)

AUC: 0.8765071840529611
accuracy: 0.9564891222805701
recall  : 0.7643979057591623
precision : 0.9182389937106918
f1-score :  0.8342857142857143


In [84]:
printscores(ytest,testp)

AUC: 0.8134817898751007
accuracy: 0.9250374812593704
recall  : 0.6534653465346535
precision : 0.8148148148148148
f1-score :  0.7252747252747253


In [85]:
# Recursive feature elimination with an unconstrained tree: keep 7 columns.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=7,
)
rfeobj.fit(Xtrain, ytrain)
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['tdmin', 'tdchar', 'temin', 'tnmin', 'tical', 'tichar', 'ncsc'], dtype='object')

In [86]:
# Tune the tree on the RFE-selected columns with 5-fold CV, ranked by recall.
# Same three grids as the earlier searches, with every range widened to 5..39.
grid1 = {"max_depth": list(range(5, 40))}
grid2 = {
    "max_depth": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
grid3 = {
    "min_samples_leaf": list(range(5, 40)),
    "min_samples_split": list(range(5, 40)),
}
paramgrid = [grid1, grid2, grid3]
obj = model_selection.GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=paramgrid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
# NOTE(review): impcols came from an RFE fitted on all of Xtrain, outside this CV
# loop, so the CV scores do not account for the feature-selection step.
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                        26, 27, 28, 29, 30, 31, 32, 33, 34, ...],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28, 29,
                                                30, 31, 32, 33, 34, ...]},
                         {'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                            

In [87]:
obj.best_params_

{'max_depth': 22}

In [88]:
obj.best_score_

0.5571428571428572

In [89]:
# Refit on the 7 RFE-selected columns with the grid-search winner.
# Reading the values from obj.best_params_ (max_depth=22 in the recorded run)
# avoids hand-copying them from the previous cell's output.
model = tree.DecisionTreeClassifier(random_state=42, **obj.best_params_)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=22, random_state=42)

In [90]:
# Hard-label predictions on both splits, restricted to the selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [91]:
printscores(ytrain,trainp)

AUC: 0.9947643979057592
accuracy: 0.9984996249062266
recall  : 0.9895287958115183
precision : 1.0
f1-score :  0.9947368421052631


In [92]:
printscores(ytest,testp)

AUC: 0.7079557779099465
accuracy: 0.8425787106446777
recall  : 0.5148514851485149
precision : 0.48148148148148145
f1-score :  0.4976076555023924


In [93]:
# Cross-validate how many features RFE should keep (5..14) for a depth-7 tree,
# ranked by recall. RFE is the estimator being searched, so the selection is
# refitted on each CV training fold.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=7, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj,
    param_grid=grid,
    cv=5,
    scoring="recall",
    return_train_score=True,
    n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
cvdf1 = pd.DataFrame(gridobj.cv_results_)
cvdf1[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.675313,0.489747
1,{'n_features_to_select': 6},0.773509,0.62013
2,{'n_features_to_select': 7},0.795778,0.672386
3,{'n_features_to_select': 8},0.797739,0.696104
4,{'n_features_to_select': 9},0.799051,0.706596
5,{'n_features_to_select': 10},0.799706,0.698701
6,{'n_features_to_select': 11},0.799059,0.693541
7,{'n_features_to_select': 12},0.798406,0.672727
8,{'n_features_to_select': 13},0.797752,0.696275
9,{'n_features_to_select': 14},0.798404,0.656972


In [94]:
gridobj.best_params_

{'n_features_to_select': 9}

In [95]:
# Cross-validate how many features RFE should keep (5..14) for a depth-8 tree,
# ranked by recall. RFE is the estimator being searched, so the selection is
# refitted on each CV training fold.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=8, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj,
    param_grid=grid,
    cv=5,
    scoring="recall",
    return_train_score=True,
    n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
cvdf1 = pd.DataFrame(gridobj.cv_results_)
cvdf1[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.654428,0.447573
1,{'n_features_to_select': 6},0.772258,0.596446
2,{'n_features_to_select': 7},0.796479,0.656664
3,{'n_features_to_select': 8},0.837038,0.70123
4,{'n_features_to_select': 9},0.84096,0.703896
5,{'n_features_to_select': 10},0.840304,0.709091
6,{'n_features_to_select': 11},0.842923,0.70663
7,{'n_features_to_select': 12},0.837041,0.690943
8,{'n_features_to_select': 13},0.839661,0.685612
9,{'n_features_to_select': 14},0.839008,0.680485


In [96]:
gridobj.best_params_

{'n_features_to_select': 10}

In [97]:
# Cross-validate how many features RFE should keep (5..14) for a depth-9 tree,
# ranked by recall. RFE is the estimator being searched, so the selection is
# refitted on each CV training fold.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=9, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj,
    param_grid=grid,
    cv=5,
    scoring="recall",
    return_train_score=True,
    n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
cvdf1 = pd.DataFrame(gridobj.cv_results_)
cvdf1[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.695635,0.457997
1,{'n_features_to_select': 6},0.753215,0.51825
2,{'n_features_to_select': 7},0.817364,0.607382
3,{'n_features_to_select': 8},0.838337,0.664696
4,{'n_features_to_select': 9},0.852097,0.690602
5,{'n_features_to_select': 10},0.861903,0.716986
6,{'n_features_to_select': 11},0.865171,0.714388
7,{'n_features_to_select': 12},0.863862,0.714422
8,{'n_features_to_select': 13},0.86648,0.711859
9,{'n_features_to_select': 14},0.867134,0.696138


In [98]:
gridobj.best_params_

{'n_features_to_select': 10}

In [99]:
# Cross-validate how many features RFE should keep (5..14) for a depth-10 tree,
# ranked by recall. RFE is the estimator being searched, so the selection is
# refitted on each CV training fold.
# `feature_selection` is already imported in the first cell, so it is not re-imported here.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=10, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj,
    param_grid=grid,
    cv=5,
    scoring="recall",
    return_train_score=True,
    n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.753254,0.50499
1,{'n_features_to_select': 6},0.81934,0.554853
2,{'n_features_to_select': 7},0.858633,0.651504
3,{'n_features_to_select': 8},0.86714,0.688175
4,{'n_features_to_select': 9},0.871068,0.690738
5,{'n_features_to_select': 10},0.872375,0.716951
6,{'n_features_to_select': 11},0.871068,0.701333
7,{'n_features_to_select': 12},0.878918,0.704067
8,{'n_features_to_select': 13},0.88219,0.704135
9,{'n_features_to_select': 14},0.880229,0.698872


In [100]:
gridobj.best_params_

{'n_features_to_select': 10}

In [101]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-11 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=11, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.811437,0.536671
1,{'n_features_to_select': 6},0.847468,0.565584
2,{'n_features_to_select': 7},0.882846,0.651743
3,{'n_features_to_select': 8},0.884155,0.701299
4,{'n_features_to_select': 9},0.88612,0.698633
5,{'n_features_to_select': 10},0.886774,0.719651
6,{'n_features_to_select': 11},0.892005,0.711791
7,{'n_features_to_select': 12},0.891351,0.703999
8,{'n_features_to_select': 13},0.894628,0.709296
9,{'n_features_to_select': 14},0.899859,0.701504


In [102]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=11).
gridobj.best_params_

{'n_features_to_select': 10}

In [103]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-12 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=12, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.840249,0.499795
1,{'n_features_to_select': 6},0.89394,0.56039
2,{'n_features_to_select': 7},0.901155,0.662235
3,{'n_features_to_select': 8},0.90771,0.682878
4,{'n_features_to_select': 9},0.903789,0.69607
5,{'n_features_to_select': 10},0.903789,0.717088
6,{'n_features_to_select': 11},0.905096,0.727546
7,{'n_features_to_select': 12},0.905754,0.696241
8,{'n_features_to_select': 13},0.909022,0.696172
9,{'n_features_to_select': 14},0.912948,0.685748


In [104]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=12).
gridobj.best_params_

{'n_features_to_select': 11}

In [105]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-13 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=13, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.883527,0.51825
1,{'n_features_to_select': 6},0.901843,0.60147
2,{'n_features_to_select': 7},0.924067,0.635475
3,{'n_features_to_select': 8},0.921451,0.677649
4,{'n_features_to_select': 9},0.920144,0.701299
5,{'n_features_to_select': 10},0.923414,0.730212
6,{'n_features_to_select': 11},0.921451,0.719651
7,{'n_features_to_select': 12},0.923416,0.727512
8,{'n_features_to_select': 13},0.92015,0.706664
9,{'n_features_to_select': 14},0.928649,0.706596


In [106]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=13).
gridobj.best_params_

{'n_features_to_select': 10}

In [107]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-14 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=14, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.91097,0.507553
1,{'n_features_to_select': 6},0.927372,0.552051
2,{'n_features_to_select': 7},0.936492,0.640943
3,{'n_features_to_select': 8},0.937799,0.68028
4,{'n_features_to_select': 9},0.931917,0.714422
5,{'n_features_to_select': 10},0.931263,0.730212
6,{'n_features_to_select': 11},0.934544,0.714422
7,{'n_features_to_select': 12},0.929313,0.706733
8,{'n_features_to_select': 13},0.933893,0.711962
9,{'n_features_to_select': 14},0.935849,0.693643


In [108]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=14).
gridobj.best_params_

{'n_features_to_select': 10}

In [109]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-15 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=15, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.912963,0.541661
1,{'n_features_to_select': 6},0.955457,0.578195
2,{'n_features_to_select': 7},0.95745,0.609706
3,{'n_features_to_select': 8},0.941078,0.71702
4,{'n_features_to_select': 9},0.94108,0.709228
5,{'n_features_to_select': 10},0.937158,0.72758
6,{'n_features_to_select': 11},0.940424,0.724949
7,{'n_features_to_select': 12},0.939115,0.717054
8,{'n_features_to_select': 13},0.94174,0.714422
9,{'n_features_to_select': 14},0.948278,0.701367


In [110]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=15).
gridobj.best_params_

{'n_features_to_select': 10}

In [111]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-16 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=16, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.908996,0.544361
1,{'n_features_to_select': 6},0.967927,0.515482
2,{'n_features_to_select': 7},0.966598,0.601777
3,{'n_features_to_select': 8},0.949587,0.69337
4,{'n_features_to_select': 9},0.94698,0.685509
5,{'n_features_to_select': 10},0.951555,0.727683
6,{'n_features_to_select': 11},0.954829,0.711893
7,{'n_features_to_select': 12},0.954173,0.717088
8,{'n_features_to_select': 13},0.958751,0.706596
9,{'n_features_to_select': 14},0.960716,0.706699


In [112]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=16).
gridobj.best_params_

{'n_features_to_select': 10}

In [113]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-17 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=17, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.96006,0.486535
1,{'n_features_to_select': 6},0.96594,0.562406
2,{'n_features_to_select': 7},0.97121,0.602016
3,{'n_features_to_select': 8},0.961374,0.677751
4,{'n_features_to_select': 9},0.955491,0.730144
5,{'n_features_to_select': 10},0.953524,0.714491
6,{'n_features_to_select': 11},0.962012,0.717122
7,{'n_features_to_select': 12},0.967907,0.70933
8,{'n_features_to_select': 13},0.98102,0.69877
9,{'n_features_to_select': 14},0.975138,0.693609


In [114]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=17).
gridobj.best_params_

{'n_features_to_select': 9}

In [115]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-18 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=18, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.936488,0.523479
1,{'n_features_to_select': 6},0.974443,0.54689
2,{'n_features_to_select': 7},0.967912,0.591627
3,{'n_features_to_select': 8},0.982338,0.638619
4,{'n_features_to_select': 9},0.97185,0.717122
5,{'n_features_to_select': 10},0.96009,0.727512
6,{'n_features_to_select': 11},0.962049,0.717122
7,{'n_features_to_select': 12},0.969242,0.701401
8,{'n_features_to_select': 13},0.974467,0.709228
9,{'n_features_to_select': 14},0.973813,0.690943


In [116]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=18).
gridobj.best_params_

{'n_features_to_select': 10}

In [117]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-19 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=19, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.95616,0.460185
1,{'n_features_to_select': 6},0.988863,0.525837
2,{'n_features_to_select': 7},0.972504,0.604204
3,{'n_features_to_select': 8},0.971853,0.64877
4,{'n_features_to_select': 9},0.967937,0.693404
5,{'n_features_to_select': 10},0.965323,0.719651
6,{'n_features_to_select': 11},0.9758,0.735509
7,{'n_features_to_select': 12},0.981029,0.727546
8,{'n_features_to_select': 13},0.977094,0.719788
9,{'n_features_to_select': 14},0.988874,0.688278


In [118]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=19).
gridobj.best_params_

{'n_features_to_select': 11}

In [119]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-20 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=20, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.99082,0.549658
1,{'n_features_to_select': 6},0.990164,0.557211
2,{'n_features_to_select': 7},0.994113,0.630759
3,{'n_features_to_select': 8},0.977101,0.706528
4,{'n_features_to_select': 9},0.977758,0.698633
5,{'n_features_to_select': 10},0.977758,0.722351
6,{'n_features_to_select': 11},0.982332,0.727614
7,{'n_features_to_select': 12},0.98561,0.704101
8,{'n_features_to_select': 13},0.986255,0.701504
9,{'n_features_to_select': 14},0.989521,0.70147


In [120]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=20).
gridobj.best_params_

{'n_features_to_select': 11}

In [121]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-22 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=22, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.981026,0.515653
1,{'n_features_to_select': 6},0.997381,0.520643
2,{'n_features_to_select': 7},0.998689,0.559809
3,{'n_features_to_select': 8},0.98954,0.63838
4,{'n_features_to_select': 9},0.988887,0.698667
5,{'n_features_to_select': 10},0.986264,0.727546
6,{'n_features_to_select': 11},0.994771,0.725051
7,{'n_features_to_select': 12},0.994118,0.70147
8,{'n_features_to_select': 13},0.994771,0.70147
9,{'n_features_to_select': 14},0.998039,0.691046


In [122]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=22).
gridobj.best_params_

{'n_features_to_select': 10}

In [123]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-25 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=25, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.999346,0.523377
1,{'n_features_to_select': 6},1.0,0.515482
2,{'n_features_to_select': 7},1.0,0.625256
3,{'n_features_to_select': 8},0.994118,0.683049
4,{'n_features_to_select': 9},0.994118,0.727614
5,{'n_features_to_select': 10},0.994118,0.719686
6,{'n_features_to_select': 11},0.999346,0.709262
7,{'n_features_to_select': 12},1.0,0.70147
8,{'n_features_to_select': 13},1.0,0.717259
9,{'n_features_to_select': 14},1.0,0.685783


In [124]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=25).
gridobj.best_params_

{'n_features_to_select': 9}

In [125]:
# Sweep the number of features RFE keeps (5..14), ranking features with a
# depth-26 decision tree; each setting is scored by 5-fold CV recall.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    estimator=tree.DecisionTreeClassifier(max_depth=26, random_state=42)
)
grid = {"n_features_to_select": list(range(5, 15))}
gridobj = model_selection.GridSearchCV(
    estimator=rfeobj, param_grid=grid, cv=5, scoring="recall",
    return_train_score=True, n_jobs=-1,
)
gridobj.fit(Xtrain, ytrain)
# Mean train vs. validation recall for every candidate feature count.
cvdf = pd.DataFrame(gridobj.cv_results_)
cvdf[["params", "mean_train_score", "mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.998039,0.539166
1,{'n_features_to_select': 6},1.0,0.507587
2,{'n_features_to_select': 7},1.0,0.619993
3,{'n_features_to_select': 8},0.996078,0.693575
4,{'n_features_to_select': 9},0.996078,0.71972
5,{'n_features_to_select': 10},0.999346,0.717054
6,{'n_features_to_select': 11},0.999346,0.709262
7,{'n_features_to_select': 12},1.0,0.70147
8,{'n_features_to_select': 13},1.0,0.717259
9,{'n_features_to_select': 14},1.0,0.685783


In [126]:
# Feature count with the highest mean CV recall in the preceding sweep (RFE tree max_depth=26).
gridobj.best_params_

{'n_features_to_select': 9}

# Best decision tree model found with 10 columns
## - max_depth=10
## - max_depth=10, min_samples_split=3

In [127]:
# Pick the 9 columns RFE ranks highest, using a decision tree with no
# max_depth set as the ranking estimator.
# `feature_selection` comes from the import cell at the top of the notebook.
# NOTE(review): the sweeps above ranked features with depth-limited trees,
# whereas this selection uses an unrestricted tree - confirm that is intended.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=9,
)
rfeobj.fit(Xtrain, ytrain)
# support_ is the boolean mask of retained columns.
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tnmin', 'tical',
       'tichar', 'ncsc'],
      dtype='object')

In [128]:
# Depth-10 tree trained on the 9 RFE-selected columns.
model = tree.DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=10, random_state=42)

In [129]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [130]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9325893307415116
accuracy: 0.9797449362340586
recall  : 0.8664921465968587
precision : 0.9910179640718563
f1-score :  0.9245810055865922


In [131]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8849053633278523
accuracy: 0.9565217391304348
recall  : 0.7821782178217822
precision : 0.9186046511627907
f1-score :  0.8449197860962566


In [132]:
# Confusion matrix for the test split, including row/column totals.
pd.crosstab(
    ytest,
    testp,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,559,7,566
1,22,79,101
All,581,86,667


In [133]:
model=tree.DecisionTreeClassifier(random_state=42,max_depth=10,min_samples_split=3)
model.fit(Xtrain[impcols],ytrain) # Model with max_depth=10,min_samples_split=3 and 9 cols

DecisionTreeClassifier(max_depth=10, min_samples_split=3, random_state=42)

In [134]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [135]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9286626291708311
accuracy: 0.9786196549137285
recall  : 0.8586387434554974
precision : 0.9909365558912386
f1-score :  0.9200561009817672


In [136]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8758877654549908
accuracy: 0.9550224887556222
recall  : 0.7623762376237624
precision : 0.927710843373494
f1-score :  0.8369565217391305


In [137]:
# Confusion matrix for the test split, including row/column totals.
pd.crosstab(
    ytest,
    testp,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,560,6,566
1,24,77,101
All,584,83,667


In [138]:
# Depth-10 tree with min_samples_split=6, trained on the 9 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=10, min_samples_split=6, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=10, min_samples_split=6, random_state=42)

In [139]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [140]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.920152483472552
accuracy: 0.9752438109527382
recall  : 0.8429319371727748
precision : 0.9817073170731707
f1-score :  0.9070422535211267


In [141]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8700538781793373
accuracy: 0.952023988005997
recall  : 0.7524752475247525
precision : 0.9156626506024096
f1-score :  0.826086956521739


In [142]:
# Depth-8 tree with min_samples_split=7, trained on the 9 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=8, min_samples_split=7, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=8, min_samples_split=7, random_state=42)

In [143]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [144]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9003000614335096
accuracy: 0.9692423105776444
recall  : 0.8036649214659686
precision : 0.9777070063694268
f1-score :  0.882183908045977


In [145]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8566193191757339
accuracy: 0.9430284857571214
recall  : 0.7326732673267327
precision : 0.8705882352941177
f1-score :  0.7956989247311828


In [146]:
# Depth-11 tree with min_samples_split=5, trained on the 9 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=11, min_samples_split=5, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=11, min_samples_split=5, random_state=42)

In [147]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [148]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9256069997524321
accuracy: 0.977119279819955
recall  : 0.8534031413612565
precision : 0.9848942598187311
f1-score :  0.914446002805049


In [149]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8651033831298325
accuracy: 0.9505247376311844
recall  : 0.7425742574257426
precision : 0.9146341463414634
f1-score :  0.819672131147541


In [150]:
# Sanity check: how many columns the current RFE selection holds.
len(impcols)

9

In [151]:
# Joint search over tree depth and minimum split size (both 5..14) on the
# currently selected columns, scored by 5-fold CV recall.
maxdepthvalues = [*range(5, 15)]
minsamplessplit = [*range(5, 15)]
d = {"max_depth": maxdepthvalues, "min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
                         'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                               14]},
             scoring='recall')

In [152]:
# Best (max_depth, min_samples_split) pair by mean CV recall from the search above.
obj.best_params_ # recorded result: {'max_depth': 8, 'min_samples_split': 7}

{'max_depth': 8, 'min_samples_split': 7}

In [153]:
# Search min_samples_split alone (5..14) with no max_depth set,
# on the currently selected columns; scored by 5-fold CV recall.
minsamplessplit = [*range(5, 15)]
d = {"min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)
obj.best_params_  # recorded result: {'min_samples_split': 6}

{'min_samples_split': 6}

In [154]:
# Re-run RFE keeping the 10 highest-ranked columns, using a decision tree
# with no max_depth set as the ranking estimator.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=10,
)
rfeobj.fit(Xtrain, ytrain)
# support_ is the boolean mask of retained columns.
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [155]:
# Search min_samples_split alone (5..14) with no max_depth set,
# on the currently selected columns; scored by 5-fold CV recall.
minsamplessplit = [*range(5, 15)]
d = {"min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)
obj.best_params_  # recorded result: {'min_samples_split': 5}

{'min_samples_split': 5}

In [156]:
# Joint search over tree depth and minimum split size (both 5..14) on the
# currently selected columns, scored by 5-fold CV recall.
maxdepthvalues = [*range(5, 15)]
minsamplessplit = [*range(5, 15)]
d = {"max_depth": maxdepthvalues, "min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)
obj.best_params_  # recorded result: {'max_depth': 13, 'min_samples_split': 8}

{'max_depth': 13, 'min_samples_split': 8}

In [157]:
# Depth-10 tree with min_samples_split=5, trained on the 10 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=10, min_samples_split=5, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)

In [158]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [159]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9214613839961123
accuracy: 0.9756189047261815
recall  : 0.8455497382198953
precision : 0.9817629179331308
f1-score :  0.9085794655414909


In [160]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8651033831298325
accuracy: 0.9505247376311844
recall  : 0.7425742574257426
precision : 0.9146341463414634
f1-score :  0.819672131147541


In [161]:
# Depth-10 tree with min_samples_split=3, trained on the 10 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=10, min_samples_split=3, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=10, min_samples_split=3, random_state=42)

In [162]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [163]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9286626291708311
accuracy: 0.9786196549137285
recall  : 0.8586387434554974
precision : 0.9909365558912386
f1-score :  0.9200561009817672


In [164]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8758877654549908
accuracy: 0.9550224887556222
recall  : 0.7623762376237624
precision : 0.927710843373494
f1-score :  0.8369565217391305


In [165]:
# Uses the best parameters reported by the joint max_depth/min_samples_split
# grid search run on these 10 columns.
model=tree.DecisionTreeClassifier(random_state=42,max_depth=13,min_samples_split=8)
model.fit(Xtrain[impcols],ytrain) # Model with max_depth=13,min_samples_split=8 and 10 cols

DecisionTreeClassifier(max_depth=13, min_samples_split=8, random_state=42)

In [166]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [167]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9105523514363522
accuracy: 0.9718679669917479
recall  : 0.824607329842932
precision : 0.9752321981424149
f1-score :  0.8936170212765957


In [168]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8624532064513871
accuracy: 0.9460269865067467
recall  : 0.7425742574257426
precision : 0.8823529411764706
f1-score :  0.8064516129032258


In [169]:
# Depth-10 tree trained on the 10 RFE-selected columns.
model = tree.DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=10, random_state=42)

In [170]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [171]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9325893307415116
accuracy: 0.9797449362340586
recall  : 0.8664921465968587
precision : 0.9910179640718563
f1-score :  0.9245810055865922


In [172]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8799548682783472
accuracy: 0.9550224887556222
recall  : 0.7722772277227723
precision : 0.9176470588235294
f1-score :  0.8387096774193548


In [173]:
# Re-run RFE keeping the 11 highest-ranked columns, using a decision tree
# with no max_depth set as the ranking estimator.
# `feature_selection` comes from the import cell at the top of the notebook.
rfeobj = feature_selection.RFE(
    tree.DecisionTreeClassifier(random_state=42),
    n_features_to_select=11,
)
rfeobj.fit(Xtrain, ytrain)
# support_ is the boolean mask of retained columns.
impcols = Xtrain.columns[rfeobj.support_]
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [174]:
# Joint search over tree depth and minimum split size (both 5..14) on the
# currently selected columns, scored by 5-fold CV recall.
maxdepthvalues = [*range(5, 15)]
minsamplessplit = [*range(5, 15)]
d = {"max_depth": maxdepthvalues, "min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)
obj.best_params_

{'max_depth': 14, 'min_samples_split': 8}

In [175]:
# Search min_samples_split alone (5..14) with no max_depth set,
# on the currently selected columns; scored by 5-fold CV recall.
minsamplessplit = [*range(5, 15)]
d = {"min_samples_split": minsamplessplit}
obj = model_selection.GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42), d, scoring="recall", cv=5
)
obj.fit(Xtrain[impcols], ytrain)
obj.best_params_

{'min_samples_split': 7}

In [176]:
# Depth-9 tree trained on the 11 RFE-selected columns.
model = tree.DecisionTreeClassifier(max_depth=9, random_state=42)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=9, random_state=42)

In [177]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [178]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9277915570185493
accuracy: 0.9789947486871718
recall  : 0.856020942408377
precision : 0.9969512195121951
f1-score :  0.9211267605633803


In [179]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8691704859531889
accuracy: 0.9505247376311844
recall  : 0.7524752475247525
precision : 0.9047619047619048
f1-score :  0.8216216216216216


In [180]:
# Depth-9 tree with min_samples_split=7, trained on the 11 RFE-selected columns.
model = tree.DecisionTreeClassifier(
    max_depth=9, min_samples_split=7, random_state=42
)
model.fit(Xtrain[impcols], ytrain)

DecisionTreeClassifier(max_depth=9, min_samples_split=7, random_state=42)

In [181]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [182]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9048835055611081
accuracy: 0.9714928732183046
recall  : 0.8115183246073299
precision : 0.9872611464968153
f1-score :  0.8908045977011495


In [183]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.854319000804674
accuracy: 0.9460269865067467
recall  : 0.7227722772277227
precision : 0.9012345679012346
f1-score :  0.8021978021978022


In [184]:
# Uses the best parameters reported by the joint max_depth/min_samples_split
# grid search run on these 11 columns.
model=tree.DecisionTreeClassifier(random_state=42,max_depth=14,min_samples_split=8)
model.fit(Xtrain[impcols],ytrain) # Model with max_depth=14,min_samples_split=8 and 11 cols

DecisionTreeClassifier(max_depth=14, min_samples_split=8, random_state=42)

In [185]:
# Predictions on both splits, restricted to the RFE-selected columns.
trainp, testp = model.predict(Xtrain[impcols]), model.predict(Xtest[impcols])

In [186]:
# Training-split scores (ytrain vs. trainp) for the current `model`.
printscores(ytrain,trainp)

AUC: 0.9116423377742732
accuracy: 0.9718679669917479
recall  : 0.8272251308900523
precision : 0.9723076923076923
f1-score :  0.8939179632248939


In [187]:
# Test-split scores (ytest vs. testp) for the current `model`.
printscores(ytest,testp)

AUC: 0.8633365986775355
accuracy: 0.9475262368815592
recall  : 0.7425742574257426
precision : 0.8928571428571429
f1-score :  0.8108108108108107


In [188]:
# Sanity check: how many columns the current RFE selection holds.
len(impcols)

11