In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.linear_model import LogisticRegressionCV, LassoCV

In [2]:
df=pd.read_csv("churn_train.csv")

In [3]:
df["label"].replace({"True.":1,"False.":0},inplace=True)
df["intplan"].replace({"yes":1,"no":0},inplace=True)
df["voice"].replace({"yes":1,"no":0},inplace=True)

In [4]:
df["label"].value_counts()

0    2850
1     483
Name: label, dtype: int64

In [5]:
df["arcode"].value_counts()

415    1655
510     840
408     838
Name: arcode, dtype: int64

In [6]:
df["st"].unique()

array(['KS', 'OH', 'NJ', 'OK', 'AL', 'MA', 'MO', 'LA', 'WV', 'IN', 'RI',
       'IA', 'MT', 'NY', 'ID', 'VT', 'VA', 'TX', 'FL', 'CO', 'AZ', 'SC',
       'NE', 'WY', 'HI', 'IL', 'NH', 'GA', 'AK', 'MD', 'AR', 'WI', 'OR',
       'MI', 'DE', 'UT', 'CA', 'MN', 'SD', 'NC', 'WA', 'NM', 'NV', 'DC',
       'KY', 'ME', 'MS', 'TN', 'PA', 'CT', 'ND'], dtype=object)

In [7]:
df["st"].nunique()

51

In [8]:
df[:2]

Unnamed: 0,st,acclen,arcode,phnum,intplan,voice,nummailmes,tdmin,tdcal,tdchar,...,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc,label
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0


# Encoding Discreet Values

In [9]:
label_encoder = preprocessing.LabelEncoder()
df["st"] = label_encoder.fit_transform(df["st"])
print (df.dtypes)

st              int32
acclen          int64
arcode          int64
phnum          object
intplan         int64
voice           int64
nummailmes      int64
tdmin         float64
tdcal           int64
tdchar        float64
temin         float64
tecal           int64
tecahr        float64
tnmin         float64
tncal           int64
tnchar        float64
timin         float64
tical           int64
tichar        float64
ncsc            int64
label           int64
dtype: object


In [10]:
df[:2]

Unnamed: 0,st,acclen,arcode,phnum,intplan,voice,nummailmes,tdmin,tdcal,tdchar,...,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc,label
0,16,128,415,382-4657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,35,107,415,371-7191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0


In [11]:
X=df.drop({"phnum","label"}, axis=1)
y=df["label"]

In [12]:
Xtrain,Xtest,ytrain,ytest=model_selection.train_test_split(X,y,test_size=.2,random_state=42)

In [13]:
Xtrain[ :2]

Unnamed: 0,st,acclen,arcode,intplan,voice,nummailmes,tdmin,tdcal,tdchar,temin,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc
817,44,243,510,0,0,0,95.5,92,16.24,163.7,63,13.91,264.2,118,11.89,6.6,6,1.78,2
1373,40,108,415,0,0,0,112.0,105,19.04,193.7,110,16.46,208.9,93,9.4,4.1,4,1.11,4


In [14]:
def printscores(actual,pred):
    print("AUC:",metrics.roc_auc_score(actual,pred))
    print("accuracy:",metrics.accuracy_score(actual,pred))
    print("recall  :",metrics.recall_score(actual,pred))
    print("precision :",metrics.precision_score(actual,pred))
    print("f1-score : ",metrics.f1_score(actual,pred))

In [15]:
Xtrain.shape

(2666, 19)

In [16]:
Xtrain[:5]

Unnamed: 0,st,acclen,arcode,intplan,voice,nummailmes,tdmin,tdcal,tdchar,temin,tecal,tecahr,tnmin,tncal,tnchar,timin,tical,tichar,ncsc
817,44,243,510,0,0,0,95.5,92,16.24,163.7,63,13.91,264.2,118,11.89,6.6,6,1.78,2
1373,40,108,415,0,0,0,112.0,105,19.04,193.7,110,16.46,208.9,93,9.4,4.1,4,1.11,4
679,43,75,415,1,0,0,222.4,78,37.81,327.0,111,27.8,208.0,104,9.36,8.7,9,2.35,1
56,5,141,415,0,0,0,126.9,98,21.57,180.0,62,15.3,140.8,128,6.34,8.0,2,2.16,1
1993,15,86,510,0,0,0,216.3,96,36.77,266.3,77,22.64,214.0,110,9.63,4.5,3,1.22,0


In [17]:
from sklearn import tree
model=tree.DecisionTreeClassifier(random_state=42) #unconstrained model 
model.fit(Xtrain,ytrain)

DecisionTreeClassifier(random_state=42)

In [18]:
trainp=model.predict(Xtrain)
testp=model.predict(Xtest)

In [19]:
printscores(ytrain,trainp)

AUC: 1.0
accuracy: 1.0
recall  : 1.0
precision : 1.0
f1-score :  1.0


In [20]:
printscores(ytest,testp)

AUC: 0.8546863520274289
accuracy: 0.9190404797601199
recall  : 0.7623762376237624
precision : 0.719626168224299
f1-score :  0.7403846153846154


In [21]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=9)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'timin',
       'tical', 'ncsc'],
      dtype='object')

In [22]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=14,min_samples_split=14,oob_score=True) #10 S-8,14 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [23]:
printscores(ytrain,trainpred)

AUC: 0.9037797654523615
accuracy: 0.9658664666166542
recall  : 0.8167539267015707
precision : 0.9369369369369369
f1-score :  0.8727272727272728


In [24]:
printscores(ytest,testpred)

AUC: 0.849902039673932
accuracy: 0.9385307346326837
recall  : 0.7227722772277227
precision : 0.8488372093023255
f1-score :  0.7807486631016043


In [25]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=10)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'timin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [26]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,min_samples_split=6,oob_score=True) #13 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [27]:
printscores(ytrain,trainpred)

AUC: 0.9247359276001503
accuracy: 0.9774943735933983
recall  : 0.8507853403141361
precision : 0.9908536585365854
f1-score :  0.9154929577464789


In [28]:
printscores(ytest,testpred)

AUC: 0.8318668439282092
accuracy: 0.9355322338830585
recall  : 0.6831683168316832
precision : 0.8625
f1-score :  0.7624309392265194


In [29]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=11)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin', 'tnchar',
       'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [30]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=14,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [31]:
printscores(ytrain,trainpred)

AUC: 0.9581151832460733
accuracy: 0.9879969992498124
recall  : 0.9162303664921466
precision : 1.0
f1-score :  0.9562841530054645


In [32]:
printscores(ytest,testpred)

AUC: 0.8440681523982787
accuracy: 0.9355322338830585
recall  : 0.7128712871287128
precision : 0.8372093023255814
f1-score :  0.770053475935829


In [33]:
rfobj=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,min_samples_split=6)
grid1={"max_features" : [10,11,12,13,14,15]}
paramgrid=[grid1]
obj=model_selection.GridSearchCV(estimator=rfobj,param_grid=grid1,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=13, min_samples_split=6,
                                              random_state=42),
             n_jobs=-1, param_grid={'max_features': [10, 11, 12, 13, 14, 15]},
             scoring='recall')

In [34]:
obj.best_score_

0.7196855775803145

In [35]:
obj.best_params_

{'max_features': 10}

In [36]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=12)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'nummailmes', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tnchar', 'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [37]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,min_samples_split=6,oob_score=True) #8, 11, 12, 13, 18
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [38]:
printscores(ytrain,trainpred)

AUC: 0.9238648554478686
accuracy: 0.9778694673668417
recall  : 0.8481675392670157
precision : 0.9969230769230769
f1-score :  0.9165487977369166


In [39]:
printscores(ytest,testpred)

AUC: 0.8651033831298325
accuracy: 0.9505247376311844
recall  : 0.7425742574257426
precision : 0.9146341463414634
f1-score :  0.819672131147541


In [40]:
pd.crosstab(ytest, testpred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,559,7,566
1,26,75,101
All,585,82,667


In [41]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=13)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['acclen', 'intplan', 'nummailmes', 'tdmin', 'tdchar', 'temin', 'tecahr',
       'tnmin', 'tnchar', 'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [42]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=14,min_samples_split=5,oob_score=True) # sam-14
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [43]:
printscores(ytrain,trainpred)

AUC: 0.9332460732984293
accuracy: 0.9808702175543886
recall  : 0.8664921465968587
precision : 1.0
f1-score :  0.9284712482468443


In [44]:
printscores(ytest,testpred)

AUC: 0.8610362803064758
accuracy: 0.9505247376311844
recall  : 0.7326732673267327
precision : 0.925
f1-score :  0.8176795580110497


In [45]:
pd.crosstab(ytest, testpred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,560,6,566
1,27,74,101
All,587,80,667


In [46]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=14)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['acclen', 'intplan', 'nummailmes', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecahr', 'tnmin', 'tnchar', 'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [47]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=14,min_samples_split=8,oob_score=True) # sam-12
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [48]:
printscores(ytrain,trainpred)

AUC: 0.926482656494989
accuracy: 0.9786196549137285
recall  : 0.8534031413612565
precision : 0.9969418960244648
f1-score :  0.919605077574048


In [49]:
printscores(ytest,testpred)

AUC: 0.8651033831298325
accuracy: 0.9505247376311844
recall  : 0.7425742574257426
precision : 0.9146341463414634
f1-score :  0.819672131147541


In [50]:
pd.crosstab(ytest, testpred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,559,7,566
1,26,75,101
All,585,82,667


In [51]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=15)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['acclen', 'intplan', 'nummailmes', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecahr', 'tnmin', 'tncal', 'tnchar', 'timin', 'tical', 'tichar',
       'ncsc'],
      dtype='object')

In [52]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [53]:
printscores(ytrain,trainpred)

AUC: 0.9463350785340314
accuracy: 0.9846211552888222
recall  : 0.8926701570680629
precision : 1.0
f1-score :  0.9432918395573997


In [54]:
printscores(ytest,testpred)

AUC: 0.8412343001084561
accuracy: 0.9445277361319341
recall  : 0.693069306930693
precision : 0.9210526315789473
f1-score :  0.7909604519774012


In [55]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=16)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['acclen', 'intplan', 'nummailmes', 'tdmin', 'tdcal', 'tdchar', 'temin',
       'tecal', 'tecahr', 'tnmin', 'tncal', 'tnchar', 'timin', 'tical',
       'tichar', 'ncsc'],
      dtype='object')

In [56]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,oob_score=True) #11, 13, 14
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [57]:
printscores(ytrain,trainpred)

AUC: 0.9515706806282722
accuracy: 0.9861215303825956
recall  : 0.9031413612565445
precision : 1.0
f1-score :  0.9491059147180193


In [58]:
printscores(ytest,testpred)

AUC: 0.8668701675821293
accuracy: 0.9535232383808095
recall  : 0.7425742574257426
precision : 0.9375
f1-score :  0.8287292817679558


In [59]:
pd.crosstab(ytest, testpred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,561,5,566
1,26,75,101
All,587,80,667


In [60]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=17)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['st', 'acclen', 'intplan', 'nummailmes', 'tdmin', 'tdcal', 'tdchar',
       'temin', 'tecal', 'tecahr', 'tnmin', 'tncal', 'tnchar', 'timin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [61]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=15,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [62]:
printscores(ytrain,trainpred)

AUC: 0.9777486910994764
accuracy: 0.9936234058514629
recall  : 0.9554973821989529
precision : 1.0
f1-score :  0.9772423025435074


In [63]:
printscores(ytest,testpred)

AUC: 0.8520186824336143
accuracy: 0.9490254872563718
recall  : 0.7128712871287128
precision : 0.935064935064935
f1-score :  0.8089887640449437


In [64]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=5)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['tdmin', 'tdchar', 'temin', 'tecahr', 'ncsc'], dtype='object')

In [65]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=11,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [66]:
printscores(ytrain,trainpred)

AUC: 0.8429319371727748
accuracy: 0.9549887471867967
recall  : 0.6858638743455497
precision : 1.0
f1-score :  0.813664596273292


In [67]:
printscores(ytest,testpred)

AUC: 0.7392243641325263
accuracy: 0.9025487256371814
recall  : 0.504950495049505
precision : 0.7727272727272727
f1-score :  0.6107784431137724


In [68]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=6)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'ncsc'], dtype='object')

In [69]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=13,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [70]:
printscores(ytrain,trainpred)

AUC: 0.8756544502617801
accuracy: 0.9643660915228808
recall  : 0.7513089005235603
precision : 1.0
f1-score :  0.8579970104633782


In [71]:
printscores(ytest,testpred)

AUC: 0.7549592415071897
accuracy: 0.9085457271364318
recall  : 0.5346534653465347
precision : 0.7941176470588235
f1-score :  0.6390532544378699


In [72]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=7)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'timin', 'ncsc'], dtype='object')

In [73]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=11,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [74]:
printscores(ytrain,trainpred)

AUC: 0.8638743455497382
accuracy: 0.9609902475618904
recall  : 0.7277486910994765
precision : 1.0
f1-score :  0.8424242424242425


In [75]:
printscores(ytest,testpred)

AUC: 0.7491253542315364
accuracy: 0.9055472263868066
recall  : 0.5247524752475248
precision : 0.7794117647058824
f1-score :  0.6272189349112427


In [76]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=8)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'timin',
       'ncsc'],
      dtype='object')

In [77]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=10,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [78]:
printscores(ytrain,trainpred)

AUC: 0.8952879581151832
accuracy: 0.9699924981245311
recall  : 0.7905759162303665
precision : 1.0
f1-score :  0.8830409356725146


In [79]:
printscores(ytest,testpred)

AUC: 0.7972133785816744
accuracy: 0.9250374812593704
recall  : 0.6138613861386139
precision : 0.8493150684931506
f1-score :  0.7126436781609196


In [80]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(estimator=tree.DecisionTreeClassifier(max_depth=12,random_state=42))
grid={"n_features_to_select":list(range(5,15))}
gridobj=model_selection.GridSearchCV(estimator=rfeobj,param_grid=grid,cv=5,scoring="recall",return_train_score=True,n_jobs=-1)
gridobj.fit(Xtrain,ytrain)

GridSearchCV(cv=5,
             estimator=RFE(estimator=DecisionTreeClassifier(max_depth=12,
                                                            random_state=42)),
             n_jobs=-1,
             param_grid={'n_features_to_select': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                  14]},
             return_train_score=True, scoring='recall')

In [81]:
cvdf12=pd.DataFrame(gridobj.cv_results_)
cvdf12[["params","mean_train_score","mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.840249,0.499795
1,{'n_features_to_select': 6},0.89394,0.56039
2,{'n_features_to_select': 7},0.901155,0.662235
3,{'n_features_to_select': 8},0.90771,0.682878
4,{'n_features_to_select': 9},0.903789,0.69607
5,{'n_features_to_select': 10},0.903789,0.717088
6,{'n_features_to_select': 11},0.905096,0.727546
7,{'n_features_to_select': 12},0.905754,0.696241
8,{'n_features_to_select': 13},0.909022,0.696172
9,{'n_features_to_select': 14},0.912948,0.685748


In [82]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(estimator=tree.DecisionTreeClassifier(max_depth=12,random_state=42))
grid={"n_features_to_select":list(range(5,15))}
gridobj=model_selection.GridSearchCV(estimator=rfeobj,param_grid=grid,cv=5,scoring="recall",return_train_score=True,n_jobs=-1)
gridobj.fit(Xtrain,ytrain)

GridSearchCV(cv=5,
             estimator=RFE(estimator=DecisionTreeClassifier(max_depth=12,
                                                            random_state=42)),
             n_jobs=-1,
             param_grid={'n_features_to_select': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                  14]},
             return_train_score=True, scoring='recall')

In [83]:
cvdf12=pd.DataFrame(gridobj.cv_results_)
cvdf12[["params","mean_train_score","mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.840249,0.499795
1,{'n_features_to_select': 6},0.89394,0.56039
2,{'n_features_to_select': 7},0.901155,0.662235
3,{'n_features_to_select': 8},0.90771,0.682878
4,{'n_features_to_select': 9},0.903789,0.69607
5,{'n_features_to_select': 10},0.903789,0.717088
6,{'n_features_to_select': 11},0.905096,0.727546
7,{'n_features_to_select': 12},0.905754,0.696241
8,{'n_features_to_select': 13},0.909022,0.696172
9,{'n_features_to_select': 14},0.912948,0.685748


In [84]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(estimator=tree.DecisionTreeClassifier(max_depth=13,random_state=42))
grid={"n_features_to_select":list(range(5,15))}
gridobj=model_selection.GridSearchCV(estimator=rfeobj,param_grid=grid,cv=5,scoring="recall",return_train_score=True,n_jobs=-1)
gridobj.fit(Xtrain,ytrain)

GridSearchCV(cv=5,
             estimator=RFE(estimator=DecisionTreeClassifier(max_depth=13,
                                                            random_state=42)),
             n_jobs=-1,
             param_grid={'n_features_to_select': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                  14]},
             return_train_score=True, scoring='recall')

In [85]:
cvdf13=pd.DataFrame(gridobj.cv_results_)
cvdf13[["params","mean_train_score","mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.883527,0.51825
1,{'n_features_to_select': 6},0.901843,0.60147
2,{'n_features_to_select': 7},0.924067,0.635475
3,{'n_features_to_select': 8},0.921451,0.677649
4,{'n_features_to_select': 9},0.920144,0.701299
5,{'n_features_to_select': 10},0.923414,0.730212
6,{'n_features_to_select': 11},0.921451,0.719651
7,{'n_features_to_select': 12},0.923416,0.727512
8,{'n_features_to_select': 13},0.92015,0.706664
9,{'n_features_to_select': 14},0.928649,0.706596


In [86]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(estimator=tree.DecisionTreeClassifier(max_depth=14,random_state=42))
grid={"n_features_to_select":list(range(5,15))}
gridobj=model_selection.GridSearchCV(estimator=rfeobj,param_grid=grid,cv=5,scoring="recall",return_train_score=True,n_jobs=-1)
gridobj.fit(Xtrain,ytrain)

GridSearchCV(cv=5,
             estimator=RFE(estimator=DecisionTreeClassifier(max_depth=14,
                                                            random_state=42)),
             n_jobs=-1,
             param_grid={'n_features_to_select': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                  14]},
             return_train_score=True, scoring='recall')

In [87]:
cvdf14=pd.DataFrame(gridobj.cv_results_)
cvdf14[["params","mean_train_score","mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.91097,0.507553
1,{'n_features_to_select': 6},0.927372,0.552051
2,{'n_features_to_select': 7},0.936492,0.640943
3,{'n_features_to_select': 8},0.937799,0.68028
4,{'n_features_to_select': 9},0.931917,0.714422
5,{'n_features_to_select': 10},0.931263,0.730212
6,{'n_features_to_select': 11},0.934544,0.714422
7,{'n_features_to_select': 12},0.929313,0.706733
8,{'n_features_to_select': 13},0.933893,0.711962
9,{'n_features_to_select': 14},0.935849,0.693643


In [88]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(estimator=tree.DecisionTreeClassifier(max_depth=15,random_state=42))
grid={"n_features_to_select":list(range(5,15))}
gridobj=model_selection.GridSearchCV(estimator=rfeobj,param_grid=grid,cv=5,scoring="recall",return_train_score=True,n_jobs=-1)
gridobj.fit(Xtrain,ytrain)

GridSearchCV(cv=5,
             estimator=RFE(estimator=DecisionTreeClassifier(max_depth=15,
                                                            random_state=42)),
             n_jobs=-1,
             param_grid={'n_features_to_select': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                  14]},
             return_train_score=True, scoring='recall')

In [89]:
cvdf15=pd.DataFrame(gridobj.cv_results_)
cvdf15[["params","mean_train_score","mean_test_score"]]

Unnamed: 0,params,mean_train_score,mean_test_score
0,{'n_features_to_select': 5},0.912963,0.541661
1,{'n_features_to_select': 6},0.955457,0.578195
2,{'n_features_to_select': 7},0.95745,0.609706
3,{'n_features_to_select': 8},0.941078,0.71702
4,{'n_features_to_select': 9},0.94108,0.709228
5,{'n_features_to_select': 10},0.937158,0.72758
6,{'n_features_to_select': 11},0.940424,0.724949
7,{'n_features_to_select': 12},0.939115,0.717054
8,{'n_features_to_select': 13},0.94174,0.714422
9,{'n_features_to_select': 14},0.948278,0.701367


In [90]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=12)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'nummailmes', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tnchar', 'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [91]:
grid1={"max_depth" : list(range(5,20))}
grid2={"max_depth": list(range(5,20)),"min_samples_split":list(range(5,30))}
paramgrid=[grid1,grid2]
obj=model_selection.GridSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),param_grid=paramgrid,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]}],
             scoring='recall')

In [92]:
obj.best_params_

{'max_depth': 15, 'min_samples_split': 7}

In [93]:
obj.best_score_

0.748632946001367

In [94]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=11)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin', 'tnchar',
       'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [95]:
grid1={"max_depth" : list(range(5,20))}
grid2={"max_depth": list(range(5,20)),"min_samples_split":list(range(5,30))}
paramgrid=[grid1,grid2]
obj=model_selection.GridSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),param_grid=paramgrid,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]}],
             scoring='recall')

In [96]:
obj.best_params_

{'max_depth': 15, 'min_samples_split': 7}

In [97]:
obj.best_score_

0.7093301435406698

In [98]:
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=10)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnchar', 'timin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [99]:
grid1={"max_depth" : list(range(5,20))}
grid2={"max_depth": list(range(5,20)),"min_samples_split":list(range(5,30))}
paramgrid=[grid1,grid2]
obj=model_selection.GridSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),param_grid=paramgrid,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19]},
                         {'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                        16, 17, 18, 19],
                          'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12, 13,
                                                14, 15, 16, 17, 18, 19, 20, 21,
                                                22, 23, 24, 25, 26, 27, 28,
                                                29]}],
             scoring='recall')

In [100]:
obj.best_params_

{'max_depth': 14, 'min_samples_split': 6}

In [101]:
obj.best_score_

0.714524948735475

In [503]:
from sklearn import feature_selection
rfeobj=feature_selection.RFE(ensemble.RandomForestClassifier(random_state=42),n_features_to_select=11)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin', 'tnchar',
       'timin', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [116]:
grid1={"max_depth" : list(range(5,20))}
grid2={"max_depth": list(range(5,20)),"min_samples_split":list(range(5,20))}
grid3={ "min_samples_leaf" : list(range(5,20)),"min_samples_split" :list(range(5,20))}
paramgrid=[grid1,grid2,grid3]
obj=model_selection.GridSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),param_grid=paramgrid,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)
obj.best_params_
print()
obj.best_score_

KeyboardInterrupt: 

In [119]:
rfobj=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=)
grid1={"max_features" : [0,9]}
paramgrid=[grid1]
obj=model_selection.GridSearchCV(estimator=rfobj,param_grid=grid1,cv=5,scoring="recall",n_jobs=-1)
obj.fit(Xtrain[impcols],ytrain)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=13, min_samples_split=6,
                                              random_state=42),
             n_jobs=-1, param_grid={'max_features': [0, 9]}, scoring='recall')

In [533]:
len(impcols)

11

In [560]:
rf=ensemble.RandomForestClassifier(random_state=42,n_estimators=100,max_depth=14,min_samples_split=14,oob_score=True) 
rf.fit(Xtrain[impcols],ytrain)
trainpred=rf.predict(Xtrain[impcols])
testpred=rf.predict(Xtest[impcols])

In [561]:
printscores(ytrain,trainpred)
print()
printscores(ytest,testpred)


AUC: 0.9031276074857191
accuracy: 0.9666166541635409
recall  : 0.8141361256544503
precision : 0.9452887537993921
f1-score :  0.8748241912798875

AUC: 0.8210824616030509
accuracy: 0.9310344827586207
recall  : 0.6633663366336634
precision : 0.8481012658227848
f1-score :  0.7444444444444445


In [292]:
pd.crosstab(ytest, testp, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,555,11,566
1,26,75,101
All,581,86,667


In [106]:
rfeobj=feature_selection.RFE(tree.DecisionTreeClassifier(random_state=42),n_features_to_select=10)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tical', 'tichar', 'ncsc'],
      dtype='object')

In [107]:
model=tree.DecisionTreeClassifier(random_state=42,max_depth=10,min_samples_split=6) #6
model.fit(Xtrain[impcols],ytrain)
trainp=model.predict(Xtrain[impcols])
testp=model.predict(Xtest[impcols])

In [108]:
printscores(ytrain,trainp)
print()
printscores(ytest,testp)

AUC: 0.920152483472552
accuracy: 0.9752438109527382
recall  : 0.8429319371727748
precision : 0.9817073170731707
f1-score :  0.9070422535211267

AUC: 0.8700538781793373
accuracy: 0.952023988005997
recall  : 0.7524752475247525
precision : 0.9156626506024096
f1-score :  0.826086956521739


In [109]:
rfeobj=feature_selection.RFE(tree.DecisionTreeClassifier(random_state=42),n_features_to_select=11)
rfeobj.fit(Xtrain,ytrain)
impcols=Xtrain.columns[rfeobj.support_] 
impcols

Index(['intplan', 'voice', 'tdmin', 'tdchar', 'temin', 'tecahr', 'tnmin',
       'tnchar', 'tical', 'tichar', 'ncsc'],
      dtype='object')

In [110]:
model=tree.DecisionTreeClassifier(random_state=42,max_depth=18,min_samples_split=3) #6
model.fit(Xtrain[impcols],ytrain)
trainp=model.predict(Xtrain[impcols])
testp=model.predict(Xtest[impcols])

In [111]:
printscores(ytrain,trainp)
print()
printscores(ytest,testp)

AUC: 0.9646596858638743
accuracy: 0.9898724681170292
recall  : 0.9293193717277487
precision : 1.0
f1-score :  0.9633649932157395

AUC: 0.8665203092747437
accuracy: 0.9460269865067467
recall  : 0.7524752475247525
precision : 0.8735632183908046
f1-score :  0.8085106382978724
