In [61]:
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Load the two raw inputs. Both paths are relative, so the files are resolved
# against the kernel's current working directory.
df_contract = pd.read_csv('ContractData.csv')      # contract-level attributes (CSV)
df_calls_data = pd.read_excel('CallsData.xls')     # call/usage attributes (Excel workbook)


In [32]:
# Trailing expression: render the contract table as this cell's output.
df_contract

Unnamed: 0,Account Length,Churn,Int'l Plan,VMail Plan,State,Area Code,Phone
0,128,0,0,1,KS,415,382-4657
1,107,0,0,1,OH,415,371-7191
2,137,0,0,0,NJ,415,358-1921
3,84,0,1,0,OH,408,375-9999
4,75,0,1,0,OK,415,330-6626
...,...,...,...,...,...,...,...
3328,192,0,0,1,AZ,415,414-4276
3329,68,0,0,0,WV,415,370-3271
3330,28,0,0,0,RI,510,328-8230
3331,184,0,1,0,CT,510,364-6381


In [33]:
# Combine contract and call records into one frame. A customer is identified
# by the (Area Code, Phone) pair; the inner join keeps only rows whose key
# appears in both inputs.
df = pd.merge(
    left=df_contract,
    right=df_calls_data,
    on=['Area Code', 'Phone'],
    how='inner',
)
df

Unnamed: 0,Account Length,Churn,Int'l Plan,VMail Plan,State,Area Code,Phone,VMail Message,Day Mins,Eve Mins,...,Intl Mins,CustServ Calls,Day Calls,Day Charge,Eve Calls,Eve Charge,Night Calls,Night Charge,Intl Calls,Intl Charge
0,128,0,0,1,KS,415,382-4657,25,265.1,197.4,...,10.0,1,110,45.07,99,16.78,91,11.01,3,2.70
1,107,0,0,1,OH,415,371-7191,26,161.6,195.5,...,13.7,1,123,27.47,103,16.62,103,11.45,3,3.70
2,137,0,0,0,NJ,415,358-1921,0,243.4,121.2,...,12.2,0,114,41.38,110,10.30,104,7.32,5,3.29
3,84,0,1,0,OH,408,375-9999,0,299.4,61.9,...,6.6,2,71,50.90,88,5.26,89,8.86,7,1.78
4,75,0,1,0,OK,415,330-6626,0,166.7,148.3,...,10.1,3,113,28.34,122,12.61,121,8.41,3,2.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,0,0,1,AZ,415,414-4276,36,156.2,215.5,...,9.9,2,77,26.55,126,18.32,83,12.56,6,2.67
3329,68,0,0,0,WV,415,370-3271,0,231.1,153.4,...,9.6,3,57,39.29,55,13.04,123,8.61,4,2.59
3330,28,0,0,0,RI,510,328-8230,0,180.8,288.8,...,14.1,2,109,30.74,58,24.55,91,8.64,6,3.81
3331,184,0,1,0,CT,510,364-6381,0,213.8,159.6,...,5.0,2,105,36.35,84,13.57,137,6.26,10,1.35


In [25]:
# Treat Churn as a nominal (categorical) variable rather than an integer.
# A default CategoricalDtype() is the explicit spelling of the 'category' alias.
df['Churn'] = df['Churn'].astype(pd.CategoricalDtype())
# Show the resulting column dtypes to confirm the conversion.
df.dtypes

Account Length       int64
Churn             category
Int'l Plan           int64
VMail Plan           int64
State               object
Area Code            int64
Phone               object
VMail Message        int64
Day Mins           float64
Eve Mins           float64
Night Mins         float64
Intl Mins          float64
CustServ Calls       int64
Day Calls            int64
Day Charge         float64
Eve Calls            int64
Eve Charge         float64
Night Calls          int64
Night Charge       float64
Intl Calls           int64
Intl Charge        float64
dtype: object

In [62]:
# Feature matrix: every column except the target.
X = df.drop(columns=['Churn'])
# Target kept as a one-column DataFrame (double brackets), not a Series.
y = df[['Churn']]

In [35]:
# Trailing expression: display the target frame (single 'Churn' column).
y

Unnamed: 0,Churn
0,0
1,0
2,0
3,0
4,0
...,...
3328,0
3329,0
3330,0
3331,0


In [36]:
# Hold out 25% of the rows as a test set; random_state is fixed so the split
# is reproducible across runs.
# NOTE(review): no `stratify=` is passed. The confusion matrix further down
# suggests Churn is imbalanced, so a stratified split may be worth considering.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2021)

In [37]:
# Remove the two object-typed columns (State, Phone) from both splits so that
# only numeric features are passed to the estimator.
# errors='ignore' makes this cell safe to re-run: X_train/X_test are reassigned
# from themselves, so on a second execution the columns are already gone and a
# plain drop() would raise KeyError.
X_train = X_train.drop(columns=['State', 'Phone'], errors='ignore')
X_test = X_test.drop(columns=['State', 'Phone'], errors='ignore')

In [38]:
# Decision-tree classifier with a fixed seed; every other hyper-parameter is
# left at its library default.
cls = DecisionTreeClassifier(random_state=2021)

In [39]:
# Fit the tree on the training split (features with State/Phone removed).
cls.fit(X_train,y_train)

DecisionTreeClassifier(random_state=2021)

In [40]:
# Predict a Churn label for every held-out test row.
predictions = cls.predict(X_test)
# Trailing expression: display the predicted label array.
predictions

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [41]:
# Accuracy: share of test rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

0.920863309352518

In [42]:
# Precision on the held-out test set.
precision_score(y_true=y_test, y_pred=predictions)

0.6896551724137931

In [14]:
# Recall on the held-out test set.
recall_score(y_true=y_test, y_pred=predictions)

0.7272727272727273

In [51]:
# Confusion matrix of true labels against predicted labels for the test set.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[688,  36],
       [ 30,  80]], dtype=int64)

In [50]:
# F1 score with f1_score's default averaging (no `average=` argument is
# passed, so this is NOT the macro average). The value shown is the harmonic
# mean of the precision and recall reported in the cells above.
f1_score(y_test,predictions)

0.7079646017699116

In [63]:
# Cross-validated score of a depth-limited decision tree over cv=10 folds.
print("Using a decision tree with cross_val_score (mean accuracy)...")

# Remove the object-typed columns before handing X to the estimator.
# errors='ignore' keeps the cell idempotent: X is reassigned from itself, so on
# a re-run 'Phone' and 'State' are already absent and a plain drop() would
# raise KeyError.
X = X.drop(columns=['Phone', 'State'], errors='ignore')

# NOTE: this rebinds `cls` to a new, unfitted estimator (gini criterion,
# max_depth=10). The tree fitted earlier in the notebook is no longer reachable
# under this name once the cell has run.
cls = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=2021)

# One score per fold.
scores = cross_val_score(cls, X, y, cv=10)
print(scores)
print("Result: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Using a decision tree with cross_val_score (mean accuracy)...
[0.92814371 0.94610778 0.93413174 0.92792793 0.94894895 0.94894895
 0.93393393 0.93693694 0.93393393 0.95195195]
Result: 0.94 accuracy with a standard deviation of 0.01
