# Decision Tree Classifier #

## 1 - Importing packages and data ##

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from statsmodels.graphics.mosaicplot import mosaic

In [6]:
# Load the cleaned telecom churn data; the CSV's first column becomes the row index.
df = pd.read_csv(
    "../datasets/telecom_churn_clean.csv",
    index_col=0,
)
# Preview the first five rows.
df.head()

Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [None]:
# Print the dimensions of df as a (n_rows, n_columns) tuple.
print(df.shape)

## 2 - Preparing data ##

### 2.1 - One-hot encoding categorical variables ###

In [None]:
# Template cell: replace the placeholder string (in both places) with the name
# of the categorical column to encode before running.

# Build integer indicator columns; drop_first=True removes the first level so
# k categories yield k-1 columns.
df_dummies = pd.get_dummies(
    df[' Insert categorical variable '],
    drop_first=True,
).astype('int')

# Append the indicator columns, then remove the original categorical column.
df = pd.concat([df, df_dummies], axis=1).drop(columns=' Insert categorical variable ')

### 2.2 - Separating X and y ###

In [7]:
# Target: the 'churn' column as an array.
y = df.loc[:, 'churn'].values
# Features: every other column, as a 2-D NumPy array.
X = df.drop(columns='churn').values

### 2.3 - Separating train and test samples ###

In [10]:
# Hold out 20% of the rows for testing (this split is not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Baseline model: a depth-2 tree fitted on the training split
# (fit returns the estimator itself, so the call can be chained).
dt = DecisionTreeClassifier(max_depth=2, random_state=11).fit(X_train, y_train)

# Accuracy of the baseline on the held-out rows; left as the last expression
# so the notebook displays it.
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

0.881559220389805

In [12]:
# Hyperparameter grid explored by the tree search.
params_dt = {
    "max_depth": list(range(2, 11)),  # depths 2 through 10
    "min_samples_leaf": [0.04, 0.06, 0.08],  # floats: fraction of the samples
    "max_features": [0.2, 0.4, 0.6, 0.8],  # floats: fraction of the features
}

In [13]:
# Fresh, unfitted tree with a fixed random_state; this rebinds dt, replacing
# any previously fitted tree stored under that name.
dt = DecisionTreeClassifier(random_state = 11)

In [27]:
# Exhaustive search over params_dt, scored by accuracy with 10-fold
# cross-validation; n_jobs=-1 uses all available processors.
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [24]:
# Run the cross-validated search using the training split only.
grid_dt.fit(X_train, y_train)

In [25]:
# Report the best parameter combination and its mean cross-validated score.
# NOTE(review): the recorded score (~0.53) looks inconsistent with
# scoring='accuracy' and the ~0.89 test accuracy; the cell execution counts
# suggest grid_dt was redefined after this fit ran -- re-run the cells in
# order to confirm.
print(grid_dt.best_params_, grid_dt.best_score_)

{'max_depth': 4, 'max_features': 0.2, 'min_samples_leaf': 0.04} 0.5296221322537112


In [26]:
# Estimator refit by the search with the best parameter combination found.
best_model = grid_dt.best_estimator_
# Mean accuracy of that estimator on the held-out test split.
test_acc = best_model.score(X_test, y_test)
print(test_acc)

0.8920539730134932


In [9]:
# Hold out 20% of the rows for testing, stratified on y so the train and test
# parts keep the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

# Depth-2 tree fitted on the stratified training split
# (fit returns the estimator itself, so the call can be chained).
dt = DecisionTreeClassifier(max_depth=2, random_state=11).fit(X_train, y_train)

# Accuracy on the stratified held-out rows; left as the last expression so the
# notebook displays it.
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

0.8725637181409296