In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from imblearn.over_sampling import SMOTE  # imblearn library can be installed in Visual Studio by going into Python Environment -> Install new package -> imblearn package
from sklearn.ensemble import RandomForestClassifier

# Importing dataset 

In [None]:
# Load the raw churn data; the path is resolved relative to the working directory.
DATA_FILE = "ChurnPrediction.csv"
dataset = pd.read_csv(DATA_FILE)

# Header of Dataset

In [None]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
dataset.head()

# Shape of Dataset

In [None]:
# (number of rows, number of columns) of the raw frame.
dataset.shape

# Information of given dataset

In [None]:
# Per-column dtype and non-null count; used to spot missing values and non-numeric columns.
dataset.info()

# This shows that there are 1470 entries in total with zero null values. Different columns have different data types, so we have to pre-process the data.

# The describe function gives the max, min and standard deviation of all the numeric attributes, as well as their quartiles

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

# Unique values in specific columns

In [None]:
# Show the distinct values of the label and of every text-valued column in a
# single cell instead of eight copy-pasted ones. The dict is the cell's last
# expression, so the notebook displays it: column name -> array of unique values.
columns_to_inspect = [
    'PastEmployee',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]
{column_name: dataset[column_name].unique() for column_name in columns_to_inspect}

# Converting Categorical features into Numerical features

In [None]:
def converter(column):
    """Encode a single Yes/No cell value as an integer flag.

    Intended for element-wise use, e.g. ``series.apply(converter)``.

    Parameters
    ----------
    column : object
        One cell value taken from a Yes/No column.

    Returns
    -------
    int
        1 when the value is exactly the string 'Yes'; 0 for anything else
        (including 'No', differently-cased spellings and non-string values).
    """
    return 1 if column == 'Yes' else 0

In [None]:
# Encode the two Yes/No columns as 1/0.
# The numeric-dtype guard keeps this cell idempotent: converter() maps anything
# other than 'Yes' to 0, so re-running the cell on already-encoded values would
# otherwise silently turn every 1 into 0.
for yes_no_column in ('PastEmployee', 'OverTime'):
    if not pd.api.types.is_numeric_dtype(dataset[yes_no_column]):
        dataset[yes_no_column] = dataset[yes_no_column].apply(converter)

# Creating dummies

In [None]:
# One-hot encode the multi-valued categorical columns: each listed column is
# replaced by one indicator column per category; all other columns pass through.
categorical_features = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]
final_data = pd.get_dummies(data=dataset, columns=categorical_features)

# Information about dataset after following changes

In [None]:
# Re-check dtypes after encoding to confirm no text columns remain.
final_data.info()

# This shows that there is no categorical data in the above dataset

# Dividing dataset into label and feature sets

In [None]:
# 'PastEmployee' is the target column; every other column of final_data is a feature.
label_column = 'PastEmployee'
X = final_data.drop(columns=label_column)  # feature set: everything except the label
Y = final_data[label_column]               # label set: exactly one column

# Report the container types first, then the shapes, one per line.
for summary in (type(X), type(Y), X.shape, Y.shape):
    print(summary)

X.shape shows that there are 1470 rows and 46 columns, whereas Y has a single column with 1470 rows.

# Normalizing numerical features so that each feature has mean 0 and variance 1

In [None]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on ALL rows here, i.e. before any
# train/test split, so its mean/variance include rows that later become test data.
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

# Dividing dataset into training and test sets

In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only. Fitting the scaler on the full dataset before splitting would let
# test-set means/variances leak into the training data.
# test_size = 0.3 means that 30% of the data is held out as test data and the remaining 70% is training data.
# random_state = 100 is fixed so that the same split is produced every time the code is run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)  # reuse the training statistics on the test rows

In [None]:
# Shapes of the training and test feature matrices, in that order.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

Out of 1470 rows, 1029 are used as the training set and 441 as the test set.

# Implementing Oversampling to balance the dataset; SMOTE stands for Synthetic Minority Oversampling Technique

In [None]:
# Class balance of the training labels before oversampling.
print("Number of observations in each class before oversampling (training data): \n", pd.Series(Y_train).value_counts())

# Oversample only the training split; the test split is left untouched.
# fit_resample() replaces fit_sample(), which was deprecated and later removed
# from imbalanced-learn.
smote = SMOTE(random_state = 101)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Class balance of the training labels after oversampling.
print("Number of observations in each class after oversampling (training data): \n", pd.Series(Y_train).value_counts())

# Building Classification Decision Tree Model

In [None]:
# Entropy-based decision tree limited to depth 5.
# random_state is fixed (as for every other stochastic step in this notebook)
# so that tie-breaking between equally good splits, and therefore the reported
# importances, are reproducible across runs.
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=1)
dtree.fit(X_train, Y_train)

# Pair each importance with its column name (X is still the full feature frame
# here) and list the most important features first.
featimp = pd.Series(dtree.feature_importances_, index=X.columns).sort_values(ascending=False)
print(featimp)

Evaluation is done for a tree with a maximum depth of 5. Features with an importance of 0 were not used in any split of the tree.

In [None]:
# Evaluating Decision Tree Model
Y_pred = dtree.predict(X_test)

# Accuracy alone is misleading here because the test set is imbalanced.
print("Prediction Accuracy: ", metrics.accuracy_score(Y_test, Y_pred))

# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted)
# even if one class happens to be absent from the predictions.
conf_mat = metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' prints the counts as plain integers; the default format would show
# counts of 100 or more in scientific notation (e.g. 3.5e+02).
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_title("Confusion_matrix")
ax.set_xlabel("Predicted Class")
ax.set_ylabel("Actual class")
plt.show()

print('Confusion matrix: \n', conf_mat)
# Row-major unpacking of the 2x2 matrix: [[TN, FP], [FN, TP]].
tn, fp, fn, tp = conf_mat.ravel()
print('TP: ', tp)
print('TN: ', tn)
print('FP: ', fp)
print('FN: ', fn)

# Tuning the random forest parameter 'n_estimators' (number of trees) and implementing cross-validation using Grid Search

In [None]:
# Random forest whose number of trees is tuned by a grid search.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rfc = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)

# Candidate forest sizes to compare.
grid_param = {'n_estimators': [200, 250, 300, 350, 400, 450]}

# 5-fold cross-validation, ranking the candidates by recall on the positive class.
gd_sr = GridSearchCV(estimator=rfc, param_grid=grid_param, scoring='recall', cv=5)

In [None]:
# Run the cross-validated search on the training data.
# NOTE(review): X_train/Y_train were already oversampled with SMOTE at this
# point, so validation folds contain synthetic points; the recall reported
# below may be optimistic — confirm whether that is acceptable.
gd_sr.fit(X_train, Y_train)

# Winning parameter setting and its mean cross-validated score.
best_parameters, best_result = gd_sr.best_params_, gd_sr.best_score_
print(best_parameters)
print(best_result)

# Building random forest using the tuned parameter

In [None]:
# Build the forest with the number of trees the grid search actually selected,
# rather than a hard-coded value that can drift out of sync with the search.
# max_features='sqrt' replaces 'auto' (same meaning for classifiers; 'auto' was
# removed in scikit-learn 1.3).
rfc = RandomForestClassifier(
    n_estimators=gd_sr.best_params_['n_estimators'],
    criterion='entropy',
    max_features='sqrt',
    random_state=1,
)
rfc.fit(X_train, Y_train)

# Feature importances labelled with the column names of X, most important first.
featimp = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)
print(featimp)

# Evaluate on the untouched test split.
Y_pred = rfc.predict(X_test)
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted).
conf_mat = metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' prints counts as plain integers instead of scientific notation.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_title("Confusion_matrix")
ax.set_xlabel("Predicted Class")
ax.set_ylabel("Actual class")
plt.show()

print('Confusion matrix: \n', conf_mat)
# Row-major unpacking of the 2x2 matrix: [[TN, FP], [FN, TP]].
tn, fp, fn, tp = conf_mat.ravel()
print('TP: ', tp)
print('TN: ', tn)
print('FP: ', fp)
print('FN: ', fn)

# Selecting features with higher significance and redefining feature set

In [None]:
# Restrict the feature set to five hand-picked columns.
# NOTE(review): this rebinds X, so earlier cells that label importances with
# X's columns will no longer line up if they are re-run after this cell.
selected_features = ['OverTime', 'Age', 'JobSatisfaction', 'StockOptionLevel', 'MonthlyIncome']
X = final_data[selected_features]

# Standardise the reduced feature set (fitted on all rows of X).
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

# Dividing dataset into training and test sets

In [None]:
# Split the raw selected features first and fit the scaler on the training rows
# only, so no test-set statistics leak into training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)  # reuse the training statistics on the test rows

# Balance the training classes only; fit_resample() replaces the removed fit_sample().
smote = SMOTE(random_state = 101)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Same forest configuration as before: the tree count comes from the grid
# search result instead of a hard-coded number, and max_features='sqrt'
# replaces 'auto' (same meaning for classifiers; removed in scikit-learn 1.3).
rfc = RandomForestClassifier(
    n_estimators=gd_sr.best_params_['n_estimators'],
    criterion='entropy',
    max_features='sqrt',
    random_state=1,
)
rfc.fit(X_train, Y_train)

# Evaluate on the untouched test split.
Y_pred = rfc.predict(X_test)
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted).
conf_mat = metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' prints counts as plain integers instead of scientific notation.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_title("Confusion_matrix")
ax.set_xlabel("Predicted Class")
ax.set_ylabel("Actual class")
plt.show()

print('Confusion matrix: \n', conf_mat)
# Row-major unpacking of the 2x2 matrix: [[TN, FP], [FN, TP]].
tn, fp, fn, tp = conf_mat.ravel()
print('TP: ', tp)
print('TN: ', tn)
print('FP: ', fp)
print('FN: ', fn)