### Importing the libraries

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import math
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
from sklearn import preprocessing


### Reading the file

In [2]:
df = pd.read_csv("Customer-Churn.csv")
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [4]:

numerical_data=df.select_dtypes(include=np.number)
categorical_data=df.select_dtypes(include='object')

from sklearn.impute import KNNImputer
knn=KNNImputer()
new_numerical_data=knn.fit_transform(numerical_data)
numerical_data=pd.DataFrame(new_numerical_data,columns=numerical_data.columns)
df=pd.merge(categorical_data,numerical_data,left_index=True,right_index=True)
df

Unnamed: 0,gender,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,TotalCharges,Churn,SeniorCitizen,tenure,MonthlyCharges
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,29.85,No,0.0,1.0,29.85
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,1889.5,No,0.0,34.0,56.95
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,108.15,Yes,0.0,2.0,53.85
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,1840.75,No,0.0,45.0,42.30
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,151.65,Yes,0.0,2.0,70.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,1990.5,No,0.0,24.0,84.80
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,7362.9,No,0.0,72.0,103.20
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,346.45,No,0.0,11.0,29.60
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,306.6,Yes,1.0,4.0,74.40


### Changing the datatype of the column "TotalCharges"

In [5]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors ="coerce")


### Replacing nans of the TotalCharges column with mean

In [6]:
df['TotalCharges']=df['TotalCharges'].replace(np.NaN,df['TotalCharges'].mean())

In [7]:
df.isna().sum()

gender              0
Partner             0
Dependents          0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
TotalCharges        0
Churn               0
SeniorCitizen       0
tenure              0
MonthlyCharges      0
dtype: int64

### Dummification

In [8]:
from sklearn.preprocessing import OneHotEncoder
encoder = encoder = OneHotEncoder(drop='first').fit(categorical_data)

encoded_categoricals = encoder.transform(categorical_data).toarray()
encoded_categoricals = pd.DataFrame(encoded_categoricals)

In [9]:
X = pd.concat([numerical_data, encoded_categoricals], axis = 1)

### X-Y Split

In [10]:
y = df['Churn']
X = df[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]

In [11]:
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [12]:
X.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
TotalCharges      0
dtype: int64

### Checking for the Imbalance of the data

In [13]:
y.value_counts().to_frame().T

Unnamed: 0,No,Yes
Churn,5174,1869


### Split Train and Test data 

In [14]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state=42)

### Using SMOTE technique to balance the data

In [15]:
from imblearn.over_sampling import SMOTE
x_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)

In [16]:
y_train_resampled.value_counts().to_frame().T

Unnamed: 0,No,Yes
Churn,3635,3635


### LOGISTIC REGRESSION

In [17]:
logreg = LogisticRegression(random_state = 42)

### Fitting the model with data

In [18]:
logreg.fit(x_train_resampled, y_train_resampled)

LogisticRegression(random_state=42)

In [19]:
y_pred = logreg.predict(X_test)

### Classification Report

In [20]:
target_names = ['Churn', 'Not Churn']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Churn       0.90      0.71      0.79      1539
   Not Churn       0.50      0.79      0.62       574

    accuracy                           0.73      2113
   macro avg       0.70      0.75      0.70      2113
weighted avg       0.79      0.73      0.75      2113



In [21]:
print("Train Score : {}".format(logreg.score(x_train_resampled, y_train_resampled)))

Train Score : 0.7382393397524072


### DECISION TREE

In [22]:
classifier = DecisionTreeClassifier(min_samples_leaf=50)
classifier.fit(x_train_resampled, y_train_resampled)

DecisionTreeClassifier(min_samples_leaf=50)

In [23]:
classifier.get_depth()

14

In [24]:
y_pred = classifier.predict(X_test)

In [25]:
accuracy_train = classifier.score(x_train_resampled, y_train_resampled)
print(f"Accuracy on the training set: {accuracy_train:.0%}")

Accuracy on the training set: 79%


### Classification Report

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.86      0.77      0.81      1539
         Yes       0.52      0.67      0.59       574

    accuracy                           0.74      2113
   macro avg       0.69      0.72      0.70      2113
weighted avg       0.77      0.74      0.75      2113



In [27]:
y_classifier_probabilities = classifier.predict_proba(x_train_resampled)

### Logistic Regression after TomekLinks

In [28]:
t = TomekLinks()
# transform the dataset
X_train_tomek,y_train_tomek = t.fit_resample(X_train,y_train)

In [29]:
y_train_tomek.value_counts().to_frame().T

Unnamed: 0,No,Yes
Churn,3217,1295


In [30]:
logreg.fit(X_train_tomek, y_train_tomek)

LogisticRegression(random_state=42)

In [31]:
print("Train Score : {}".format(logreg.score(X_train_tomek,y_train_tomek)))

Train Score : 0.7898936170212766


### Decision Tree after TomekLinks

In [32]:
classifier = DecisionTreeClassifier(min_samples_leaf=50)
classifier.fit(X_train_tomek,y_train_tomek)

DecisionTreeClassifier(min_samples_leaf=50)

In [33]:
accuracy_train = classifier.score(X_train_tomek,y_train_tomek)
print(f"Accuracy on the training set: {accuracy_train:.0%}")

Accuracy on the training set: 81%
