In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks
import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
churnData=pd.read_csv("Customer_churn.csv")
churnData.drop('customerID', axis=1, inplace=True)
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
churnData['Churn'].value_counts() # As we can see, the data is imbalanced. 

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
col_names=churnData.columns.values.tolist()
for col in range(len(col_names)):
    col_names[col]=col_names[col].lower()
    col_names[col]=col_names[col].replace(" ", "_")

In [5]:
for i in range(len(col_names)):
    churnData.rename(columns={churnData.columns.values[i]:col_names[i]},inplace=True)

In [6]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   multiplelines     7043 non-null   object 
 7   internetservice   7043 non-null   object 
 8   onlinesecurity    7043 non-null   object 
 9   onlinebackup      7043 non-null   object 
 10  deviceprotection  7043 non-null   object 
 11  techsupport       7043 non-null   object 
 12  streamingtv       7043 non-null   object 
 13  streamingmovies   7043 non-null   object 
 14  contract          7043 non-null   object 
 15  paperlessbilling  7043 non-null   object 
 16  paymentmethod     7043 non-null   object 


In [7]:
churnData = churnData[churnData.totalcharges != " "]

In [8]:
churnData["totalcharges"].value_counts()

20.2      11
19.75      9
20.05      8
19.9       8
19.65      8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: totalcharges, Length: 6530, dtype: int64

In [9]:
churnData['totalcharges'] = churnData['totalcharges'].astype(float)

In [10]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   seniorcitizen     7032 non-null   int64  
 2   partner           7032 non-null   object 
 3   dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   phoneservice      7032 non-null   object 
 6   multiplelines     7032 non-null   object 
 7   internetservice   7032 non-null   object 
 8   onlinesecurity    7032 non-null   object 
 9   onlinebackup      7032 non-null   object 
 10  deviceprotection  7032 non-null   object 
 11  techsupport       7032 non-null   object 
 12  streamingtv       7032 non-null   object 
 13  streamingmovies   7032 non-null   object 
 14  contract          7032 non-null   object 
 15  paperlessbilling  7032 non-null   object 
 16  paymentmethod     7032 non-null   object 


In [11]:
df_num = churnData.select_dtypes(include = np.number)
df_cat = churnData.select_dtypes(include = object)

In [12]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   seniorcitizen   7032 non-null   int64  
 1   tenure          7032 non-null   int64  
 2   monthlycharges  7032 non-null   float64
 3   totalcharges    7032 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 274.7 KB


In [13]:
scaler=StandardScaler()

In [14]:
scaled=scaler.fit_transform(df_num)

In [15]:
df_num = pd.DataFrame(scaled,columns=df_num.columns)

In [16]:
df_num.head(3)

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,-0.440327,-1.280248,-1.161694,-0.994194
1,-0.440327,0.064303,-0.260878,-0.17374
2,-0.440327,-1.239504,-0.363923,-0.959649


In [17]:
#Let's drop some useless columns
df_cat = churnData.select_dtypes(include = 'object')
df_cat.head(2)

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,churn
0,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No


In [18]:
df_cat = df_cat.drop(['churn'], axis=1)
df_cat.head(2)

Unnamed: 0,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod
0,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check


In [19]:
#One Hot/Label Encoding
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(df_cat)
encoded = encoder.transform(df_cat).toarray()
cols = encoder.get_feature_names(input_features=df_cat.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded#.head()

Unnamed: 0,gender_Male,partner_Yes,dependents_Yes,phoneservice_Yes,multiplelines_No phone service,multiplelines_Yes,internetservice_Fiber optic,internetservice_No,onlinesecurity_No internet service,onlinesecurity_Yes,...,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No internet service,streamingmovies_Yes,contract_One year,contract_Two year,paperlessbilling_Yes,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7028,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
7029,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7030,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
X_concat=np.concatenate([df_num,encoded], axis=1)
y= churnData['churn']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.3, random_state=42)

In [22]:
#lets use smote tecnique to balance the dataset
smote=SMOTE()

In [23]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
y_train_sm.value_counts()

Yes    3614
No     3614
Name: churn, dtype: int64

In [24]:
#Logistic regression
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train_sm, y_train_sm)

y_sm_predictions = classification.predict(X_test)
print(classification_report(y_test, y_sm_predictions))


              precision    recall  f1-score   support

          No       0.91      0.73      0.81      1549
         Yes       0.52      0.79      0.63       561

    accuracy                           0.75      2110
   macro avg       0.71      0.76      0.72      2110
weighted avg       0.80      0.75      0.76      2110



In [25]:
#lets use the decision tree and compare the results
clf_model = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=4, min_samples_leaf=5)   
clf_model.fit(X_train_sm,y_train_sm)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=5, random_state=42)

In [26]:
y_predict_dt = clf_model.predict(X_test)

In [27]:
accuracy_score(y_test,y_predict_dt)

0.7279620853080568

In [28]:
print(classification_report(y_test, y_predict_dt))

              precision    recall  f1-score   support

          No       0.90      0.71      0.79      1549
         Yes       0.49      0.78      0.60       561

    accuracy                           0.73      2110
   macro avg       0.70      0.74      0.70      2110
weighted avg       0.79      0.73      0.74      2110



in both the models the results are similar but we have better resuts in logistic regression

In [29]:
#tomelinks downsampling method
tomek = TomekLinks()

In [30]:
X_train_tl,y_train_tl=tomek.fit_resample(X_train,y_train)

In [31]:
y_train_tl.value_counts()

No     3296
Yes    1308
Name: churn, dtype: int64

In [32]:
classification_lr=LogisticRegression(random_state=42, max_iter=10000)
classification_lr.fit(X_train_tl,y_train_tl)

LogisticRegression(max_iter=10000, random_state=42)

In [33]:
y_pred_tl=classification_lr.predict(X_test)

In [34]:
accuracy_score(y_test,y_pred_tl)

0.7900473933649289

In [35]:
print(classification_report(y_test, y_pred_tl))

              precision    recall  f1-score   support

          No       0.86      0.85      0.86      1549
         Yes       0.60      0.61      0.61       561

    accuracy                           0.79      2110
   macro avg       0.73      0.73      0.73      2110
weighted avg       0.79      0.79      0.79      2110



In [36]:
clf_model_tl = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=3, min_samples_leaf=5) 
clf_model_tl.fit(X_train_tl,y_train_tl)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=42)

In [37]:
y_predict_dt_tl = clf_model.predict(X_test)

In [38]:
accuracy_score(y_test,y_predict_dt_tl)

0.7279620853080568

In [39]:
print(classification_report(y_test, y_predict_dt_tl))

              precision    recall  f1-score   support

          No       0.90      0.71      0.79      1549
         Yes       0.49      0.78      0.60       561

    accuracy                           0.73      2110
   macro avg       0.70      0.74      0.70      2110
weighted avg       0.79      0.73      0.74      2110



from using the two modes we can see that when predictic yes values logistic regression has better f1 score and recall values 