In [3]:

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks
import warnings
import os
# NOTE(review): this suppresses every warning from here on, including any
# deprecation or FutureWarning notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [4]:
# Show the current working directory; the relative CSV path used when loading
# the data is resolved against it.
os.getcwd()

'/Users/Kumar/Desktop/lab-handling-data-imbalance-classification'

In [5]:
# Read the customer churn CSV (path is relative to the working directory)
# and preview the first rows.
csv_path = r"files_for_lab/Customer-Churn.csv"
churn_data = pd.read_csv(csv_path)
churn_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [6]:
# Summarise the frame: column names, non-null counts and dtypes.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [None]:
# Let's convert the column names to lower case and replace spaces with underscores.

In [8]:
# Build the normalised column labels: lower-cased, with spaces turned into
# underscores. The list keeps the same order as the frame's columns.
col_names = [
    label.lower().replace(" ", "_")
    for label in churn_data.columns.values.tolist()
]

In [11]:
# Apply the normalised labels with a single rename call. The mapping pairs each
# current label with the new label at the same position, replacing the earlier
# approach of one in-place rename per column.
churn_data = churn_data.rename(columns=dict(zip(churn_data.columns, col_names)))

In [13]:
# Preview two rows to confirm the column labels are now lower case.
churn_data.head(2)

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No


In [14]:
# Single-space strings (" ") in totalcharges stand in for missing values; turn
# them into NaN so the column can be cast to a numeric dtype.
# The result is assigned back rather than using replace(..., inplace=True) on
# the selected column: the in-place form operates on an intermediate object and
# does not update churn_data when pandas Copy-on-Write is active.
churn_data['totalcharges'] = churn_data['totalcharges'].replace(" ", np.nan)

In [16]:
# Cast totalcharges from object (string) to a numeric dtype.
total_charges_numeric = pd.to_numeric(churn_data["totalcharges"])
churn_data["totalcharges"] = total_charges_numeric

In [17]:
# Re-check dtypes and non-null counts after converting totalcharges.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   onlinesecurity    7043 non-null   object 
 7   onlinebackup      7043 non-null   object 
 8   deviceprotection  7043 non-null   object 
 9   techsupport       7043 non-null   object 
 10  streamingtv       7043 non-null   object 
 11  streamingmovies   7043 non-null   object 
 12  contract          7043 non-null   object 
 13  monthlycharges    7043 non-null   float64
 14  totalcharges      7032 non-null   float64
 15  churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [18]:
# Count missing values per column.
churn_data.isnull().sum()

gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [19]:
# Median of totalcharges (pandas skips NaN by default); it is used below to
# replace the null values.
median= churn_data["totalcharges"].median()

In [20]:
# Display the computed median.
median

1397.475

In [21]:
# Replace the null totalcharges values with the median computed above.
churn_data["totalcharges"]=churn_data["totalcharges"].fillna(median)

In [22]:
# Confirm that no column has missing values after the fill.
churn_data.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [23]:
# Numeric columns that are selected and standardised in the following cells.
cols_to_scale=["tenure", "seniorcitizen", "monthlycharges","totalcharges"]

In [24]:
# Take all rows, restricted to the numeric columns that will be scaled.
churn_scale = churn_data.loc[:, cols_to_scale]

In [26]:
# Class distribution of the target: "No" is far more frequent than "Yes", so
# the data is imbalanced. Standardising the numeric features (next cell) is a
# separate preprocessing step and does not address the imbalance; the
# resampling techniques further down do.
churn_data["churn"].value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [28]:
# Standardise the selected numeric columns (zero mean, unit variance) and wrap
# the resulting array back into a DataFrame with the original column labels.
# NOTE(review): the scaler is fit on all rows before the train/test split
# performed later, so test-set statistics influence the scaling - consider
# fitting on the training split only.
scaler = StandardScaler()
scaler.fit(churn_scale)
scaled = scaler.transform(churn_scale)
churn_scaled = pd.DataFrame(data=scaled, columns=cols_to_scale)
churn_scaled

Unnamed: 0,tenure,seniorcitizen,monthlycharges,totalcharges
0,-1.277445,-0.439916,-1.160323,-0.994242
1,0.066327,-0.439916,-0.259629,-0.173244
2,-1.236724,-0.439916,-0.362660,-0.959674
3,0.514251,-0.439916,-0.746535,-0.194766
4,-1.236724,-0.439916,0.197365,-0.940470
...,...,...,...,...
7038,-0.340876,-0.439916,0.665992,-0.128655
7039,1.613701,-0.439916,1.277533,2.243151
7040,-0.870241,-0.439916,-1.168632,-0.854469
7041,-1.155283,2.273159,0.320338,-0.872062


In [None]:
# Let's do the X/y split to fit a logistic regression model.

In [29]:
# Features are the standardised numeric columns; the target is the churn label.
X, y = churn_scaled, churn_data["churn"]

In [30]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [31]:
# Baseline: logistic regression with default settings, trained on the
# original (imbalanced) training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [32]:
# Predict churn labels for the held-out test split with the baseline model.
y_pred=model.predict(X_test)


In [33]:
# Per-class precision / recall / f1 of the baseline model on the test split.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.68      0.70      2113
weighted avg       0.78      0.79      0.78      2113



In [None]:
# Now let's try data-balancing techniques: oversampling (SMOTE) and, further below, undersampling (TomekLinks).

In [34]:
# Oversample the minority class of the TRAINING split only, so the test split
# keeps the original class distribution.
# random_state is fixed so the synthetic samples - and therefore the metrics
# reported afterwards - are reproducible across runs.
smote = SMOTE(random_state=42)
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X, y, test_size=0.3, random_state=42)
X_res_sm, y_res_sm = smote.fit_resample(X_train_sm, y_train_sm)
# Both classes should now have the same number of training rows.
y_res_sm.value_counts()

No     3635
Yes    3635
Name: churn, dtype: int64

In [None]:
# Now let's print the classification report and examine the results.

In [35]:

# Fit a logistic regression on the SMOTE-resampled training data, then score
# it on the test split (which was not resampled).
classification = LogisticRegression(max_iter=10000, random_state=42)
classification.fit(X_res_sm, y_res_sm)

y_sm_predictions = classification.predict(X_test_sm)
smote_report = classification_report(y_test_sm, y_sm_predictions)
print(smote_report)

              precision    recall  f1-score   support

          No       0.89      0.73      0.80      1539
         Yes       0.51      0.76      0.61       574

    accuracy                           0.74      2113
   macro avg       0.70      0.75      0.71      2113
weighted avg       0.79      0.74      0.75      2113



In [None]:
# Comparing these results with the baseline report above:
# - Accuracy decreased (0.79 -> 0.74).
# - For "No", precision increased (0.82 -> 0.89) while recall (0.92 -> 0.73) and f1-score (0.87 -> 0.80) decreased.
# - For "Yes", SMOTE lowered precision (0.68 -> 0.51) but raised recall (0.45 -> 0.76) and f1-score (0.54 -> 0.61).

In [None]:
# Let's try another technique for handling imbalanced data: TomekLinks undersampling.


In [51]:
# Same 70/30 split and seed as before, bound to TomekLinks-specific names.
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [55]:
# Undersample the training split with Tomek links, then show the class counts
# that remain after resampling.
tomek = TomekLinks()
resampled_tl = tomek.fit_resample(X_train_tl, y_train_tl)
X_res_tl, y_res_tl = resampled_tl
y_res_tl.value_counts()

No     3264
Yes    1295
Name: churn, dtype: int64

In [38]:
# Fit a logistic regression on the Tomek-links-resampled training data, then
# score it on the test split (which was not resampled).
classification = LogisticRegression(max_iter=10000, random_state=42)
classification.fit(X_res_tl, y_res_tl)

y_tl_predictions = classification.predict(X_test_tl)
tomek_report = classification_report(y_test_tl, y_tl_predictions)
print(tomek_report)

              precision    recall  f1-score   support

          No       0.83      0.88      0.86      1539
         Yes       0.62      0.52      0.57       574

    accuracy                           0.78      2113
   macro avg       0.73      0.70      0.71      2113
weighted avg       0.77      0.78      0.78      2113



In [None]:
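# Compared with the baseline, the metrics for "No" are essentially unchanged (precision 0.82 -> 0.83, recall 0.92 -> 0.88, f1-score 0.87 -> 0.86).
# For "Yes", recall (0.45 -> 0.52) and f1-score (0.54 -> 0.57) improved while precision dropped (0.68 -> 0.62); relative to SMOTE, accuracy (0.74 -> 0.78) and "Yes" precision (0.51 -> 0.62) are higher.
# In order to get better results we need to first balance the dataset and then apply the model.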