## 8-5 Lab | Handling Data Imbalance in Classification Models

- Import the required libraries and modules that you would need.

In [300]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

- Read that data into Python and call the dataframe `churnData`

In [301]:
churnData = pd.read_csv('Customer-Churn.txt')

In [302]:
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


- Check the datatypes of all the columns in the data. You would see that the column `TotalCharges` is object type. Convert this column into numeric type using `pd.to_numeric` function

In [303]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [304]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [305]:
print(churnData['TotalCharges'].dtype)

float64


- Check for null values in the dataframe. Replace the null values.

In [306]:
nulls_percent_df = pd.DataFrame(churnData.isna().sum()/len(churnData)).reset_index()
nulls_percent_df.columns = ['column_name', 'nulls_percentage']
nulls_percent_df

Unnamed: 0,column_name,nulls_percentage
0,gender,0.0
1,SeniorCitizen,0.0
2,Partner,0.0
3,Dependents,0.0
4,tenure,0.0
5,PhoneService,0.0
6,OnlineSecurity,0.0
7,OnlineBackup,0.0
8,DeviceProtection,0.0
9,TechSupport,0.0


In [307]:
#As there are just a few null values in Total Charges I will imput the mean 
Total_changes_mean = np.mean(churnData['TotalCharges'])
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(Total_changes_mean)

- Use the following features: `tenure`, `SeniorCitizen`, `MonthlyCharges` and `TotalCharges`:
  - Scale the features either by using normalizer or a standard scaler.

In [308]:
from sklearn.preprocessing import MinMaxScaler
#from sklearn.preprocessing import StandardScaler

In [309]:
#I will scale the selected features:
columns_to_scale = ['tenure','SeniorCitizen','MonthlyCharges', 'TotalCharges']

In [310]:
#setting the X with the selected features from the df:
X = churnData[columns_to_scale]

In [311]:
X.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.5
2,2,0,53.85,108.15
3,45,0,42.3,1840.75
4,2,0,70.7,151.65


In [312]:
#Initializing and fitting the specific columns
transformer=MinMaxScaler().fit(X)
transformer
x_scaled=transformer.transform(X)

In [313]:
#Checking result:
print(x_scaled.shape)
pd.DataFrame(x_scaled)

(7043, 4)


Unnamed: 0,0,1,2,3
0,0.013889,0.0,0.115423,0.001275
1,0.472222,0.0,0.385075,0.215867
2,0.027778,0.0,0.354229,0.010310
3,0.625000,0.0,0.239303,0.210241
4,0.027778,0.0,0.521891,0.015330
...,...,...,...,...
7038,0.333333,0.0,0.662189,0.227521
7039,1.000000,0.0,0.845274,0.847461
7040,0.152778,0.0,0.112935,0.037809
7041,0.055556,1.0,0.558706,0.033210


In [314]:
#re-asigning the column names back
scaled_df = pd.DataFrame(x_scaled, columns=columns_to_scale)

In [315]:
scaled_df

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,0.013889,0.0,0.115423,0.001275
1,0.472222,0.0,0.385075,0.215867
2,0.027778,0.0,0.354229,0.010310
3,0.625000,0.0,0.239303,0.210241
4,0.027778,0.0,0.521891,0.015330
...,...,...,...,...
7038,0.333333,0.0,0.662189,0.227521
7039,1.000000,0.0,0.845274,0.847461
7040,0.152778,0.0,0.112935,0.037809
7041,0.055556,1.0,0.558706,0.033210


In [316]:
# Combine with non-scaled columns
result_df = churnData.drop(columns_to_scale, axis=1).join(scaled_df)


In [317]:
result_df.head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,Churn,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,No,0.013889,0.0,0.115423,0.001275
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,No,0.472222,0.0,0.385075,0.215867
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,Yes,0.027778,0.0,0.354229,0.01031
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,No,0.625,0.0,0.239303,0.210241
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,Yes,0.027778,0.0,0.521891,0.01533


  - Split the data into a training set and a test set.

In [318]:
#The purpose of the model is to identify customers that are more likely to default/churn 

In [319]:
result_df.Churn.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [320]:
#Encoding the variable Churn (the target variable)
dummies = pd.get_dummies(result_df['Churn'])

In [321]:
dummies = dummies.astype(int)

In [322]:
churnData = pd.concat([result_df, dummies], axis=1)

In [323]:
churnData = churnData.drop(['Churn','No'], axis=1)

In [324]:
churnData

Unnamed: 0,gender,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Yes
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,0.013889,0.0,0.115423,0.001275,0
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,0.472222,0.0,0.385075,0.215867,0
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,0.027778,0.0,0.354229,0.010310,1
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,0.625000,0.0,0.239303,0.210241,0
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,0.027778,0.0,0.521891,0.015330,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,0.333333,0.0,0.662189,0.227521,0
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,1.000000,0.0,0.845274,0.847461,0
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,0.152778,0.0,0.112935,0.037809,0
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,0.055556,1.0,0.558706,0.033210,1


In [325]:
churnData = churnData.rename(columns={'Yes': 'Churn'})

In [326]:
#Now, splitting the data into train and test:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X=scaled_df
y=churnData['Churn']

In [327]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  - Fit a logistic regression model on the training data.

In [328]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

In [329]:
logreg.score(X_test, y_test)

0.8034066713981547

In [330]:
y_pred = logreg.predict(X_test)

  - Check the accuracy on the test data.

In [331]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8034066713981547
Confusion Matrix:
[[959  77]
 [200 173]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.93      0.87      1036
           1       0.69      0.46      0.56       373

    accuracy                           0.80      1409
   macro avg       0.76      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409



Managing imbalance in the dataset

- Check for the imbalance.
- Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
- Each time fit the model and see how the accuracy of the model is.

In [332]:
scaled_df

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,0.013889,0.0,0.115423,0.001275
1,0.472222,0.0,0.385075,0.215867
2,0.027778,0.0,0.354229,0.010310
3,0.625000,0.0,0.239303,0.210241
4,0.027778,0.0,0.521891,0.015330
...,...,...,...,...
7038,0.333333,0.0,0.662189,0.227521
7039,1.000000,0.0,0.845274,0.847461
7040,0.152778,0.0,0.112935,0.037809
7041,0.055556,1.0,0.558706,0.033210


In [333]:
#Checking imbalance in the data:
result_df.Churn.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [335]:
#As I will be downsizing to simplify the process I will merge the previous data:
churn_column = churnData['Churn']

In [336]:
merged_data = pd.concat([scaled_df, churn_column], axis=1)

In [337]:
data = merged_data

In [338]:
#DOWNSAMPLING:
category_0 = data[data['Churn'] == 0]
category_1 = data[data['Churn'] == 1]

In [339]:
category_0 = category_0.sample(len(category_1))
print(category_0.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


In [340]:
data = pd.concat([category_0, category_1], axis=0)
#shuffling the data
data = data.sample(frac=1)
data['Churn'].value_counts()

Churn
1    1869
0    1869
Name: count, dtype: int64

In [341]:
#fit the model and see how the accuracy of the model is
X = data.drop('Churn', axis=1)  
y = data['Churn']  

In [342]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [343]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

In [344]:
logreg.score(X_test, y_test)

0.7219251336898396

In [347]:
#UPSAMPLING with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [348]:
X_resampled, y_resampled = smote.fit_resample(X, y)

In [349]:
y_resampled.value_counts()

Churn
1    1869
0    1869
Name: count, dtype: int64

In [350]:
#fitting the model
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [351]:
logreg = LogisticRegression(random_state=42)

In [352]:
logreg.fit(X_train, y_train)

In [353]:
y_pred = logreg.predict(X_test)

In [354]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7219251336898396
