# Lab | Imbalanced data

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the customer churn extract; the file name is resolved relative to the
# notebook's working directory.
churn_csv_path = 'customer_churn.csv'
churnData = pd.read_csv(churn_csv_path)


In [3]:
# Preview the first rows (head() shows five by default) to sanity-check the load.
churnData.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Numeric predictors used by every model in this notebook, and the raw
# 'Yes'/'No' churn label as the target.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges']
xdata = churnData[feature_columns]
ydata = churnData.loc[:, 'Churn']

In [5]:
# Check dtypes and non-null counts of the selected feature columns.
xdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tenure          7043 non-null   int64  
 1   SeniorCitizen   7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 165.2 KB


In [6]:
# Summary statistics (count, mean, std, min/max, quartiles) for the selected features.
xdata.describe()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,32.371149,0.162147,64.761692
std,24.559481,0.368612,30.090047
min,0.0,0.0,18.25
25%,9.0,0.0,35.5
50%,29.0,0.0,70.35
75%,55.0,0.0,89.85
max,72.0,1.0,118.75


In [23]:
# Class balance of the target; the recorded output shows far more 'No' rows
# than 'Yes' rows, which is the imbalance this lab is about.
ydata.value_counts()


No     5174
Yes    1869
Name: Churn, dtype: int64

In [24]:
# Encode the target as integers: 1 where the label is 'Yes', 0 otherwise.
is_churn = ydata == 'Yes'
ydata = np.where(is_churn, 1, 0)
print(ydata)

[0 0 1 ... 0 1 0]


In [25]:
# Wrap the encoded labels in a one-column frame named 'Churn' and preview it.
churn_column = {'Churn': ydata}
ydata = pd.DataFrame(churn_column)
ydata.head(5)

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [26]:
# Standardise the features; the fitted scaler is kept in `transformer`.
# NOTE(review): the scaler is fitted on every row of xdata, i.e. before the
# train/test split in the next cell, so test rows influence the scaling
# statistics. Consider fitting on the training split only.
transformer = StandardScaler()
transformer.fit(xdata)
scaled_x = transformer.transform(xdata)

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled rows for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_x,
    ydata,
    test_size=0.2,
    random_state=100,
)

In [28]:
# Baseline: logistic regression on the original (imbalanced) class distribution.
# - `multi_class` is no longer passed: it is deprecated in recent scikit-learn
#   releases and makes no difference for a two-class target.
# - y_train / y_test are one-column frames; np.ravel hands the estimator the
#   1-D label vector it expects instead of relying on an implicit conversion.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, np.ravel(y_train))
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, np.ravel(y_test))))

The accuracy of the logistic_regression model is: 0.78 


In [None]:
from sklearn.metrics import cohen_kappa_score

# Kappa for the baseline model. No resampling has been applied at this point,
# so the message must not claim the data was undersampled.
print("The kappa of the logistic regression model is: %4.2f " % (cohen_kappa_score(y_pred, y_test)))

### Synthetic Minority Oversampling Technique (SMOTE)

In [7]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority rows, and every metric computed
# from them, are identical on each Restart & Run All.
smote = SMOTE(random_state=100)

# Standardise the features before oversampling, then resample against the raw
# 'Yes'/'No' churn label.
transformer = StandardScaler().fit(xdata)
X = transformer.transform(xdata)
y = churnData['Churn']

# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported name for the same operation.
# NOTE(review): oversampling happens before the train/test split performed in
# the next section, so synthetic rows derived from eventual test rows may end
# up in the training set. Consider resampling the training split only.
X_sm, y_sm = smote.fit_resample(X, y)

# Keep the labels as a one-column frame (column 0) and show the new class balance.
y_sm = pd.DataFrame(data=np.array(y_sm).flatten())
y_sm[0].value_counts()

Yes    5174
No     5174
Name: 0, dtype: int64

## Model

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the SMOTE-balanced rows for evaluation; the fixed seed keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.2,
    random_state=100,
)

In [10]:
# Logistic regression trained on the SMOTE-balanced data.
# - `multi_class` is no longer passed: it is deprecated in recent scikit-learn
#   releases and makes no difference for a two-class target.
# - y_train / y_test are one-column frames; np.ravel hands the estimator the
#   1-D label vector it expects instead of relying on an implicit conversion.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, np.ravel(y_train))
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, np.ravel(y_test))))

The accuracy of the logistic_regression model is: 0.73 


In [11]:
from sklearn.metrics import cohen_kappa_score

# Kappa for the model trained on SMOTE-resampled data. SMOTE is an
# oversampling technique, so the message says so rather than "undersampling".
print("The kappa of the logistic regression model after oversampling with SMOTE is: %4.2f " % (cohen_kappa_score(y_pred, y_test)))

The kappa of the logistic regression model after undersampling is: 0.46 


### Undersampling using TomekLinks

In [12]:
from imblearn.under_sampling import TomekLinks

# Pass the strategy by keyword: recent imbalanced-learn releases no longer
# accept it positionally.
tl = TomekLinks(sampling_strategy='majority')

# Resample against the raw 'Yes'/'No' column, as the SMOTE section does.
# `ydata` has been re-bound to a 0/1 frame earlier in the notebook, which would
# not reproduce the 'No'/'Yes' counts recorded for this cell.
# `fit_sample` has been removed from imbalanced-learn; use `fit_resample`.
X_tl, y_tl = tl.fit_resample(xdata, churnData['Churn'])

# Keep the labels as a one-column frame (column 0) and show the new class balance.
y_tl = pd.DataFrame(data=np.array(y_tl).flatten())
y_tl[0].value_counts()

No     4711
Yes    1869
Name: 0, dtype: int64

In [13]:
# Second undersampling pass over the output of the first one.
# `fit_sample` has been removed from imbalanced-learn; use `fit_resample`.
# y_tl is a one-column frame, so its single column (0) is passed as the 1-D target.
X_tl2, y_tl2 = tl.fit_resample(X_tl, y_tl[0])

# Keep the labels as a one-column frame (column 0) and show the new class balance.
y_tl2 = pd.DataFrame(data=np.array(y_tl2).flatten())
y_tl2[0].value_counts()

No     4575
Yes    1869
Name: 0, dtype: int64

In [14]:
# Fit a scaler on the once-undersampled features (X_tl) and keep the
# standardised copy in X.
# NOTE(review): X is not referenced by the later cells shown in this notebook,
# and it is built from X_tl rather than X_tl2 - confirm whether it is still needed.
transformer = StandardScaler()
transformer.fit(X_tl)
X = transformer.transform(X_tl)

## Model

In [16]:
from sklearn.model_selection import train_test_split

# Standardise the twice-undersampled features that are actually modelled, as
# the baseline and SMOTE experiments do before splitting. Previously the raw
# X_tl2 was split here, so this model alone was trained on unscaled features;
# the scaled matrix from the preceding cell was derived from X_tl, whose row
# count does not match y_tl2.
X_tl2_scaled = StandardScaler().fit(X_tl2).transform(X_tl2)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_tl2_scaled, y_tl2, test_size=0.2, random_state=100)

In [17]:
# Logistic regression trained on the TomekLinks-undersampled data.
# - `multi_class` is no longer passed: it is deprecated in recent scikit-learn
#   releases and makes no difference for a two-class target.
# - y_train / y_test are one-column frames; np.ravel hands the estimator the
#   1-D label vector it expects instead of relying on an implicit conversion.
classification = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, np.ravel(y_train))
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, np.ravel(y_test))))

The accuracy of the logistic_regression model is: 0.80 


In [18]:
from sklearn.metrics import cohen_kappa_score

# Agreement between predictions and held-out labels for the undersampled model.
kappa = cohen_kappa_score(y_pred, y_test)
print("The kappa of the logistic regression model after undersampling is: %4.2f " % kappa)

The kappa of the logistic regression model after undersampling is: 0.45 
