In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

1. Load the dataset and explore the variables.

In [2]:
churnData = pd.read_csv('files_for_lab/customer_churn.csv',sep=",")
churnData.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

2. We will try to predict variable `Churn` using a logistic regression on variables `tenure`, `SeniorCitizen`,`MonthlyCharges`.

In [4]:
churnData = churnData[['tenure', 'SeniorCitizen','MonthlyCharges','Churn']]

In [5]:
churnData

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.30,No
4,2,0,70.70,Yes
...,...,...,...,...
7038,24,0,84.80,No
7039,72,0,103.20,No
7040,11,0,29.60,No
7041,4,1,74.40,Yes


3. Extract the target variable.

In [6]:
y=churnData['Churn']
X=churnData.drop('Churn', axis =1)

In [7]:
X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


5. Build the logistic regression model.

In [8]:
transformer = StandardScaler().fit(X)
X = transformer.transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

6. Evaluate the model.

In [11]:
classification.score(X_test,y_test)

0.7936583057264552

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



In [13]:
print(confusion_matrix(y_test, predictions,normalize='all'))

[[0.67203029 0.05631803]
 [0.15002366 0.12162802]]


7. Even a simple model will give us more than 70% accuracy. Why?



In [14]:
#because there is a big invalance in the data, over 73% of the data is "No" so if 
#we set "No" to all predictions the accuracy will be 73%
5174/(5174+1869)

0.7346301292063041

---------------

8. **Synthetic Minority Oversampling TEchnique (SMOTE)** is an over sampling technique based on nearest neighbors that adds new points between existing points. Apply `imblearn.over_sampling.SMOTE` to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

In [15]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)


In [17]:
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

In [18]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.76      0.74      0.75      1574
         Yes       0.74      0.76      0.75      1531

    accuracy                           0.75      3105
   macro avg       0.75      0.75      0.75      3105
weighted avg       0.75      0.75      0.75      3105



In [19]:
classification.score(X_test, y_test)

0.7494363929146538

In [20]:
print(confusion_matrix(y_test, predictions,normalize='all'))

[[0.37552335 0.13140097]
 [0.11916264 0.37391304]]


There is an improvement is the class separation with a more even precission between predictions

-----------

9. **Tomek links** are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. Apply `imblearn.under_sampling.TomekLinks` to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?


In [21]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No     4697
Yes    1869
Name: Churn, dtype: int64

In [22]:
X_tl2, y_tl2 = tl.fit_resample(X_tl, y_tl)
y_tl2.value_counts()

No     4544
Yes    1869
Name: Churn, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=42)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)


In [24]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.83      0.91      0.87      1390
         Yes       0.71      0.54      0.61       580

    accuracy                           0.80      1970
   macro avg       0.77      0.72      0.74      1970
weighted avg       0.79      0.80      0.79      1970



In [25]:
classification.score(X_test, y_test)

0.8

In [26]:
print(confusion_matrix(y_test, predictions,normalize='all'))

[[0.64162437 0.06395939]
 [0.13604061 0.15837563]]


there is an improvement in overall score but less precision for the value with low representation

---------------------------------

*.Aplying both methods for testing

In [33]:
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(X_sm, y_sm)
y_tl.value_counts()

Yes    5174
No     4694
Name: Churn, dtype: int64

In [34]:
X_tl2, y_tl2 = tl.fit_resample(X_tl, y_tl)
y_tl2.value_counts()

Yes    5079
No     4694
Name: Churn, dtype: int64

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=42)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

In [30]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.76      0.73      0.74      1408
         Yes       0.76      0.79      0.78      1553

    accuracy                           0.76      2961
   macro avg       0.76      0.76      0.76      2961
weighted avg       0.76      0.76      0.76      2961



In [31]:
classification.score(X_test, y_test)

0.7615670381627828

In [32]:
print(confusion_matrix(y_test, predictions,normalize='all'))

[[0.34785545 0.12765957]
 [0.11077339 0.41371158]]


Not much improvement