In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import imblearn

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

**1. Data cleaning and splitting to train and test sets**

In [4]:
churnData = pd.read_csv('../DATA_Customer-Churn.txt')

In [5]:
churnData.shape

(7043, 16)

In [6]:
churnData.head(4)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No


In [7]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [9]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [10]:
churnData[churnData['TotalCharges'].isna()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No


In [11]:
churnData['tenure'].value_counts(ascending=True).head(2)

tenure
0     11
36    50
Name: count, dtype: int64

In [12]:
# drop rows with Null values for total charge - these are also the clients with 0 tenure time 
# only 11 rows 

churnData.dropna(inplace=True)

In [13]:
numeric = churnData.select_dtypes(include='number')

In [14]:
categorical = churnData.select_dtypes(include='object')

In [15]:
for col in categorical:
    print(categorical[col].value_counts())

gender
Male      3549
Female    3483
Name: count, dtype: int64
Partner
No     3639
Yes    3393
Name: count, dtype: int64
Dependents
No     4933
Yes    2099
Name: count, dtype: int64
PhoneService
Yes    6352
No      680
Name: count, dtype: int64
OnlineSecurity
No                     3497
Yes                    2015
No internet service    1520
Name: count, dtype: int64
OnlineBackup
No                     3087
Yes                    2425
No internet service    1520
Name: count, dtype: int64
DeviceProtection
No                     3094
Yes                    2418
No internet service    1520
Name: count, dtype: int64
TechSupport
No                     3472
Yes                    2040
No internet service    1520
Name: count, dtype: int64
StreamingTV
No                     2809
Yes                    2703
No internet service    1520
Name: count, dtype: int64
StreamingMovies
No                     2781
Yes                    2731
No internet service    1520
Name: count, dtype: int64
Contract
M

In [16]:
cat_dummies = pd.get_dummies(categorical, drop_first=True)

In [17]:
cat_dummies.shape, numeric.shape

((7032, 19), (7032, 4))

In [18]:
churn_df = pd.concat([numeric, pd.get_dummies(categorical, drop_first=True)], axis=1)

**Splitting and scaling the data**

In [19]:
churn_df.head(3)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
2,0,2,53.85,108.15,True,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,True


In [20]:
X = churn_df.drop('Churn_Yes', axis=1).copy()
y = churn_df['Churn_Yes'].copy()

In [21]:
def split_scale(X, y, scaler = MinMaxScaler(), test_size=0.2, random_state=None):
    """Function that splits data into train and test sets and scales the features.
    
    Required Parameters:
        - X - data frame with features
        - y - target variable
    Optiona Parameters:
        - scaler - how to scale features, by default MinMaxScaler() is used
        - test_size - by default 20% of data
        - random_state - by default None
        
    Returns: training and test data sets
        X_train, X_test, y_train, y_test    
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    scaler = scaler
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    return X_train, X_test, y_train, y_test

In [22]:
X_train, X_test, y_train, y_test = split_scale(X, y) #, scale=StandardScaler())

In [23]:
X_train.shape[0], X_test.shape[0]

(5625, 1407)

**Logistic Regression Model**

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [34]:
def model_evaluation(X_test, y_test, y_pred): 
    
    scores = {
        #'score' : ['{:.3f}'.format(model.score(X_test, y_test))],
        'precission' : ['{:.3f}'.format(precision_score(y_test, y_pred))],
        'accuracy' : ['{:.3f}'.format(accuracy_score(y_test, y_pred))],
        'recall' : ['{:.3f}'.format(recall_score(y_test, y_pred))]     
    }
  
    summary = pd.DataFrame(scores)
    
    return summary

In [36]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

logReg = LogisticRegression()

logReg.fit(X_train, y_train)

y_pred_train_reg = logReg.predict(X_train)

train = model_evaluation(X_train, y_train, y_pred_train_reg)


y_pred = logReg.predict(X_test)

test = model_evaluation(X_test, y_test, y_pred)


print("Model ecaluaition on training data:")
display(train)
print()
print("Model evaluation for test data:")
display(test)

Model ecaluaition on training data:


Unnamed: 0,precission,accuracy,recall
0,0.659,0.804,0.544



Model evaluation for test data:


Unnamed: 0,precission,accuracy,recall
0,0.612,0.786,0.508


In [37]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[918, 119],
       [182, 188]], dtype=int64)

**Knn Classifier**


In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
knnClass = KNeighborsClassifier(n_neighbors=5)

knnClass.fit(X_train, y_train)

y_train_pred_knn = knnClass.predict(X_train)

train = model_evaluation(X_train, y_train, y_train_pred_knn)

# ======
y_pred_knn = knnClass.predict(X_test)

test = model_evaluation(X_test, y_test, y_pred_knn)

print("Model ecaluaition on training data:")
display(train)
print()
print("Model evaluation for test data:")
display(test)

Model ecaluaition on training data:


Unnamed: 0,precission,accuracy,recall
0,0.734,0.839,0.624



Model evaluation for test data:


Unnamed: 0,precission,accuracy,recall
0,0.565,0.767,0.495


**Tree Classifier**

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree, export_text

In [41]:
tree = DecisionTreeClassifier(max_depth=10)

tree.fit(X_train, y_train)

y_train_pred_tree = tree.predict(X_train)

train = model_evaluation(X_train, y_train, y_train_pred_tree)

y_pred_tree = tree.predict(X_test)

test = model_evaluation(X_test, y_test, y_pred_tree)

print("Model ecaluaition on training data:")
display(train)
print()
print("Model evaluation for test data:")
display(test)

Model ecaluaition on training data:


Unnamed: 0,precission,accuracy,recall
0,0.791,0.865,0.672



Model evaluation for test data:


Unnamed: 0,precission,accuracy,recall
0,0.555,0.763,0.503


**Cross validation**

In [47]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [44]:
logReg = LogisticRegression()

scoring_metrics = ['accuracy', 'precision', 'recall']

cv_scores = cross_validate(logReg, X_train, y_train, scoring=scoring_metrics, cv=5)

In [45]:
for k, v in cv_scores.items():
    print(f"{k}: {v.mean():.3f}")

    #cv_scores

fit_time: 0.019
score_time: 0.008
test_accuracy: 0.801
test_precision: 0.655
test_recall: 0.537


In [46]:
cv_scores

{'fit_time': array([0.02219486, 0.02782297, 0.01548409, 0.01920676, 0.01213193]),
 'score_time': array([0.00998902, 0.00697994, 0.00549579, 0.00802422, 0.00801897]),
 'test_accuracy': array([0.78755556, 0.79733333, 0.79911111, 0.80266667, 0.81866667]),
 'test_precision': array([0.62151394, 0.64285714, 0.65416667, 0.65354331, 0.70386266]),
 'test_recall': array([0.52      , 0.54      , 0.52333333, 0.55333333, 0.54849498])}

In [54]:
scores = cross_val_score(logReg, X_train, y_train, cv=5, scoring='precision')

In [51]:
print("Score recall:", scores) 

Score recall: [0.52       0.54       0.52333333 0.55333333 0.54849498]


In [53]:
print("Score accuracy:", scores) 

Score accuracy: [0.78755556 0.79733333 0.79911111 0.80266667 0.81866667]


In [55]:
print("Score precission:", scores) 

Score precission: [0.62151394 0.64285714 0.65416667 0.65354331 0.70386266]


In [None]:
X_train.shape

In [56]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'penalty': ['l1', 'l2']
}

# Create logistic regression model
logreg_model = LogisticRegression(max_iter=100)

# Use GridSearchCV with cross-validation
grid_search = GridSearchCV(logreg_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model
best_logreg_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


90 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [63]:
grid_search.best_params_  

{'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

In [67]:
best_logreg =  LogisticRegression(**grid_search.best_params_)

best_logreg.fit(X_train, y_train)

y_best_pred = best_logreg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_best_pred))
print("Precision:", precision_score(y_test, y_best_pred))
print("Recall : ", recall_score(y_test, y_best_pred))

Accuracy: 0.7853589196872779
Precision: 0.6096774193548387
Recall :  0.5108108108108108


In [68]:
confusion_matrix(y_test, y_best_pred)

array([[916, 121],
       [181, 189]], dtype=int64)

In [74]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'penalty': ['l1', 'l2']
}
 # Create logistic regression model
logreg_model = LogisticRegression(max_iter=100)

# Use GridSearchCV with cross-validation
grid_search = GridSearchCV(logreg_model, param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

 # Get the best model
best_logreg_model = grid_search.best_estimator_

# # Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


90 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [75]:
grid_search.best_params_

{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

In [76]:
best_logreg_recall =  LogisticRegression(**grid_search.best_params_)

best_logreg_recall.fit(X_train, y_train)

y_best_pred_recall = best_logreg_recall.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_best_pred_recall))
print("Precision:", precision_score(y_test, y_best_pred_recall))
print("Recall : ", recall_score(y_test, y_best_pred_recall))

Accuracy: 0.7853589196872779
Precision: 0.6096774193548387
Recall :  0.5108108108108108


**Random forest classifier**

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
randForest = RandomForestClassifier()

randForest.fit(X_train, y_train)

y_randFor_pred = randForest.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_randFor_pred))
print("Precision:", precision_score(y_test, y_randFor_pred))
print("Recall : ", recall_score(y_test, y_randFor_pred))

Accuracy: 0.7775408670931059
Precision: 0.5934426229508196
Recall :  0.4891891891891892


In [81]:
randFor = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}


grid_searchForest = GridSearchCV(randFor, param_grid=param_grid, cv=3)

grid_searchForest.fit(X_train, y_train)

In [82]:
grid_searchForest.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 50}

In [83]:
RF = RandomForestClassifier(**grid_searchForest.best_params_)

RF.fit(X_train, y_train)

y_RF_pred = RF.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_RF_pred))
print("Precision:", precision_score(y_test, y_RF_pred))
print("Recall : ", recall_score(y_test, y_RF_pred))


Accuracy: 0.7860696517412935
Precision: 0.6210526315789474
Recall :  0.4783783783783784


 - By default accuracy scoring was improved only, so for recall algoritm performs worse than before (but time of gridsearchcv is too long to change it and adjust params again... and anyway resampling od data would bring bigger effect than parameters tuning) 

## Managing imbalance in the dataset

 - The imbalance should be checked only on train set - where atributes and target variable are in the same data frame (so concat X_train and y_train - atributes are already scaled)
 

In [86]:
train = pd.concat([X_train, y_train], axis=1)
train.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,Churn_Yes
0,1.0,0.0,0.567581,0.006514,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,0.0,0.309859,0.854364,0.28286,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,True
2,0.0,0.15493,0.0798,0.034168,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,False
3,0.0,0.84507,0.798504,0.706945,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,False
4,0.0,0.197183,0.804988,0.162423,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,True


In [88]:
train['Churn_Yes'].value_counts(dropna=False)

Churn_Yes
False    4126
True     1499
Name: count, dtype: int64

In [89]:
no_churn = train[train['Churn_Yes']==False]
yes_churn = train[train['Churn_Yes']==True]

**Upsampling of 'True' values (OVERSAMLING)**

In [93]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [94]:
# oversample minority
yes_churn_up = resample(yes_churn, #<- sample from here
                        replace=True, #<- we need replacement, since we don't have enough data otherwise
                        n_samples = len(no_churn),#<- make both sets the same size
                        random_state=0)

In [95]:
yes_churn_up.shape

(4126, 23)

In [98]:
train_up = pd.concat([no_churn, yes_churn_up], axis=0)
train_up.shape

(8252, 23)

In [99]:
train_up['Churn_Yes'].value_counts()

Churn_Yes
False    4126
True     4126
Name: count, dtype: int64

In [101]:
X_train_up = train_up.drop(['Churn_Yes'], axis=1).copy()
y_train_up = train_up['Churn_Yes'].copy()

# Data is already scaled and splitted

In [103]:
LR_up = LogisticRegression()

LR_up.fit(X_train_up, y_train_up)

y_pred_up = LR_up.predict(X_test)

model_evaluation(X_test, y_test, y_pred_up)

Unnamed: 0,precission,accuracy,recall
0,0.518,0.752,0.8


**Downsampling of 'No' values (DOWNSMLING)**

In [96]:
no_churn_down = resample(no_churn, replace=False, n_samples=len(yes_churn), random_state=0)

In [97]:
no_churn_down.shape

(1499, 23)

In [104]:
train_down = pd.concat([yes_churn, no_churn_down], axis=0)

In [105]:
train_down['Churn_Yes'].value_counts()

Churn_Yes
True     1499
False    1499
Name: count, dtype: int64

In [106]:
X_train_down = train_down.drop('Churn_Yes', axis=1).copy()
y_train_down = train_down['Churn_Yes'].copy()


In [107]:
LR_dw = LogisticRegression()

LR_dw.fit(X_train_down, y_train_down)

y_pred_dw = LR_dw.predict(X_test)

model_evaluation(X_test, y_test, y_pred_dw)

Unnamed: 0,precission,accuracy,recall
0,0.516,0.751,0.811
