In [72]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTENC
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [36]:
os.getcwd()

'c:\\Users\\ankita\\Desktop\\Personal_Project\\Customer_churn_analytics\\src\\research'

In [37]:

churn_df = pd.read_csv("./../../artifacts/customer_churn.csv")

#### Preprocessing

In [38]:
train, test = train_test_split(churn_df, test_size=0.2, shuffle=False)

In [39]:
print(train.shape)
print(test.shape)

(5634, 21)
(1409, 21)


## Preprocessing on the train set

In [40]:
train.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5634.0,5634.0,5634.0
mean,0.160454,32.296592,64.867483
std,0.36706,24.588599,30.0996
min,0.0,0.0,18.25
25%,0.0,9.0,35.6125
50%,0.0,29.0,70.425
75%,0.0,55.0,89.9
max,1.0,72.0,118.75


In [41]:
train.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [42]:
def correct_incorrect_dtypes(X):
    if X['SeniorCitizen'].dtype in ['int64','float64']:
        X['SeniorCitizen']= X['SeniorCitizen'].astype('object')
        X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors = 'coerce')
    return X
        

In [43]:
train_df = correct_incorrect_dtypes(train)
test_df = correct_incorrect_dtypes(test)

In [44]:
train_df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        9
Churn               0
dtype: int64

### Special Case

In [45]:
train_df[['tenure', 'TotalCharges']][train_df['tenure'] == 0.0]


Unnamed: 0,tenure,TotalCharges
488,0,
753,0,
936,0,
1082,0,
1340,0,
3331,0,
3826,0,
4380,0,
5218,0,


* Since there are 9 customers that only had 0 month tenure and totalCharges is null. I am assumming that the customers have signed a contract but possibly canceled or left before their first billing cycle, which tells us that their TotalCharges is null.
* Either I can remove those instances but again if the unseen data has such instances that will be a problem.
* Instead of having tenure as 0 we can modify it to 0.1 this ensures the model doesnt treat them as a completely different group but still recognises them as new customer.
* Since they are not included in the billing cycle replacing the null value to 0 would be ok.

In [60]:
def impute_total_charges_for_zero_tenure(df):
    if df['tenure'][df['tenure'] == 0.0].count() > 0:
        df['tenure'][df['tenure'] == 0.0] = 0.1
        df['TotalCharges']=df['TotalCharges'].fillna(0.0)
    else:
        print("There are no tenure that has 0 months")
    return df

def categorize_dtypes(train_df):
    numerical_columns = train_df.select_dtypes(['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(['object','category']).columns[1:]
    return numerical_columns,categorical_columns

def imputation(X,numerical_columns,categorical_columns):
    median = X[numerical_columns].median()
    mode = X[categorical_columns].mode()
    X[numerical_columns]= X[numerical_columns].fillna(median)
    X[categorical_columns] =X[categorical_columns].fillna(mode[0])
    return X, median, mode

def apply_imputation(test_df,numerical_columns,categorical_columns, median, mode):
    test_df[numerical_columns]= test_df[numerical_columns].fillna(median)
    test_df[categorical_columns] =test_df[categorical_columns].fillna(mode[0])
    return test_df

In [55]:
train_df = impute_total_charges_for_zero_tenure(train_df)
test_df = impute_total_charges_for_zero_tenure(test_df)

There are no tenure that has 0 months
There are no tenure that has 0 months


In [49]:
test_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [61]:
numerical_columns_, categorical_columns_ = categorize_dtypes(train_df)


In [63]:
numertical_indices =  [train_df.columns.get_loc(col)  for col in numerical_columns_]
categorical_indices = [train_df.columns.get_loc(col) for col in categorical_columns_]

In [68]:
categorical_indices[:-1]

[1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [None]:
X_train, y_train = train_df.drop(columns = ['Churn'], axis = 1), train_df['Churn']

In [69]:
smote_nc = SMOTENC(random_state=42, categorical_features=categorical_indices[:-1])
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)


In [73]:
print(f"after smoteNC: {Counter(y_train_resampled)}")

after smoteNC: Counter({0: 4146, 1: 4146})


In [76]:
train_df_resampled=pd.concat([X_train_resampled,y_train_resampled], axis = 1)

In [78]:
def feature_scaling_encoding(train_df,test_df, numerical_columns_, categorical_columns_):
    
    standard_scl = StandardScaler()
    train_df[numerical_columns_] = standard_scl.fit_transform(train_df[numerical_columns_])
    test_df[numerical_columns_] = standard_scl.transform(test_df[numerical_columns_])
        
    for col in categorical_columns_:
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
    return train_df, test_df

In [80]:
scaled_encoded_train_df, scaled_encoded_test_df = feature_scaling_encoding(train_df, test_df, numerical_columns_, categorical_columns_)

In [82]:
scaled_encoded_train_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [83]:
X_train, y_train = scaled_encoded_train_df.drop(['customerID','Churn'], axis = 1), scaled_encoded_train_df['Churn']
X_test, y_test = scaled_encoded_test_df.drop(['customerID','Churn'], axis = 1), scaled_encoded_test_df['Churn']

### train and val split

In [86]:
train_X, val_X, train_y, val_y = train_test_split(X_train,y_train, test_size=0.2, stratify=y_train)

In [88]:
print(train_X.shape)
print(train_y.shape)
print(val_X.shape)
print(val_y.shape)

(4507, 19)
(4507,)
(1127, 19)
(1127,)


In [90]:
# Check Final Class Distribution
print("Final Train Class Distribution:", Counter(train_y))  # Should still be balanced
print("Final Validation Class Distribution:", Counter(val_y))

Final Train Class Distribution: Counter({0: 3317, 1: 1190})
Final Validation Class Distribution: Counter({0: 829, 1: 298})


### Create and evalute metric after the training model

In [91]:
def evalute_model(true,predicted):
    accuracy = accuracy_score(true, predicted)
    cf_mx = confusion_matrix(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    auc_roc = roc_auc_score(true, predicted)
    return accuracy,cf_mx,precision,recall,auc_roc
    

In [92]:
models = {
    "logistic_regression" : LogisticRegression(),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(),
    "ada_boost": AdaBoostClassifier(),
    "gradient_boosting": GradientBoostingClassifier(),
    "xgboost": XGBClassifier()
}


In [93]:
model_list = []
model_mertic = {}
precision, recall = [], []
for key,value in models.items():
    model = models[key]
    model.fit(train_X, train_y)
    y_train_pred = model.predict(train_X)
    y_val_pred = model.predict(val_X)
    
    model_train_accuracy, model_train_cf_mx, model_train_precision, model_train_recall, model_train_auc = evalute_model(train_y.values, y_train_pred)
    model_val_accuracy,model_val_cf_mx, model_val_precision, model_val_recall, model_val_auc = evalute_model(val_y.values, y_val_pred)
    model_list.append(model)
    model_mertic[key] = {
        'accuracy': model_val_accuracy, 
        'confusion_matrix':model_val_cf_mx,
        'precision':model_val_precision,
        'recall': model_val_recall,
        'auc': model_val_auc
    }

In [95]:
### Before SMOTE NC

In [25]:
pd.DataFrame(model_mertic).T

Unnamed: 0,accuracy,confusion_matrix,precision,recall,auc
logistic_regression,0.793256,"[[735, 81], [152, 159]]",0.6625,0.511254,0.705995
decision_tree,0.723159,"[[661, 155], [157, 154]]",0.498382,0.495177,0.652613
random_forest,0.789707,"[[741, 75], [162, 149]]",0.665179,0.4791,0.693594
ada_boost,0.795918,"[[736, 80], [150, 161]]",0.66805,0.517685,0.709823
gradient_boosting,0.803017,"[[750, 66], [156, 155]]",0.701357,0.498392,0.708755
xgboost,0.780834,"[[714, 102], [145, 166]]",0.619403,0.533762,0.704381


### After SmoteNC

In [94]:
pd.DataFrame(model_mertic).T

Unnamed: 0,accuracy,confusion_matrix,precision,recall,auc
logistic_regression,0.803904,"[[735, 94], [127, 171]]",0.645283,0.573826,0.730218
decision_tree,0.71961,"[[663, 166], [150, 148]]",0.471338,0.496644,0.648202
random_forest,0.776398,"[[721, 108], [144, 154]]",0.587786,0.516779,0.693251
ada_boost,0.801242,"[[734, 95], [129, 169]]",0.640152,0.567114,0.726259
gradient_boosting,0.813665,"[[746, 83], [127, 171]]",0.673228,0.573826,0.736852
xgboost,0.774623,"[[718, 111], [143, 155]]",0.582707,0.520134,0.693119
