In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier


In [2]:
os.getcwd()

'c:\\Users\\ankita\\Desktop\\Personal_Project\\Customer_churn_analytics\\src\\research'

In [3]:

churn_df = pd.read_csv("./../../artifacts/customer_churn.csv")

#### Preprocessing

In [4]:
train, test = train_test_split(churn_df, test_size=0.2, shuffle=False)

In [5]:
print(train.shape)
print(test.shape)

(5634, 21)
(1409, 21)


## Preprocessing on the train set

In [6]:
train.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5634.0,5634.0,5634.0
mean,0.160454,32.296592,64.867483
std,0.36706,24.588599,30.0996
min,0.0,0.0,18.25
25%,0.0,9.0,35.6125
50%,0.0,29.0,70.425
75%,0.0,55.0,89.9
max,1.0,72.0,118.75


In [7]:
train.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
def correct_incorrect_dtypes(X):
    if X['SeniorCitizen'].dtype in ['int64','float64']:
        X['SeniorCitizen']= X['SeniorCitizen'].astype('object')
        X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors = 'coerce')
    return X
        

In [9]:
train_df = correct_incorrect_dtypes(train)
test_df = correct_incorrect_dtypes(test)

In [10]:
train_df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        9
Churn               0
dtype: int64

### Special Case

In [11]:
train_df[['tenure', 'TotalCharges']][train_df['tenure'] == 0.0]


Unnamed: 0,tenure,TotalCharges
488,0,
753,0,
936,0,
1082,0,
1340,0,
3331,0,
3826,0,
4380,0,
5218,0,


* Since there are 9 customers that only had 0 month tenure and totalCharges is null. I am assumming that the customers have signed a contract but possibly canceled or left before their first billing cycle, which tells us that their TotalCharges is null.
* Either I can remove those instances but again if the unseen data has such instances that will be a problem.
* Instead of having tenure as 0 we can modify it to 0.1 this ensures the model doesnt treat them as a completely different group but still recognises them as new customer.
* Since they are not included in the billing cycle replacing the null value to 0 would be ok.

In [12]:
def impute_total_charges_for_zero_tenure(df):
    if df['tenure'][df['tenure'] == 0.0].count() > 0:
        df['tenure'][df['tenure'] == 0.0] = 0.1
        df['TotalCharges']=df['TotalCharges'].fillna(0.0)
    else:
        print("There are no tenure that has 0 months")
    return df

def categorize_dtypes(train_df):
    numerical_columns = train_df.select_dtypes(['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(['object','category']).columns[1:]
    return numerical_columns,categorical_columns

def imputation(X,numerical_columns,categorical_columns):
    median = X[numerical_columns].median()
    mode = X[categorical_columns].mode()
    X[numerical_columns]= X[numerical_columns].fillna(median)
    X[categorical_columns] =X[categorical_columns].fillna(mode[0])
    return X, median, mode

def apply_imputation(test_df,numerical_columns,categorical_columns, median, mode):
    test_df[numerical_columns]= test_df[numerical_columns].fillna(median)
    test_df[categorical_columns] =test_df[categorical_columns].fillna(mode[0])
    return test_df

In [13]:
train_df = impute_total_charges_for_zero_tenure(train_df)
test_df = impute_total_charges_for_zero_tenure(test_df)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['tenure'][df['tenure'] == 0.0] = 0.1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tenure'][df['tenure

In [14]:
test_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [15]:
numerical_columns_, categorical_columns_ = categorize_dtypes(train_df)

In [16]:
def feature_scaling_encoding(train_df,test_df, numerical_columns_, categorical_columns_):
    
    standard_scl = StandardScaler()
    train_df[numerical_columns_] = standard_scl.fit_transform(train_df[numerical_columns_])
    test_df[numerical_columns_] = standard_scl.transform(test_df[numerical_columns_])
        
    for col in categorical_columns_:
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
    return train_df, test_df

In [17]:
scaled_encoded_train_df, scaled_encoded_test_df = feature_scaling_encoding(train_df, test_df, numerical_columns_, categorical_columns_)

In [18]:
scaled_encoded_test_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [19]:
X_train, y_train = scaled_encoded_train_df.drop(['customerID','Churn'], axis = 1), scaled_encoded_train_df['Churn']
X_test, y_test = scaled_encoded_test_df.drop(['customerID','Churn'], axis = 1), scaled_encoded_test_df['Churn']

### train and val split

In [28]:
train_X, val_X, train_y, val_y = train_test_split(X_train,y_train, test_size=0.2, shuffle = True)

In [29]:
print(train_X.shape)
print(train_y.shape)
print(val_X.shape)
print(val_y.shape)

(4507, 19)
(4507,)
(1127, 19)
(1127,)


### Create and evalute metric after the training model

In [30]:
def evalute_model(true,predicted):
    accuracy = accuracy_score(true, predicted)
    cf_mx = confusion_matrix(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    auc_roc = roc_auc_score(true, predicted)
    return accuracy,cf_mx,precision,recall,auc_roc
    

In [31]:
models = {
    "logistic_regression" : LogisticRegression(),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(),
    "ada_boost": AdaBoostClassifier(),
    "gradient_boosting": GradientBoostingClassifier(),
    "xgboost": XGBClassifier()
}


In [32]:
model_list = []
model_mertic = {}
precision, recall = [], []
for key,value in models.items():
    model = models[key]
    model.fit(train_X, train_y)
    y_train_pred = model.predict(train_X)
    y_val_pred = model.predict(val_X)
    
    model_train_accuracy, model_train_cf_mx, model_train_precision, model_train_recall, model_train_auc = evalute_model(train_y.values, y_train_pred)
    model_val_accuracy,model_val_cf_mx, model_val_precision, model_val_recall, model_val_auc = evalute_model(val_y.values, y_val_pred)
    model_list.append(model)
    model_mertic[key] = {
        'accuracy': model_val_accuracy, 
        'confusion_matrix':model_val_cf_mx,
        'precision':model_val_precision,
        'recall': model_val_recall,
        'auc': model_val_auc
    }

In [33]:
pd.DataFrame(model_mertic).T

Unnamed: 0,accuracy,confusion_matrix,precision,recall,auc
logistic_regression,0.792369,"[[737, 84], [150, 156]]",0.65,0.509804,0.703745
decision_tree,0.732032,"[[672, 149], [153, 153]]",0.506623,0.5,0.659257
random_forest,0.79858,"[[748, 73], [154, 152]]",0.675556,0.496732,0.703908
ada_boost,0.800355,"[[744, 77], [148, 158]]",0.67234,0.51634,0.711276
gradient_boosting,0.808341,"[[751, 70], [146, 160]]",0.695652,0.522876,0.718807
xgboost,0.787933,"[[727, 94], [145, 161]]",0.631373,0.526144,0.705825
