## Model Training

In [168]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [169]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [170]:
df=pd.read_csv('data/data.csv')

In [171]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [172]:
df=df.drop('customerID',axis=1)

In [173]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')

In [174]:
df=df.dropna(subset=['TotalCharges'])

In [175]:
X=df.drop(columns=['Churn'],axis=1)

In [176]:
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [177]:
y=df['Churn']

In [178]:
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7032, dtype: object

In [179]:
#splitting train and test set
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [180]:
#encode target
from sklearn.preprocessing import   LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

In [181]:
num_features=['tenure','MonthlyCharges','SeniorCitizen']
cat_features=X.select_dtypes(include='object').columns.tolist()

In [182]:
cat_features

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [183]:
num_features

['tenure', 'MonthlyCharges', 'SeniorCitizen']

In [184]:
ordinal_features=['Contract']
nominal_features=[col for col in cat_features if col != 'Contract']


In [185]:
nominal_features

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling',
 'PaymentMethod']

In [186]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline

#function to create AvgChargesPerMonth
def avg_charges(Z):
    TotalCharges=Z['TotalCharges'].values
    Tenure=Z['tenure'].values
    return (TotalCharges/(Tenure + 1e-5)).reshape(-1,1)

avg_pipeline=Pipeline([
    ('avg',FunctionTransformer(avg_charges,validate=False)),
    ('scale',StandardScaler())
    ])

preprocessor=ColumnTransformer(
    transformers=[
        ('numeric',StandardScaler(),num_features),
        ('avg',avg_pipeline,['TotalCharges','tenure']),
        ('nominal',OneHotEncoder(drop='first'),nominal_features),
        ('ordinal',OrdinalEncoder(categories=[['Month-to-month','One year','Two year']]),ordinal_features)
        ]
)


In [187]:
clf=Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',LogisticRegression(max_iter=1000))
])

clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

In [189]:
print('Accuracy', accuracy_score(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy 0.7882018479033405
Confusion Matrix:
 [[917 116]
 [182 192]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.62      0.51      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



- The accuracy is quite low, and the target columns is imbalanced so we shouldn't consider Accuracy as metrics here to evaluate the performance of the model.

- When we see recall, precision and f1_score, the score is too low for class 1 i.e churned customers, so we need to improve our model.
