## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from Kaggle, train a machine learning model to predict customer churn.

In [1]:
import pandas as pd

In [27]:
# Training split. customerID is removed because it is not a feature for training.
train = pd.read_csv("./data/training_data.csv").drop(columns=["customerID"])

# Validation split, loaded without further changes.
val = pd.read_csv("./data/validation_data.csv")

In [3]:
from sklearn.preprocessing import LabelEncoder

# Standalone encoder instance. The per-column fitting loop further down rebinds
# `le` to a fresh LabelEncoder for every column, so this particular object is
# not one of the encoders stored in `column_mapper`.
le = LabelEncoder()

In [4]:
# le.fit(train['gender'])
# le.transform(train['gender'])

array([1, 1, 1, ..., 0, 0, 0])

In [28]:
# Columns that are label-encoded before modelling. 'Churn' is listed last and
# is later split off as the prediction target.
categorical_columns = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]

In [6]:
# Preview the first rows of the categorical columns only.
train[categorical_columns].head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Male,No,No,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),No
1,Male,Yes,No,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,No
2,Male,Yes,Yes,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,No
3,Male,No,No,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,No
4,Male,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),No


In [29]:
# Fit one LabelEncoder per categorical column on the training data and store
# each fitted encoder under its column name. `train` is not modified here;
# the actual encoding is applied later through the stored encoders.
column_mapper = {}

for column in categorical_columns:
    le = LabelEncoder().fit(train[column])  # fit() returns the encoder itself
    column_mapper[column] = le

In [8]:
# Display the column name -> encoder mapping built in the fitting loop.
column_mapper


{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder(),
 'Churn': LabelEncoder()}

In [9]:
# Display the training frame.
# NOTE(review): the saved output for this cell shows integer-encoded
# categorical columns, yet the in-place transform in the encoder-fitting cell
# is commented out — presumably stale output from an earlier run; re-run to confirm.
train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,0,0,5,1,0,0,0,0,2,2,2,2,0,0,0,75.15,392.65,0
1,1,0,1,0,66,1,2,0,0,0,0,2,0,2,0,0,2,63.85,4264.6,0
2,1,0,1,1,42,1,0,0,0,0,2,2,2,2,1,0,2,73.15,3088.25,0
3,1,0,0,0,19,1,2,0,0,0,0,0,2,2,0,0,3,69.60,1394.55,0
4,1,0,0,0,59,1,0,2,1,1,1,1,1,1,2,1,0,20.20,1192.3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,1,0,0,0,1,1,0,2,1,1,1,1,1,1,0,0,0,20.20,20.2,0
5278,1,0,1,0,2,1,0,1,0,0,2,0,0,0,0,0,2,76.40,151.8,1
5279,0,0,1,0,58,1,2,0,2,0,0,2,0,2,1,1,2,68.40,3972.25,0
5280,0,0,0,0,1,1,0,1,0,2,0,0,0,0,0,1,2,75.70,75.7,1


In [34]:
def pre_process_data(df, label_encoder_dict):
    """Return a model-ready copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Replace every cell equal to a single space (``" "``) with ``0`` and
         convert ``TotalCharges`` to a numeric column.
      2. Drop ``customerID`` when the column is present.
      3. Replace each column named in ``label_encoder_dict`` with the output
         of its encoder's ``transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain ``TotalCharges`` and every column named in
        ``label_encoder_dict``.
    label_encoder_dict : dict
        Maps column name -> fitted encoder object exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``TotalCharges`` and the encoded columns carry the
        dtype of their converted values.

    Raises
    ------
    KeyError
        If ``TotalCharges`` or a column named in ``label_encoder_dict`` is
        missing from ``df``.
    ValueError
        If ``TotalCharges`` holds a value ``pd.to_numeric`` cannot parse.
        Errors raised by an encoder's ``transform`` propagate unchanged.
    """
    # replace() without inplace returns a new frame, so `df` stays untouched.
    df_out = df.replace(" ", 0)

    # Assign with df_out[col] = ... rather than df_out.loc[:, col] = ...:
    # the .loc form writes into the existing column and can leave it as
    # object dtype, whereas plain assignment swaps in a new column that takes
    # the dtype of the converted values.
    df_out["TotalCharges"] = pd.to_numeric(df_out["TotalCharges"])

    # The identifier is dropped when present; errors="ignore" covers frames
    # from which it was already removed.
    df_out = df_out.drop(columns="customerID", errors="ignore")

    # Encoders read the original values from `df`, not the space-replaced copy.
    for column, le in label_encoder_dict.items():
        df_out[column] = le.transform(df[column])

    return df_out

In [43]:
# Run both splits through the same preprocessing with the same encoder mapping.
train_processed = pre_process_data(df=train, label_encoder_dict=column_mapper)
val_processed = pre_process_data(df=val, label_encoder_dict=column_mapper)

# Show the processed validation frame.
val_processed

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0,39,1,2,1,0,0,2,0,2,2,0,1,2,99.75,4036.00,0
1,1,0,0,0,63,1,2,1,0,2,2,0,2,2,2,1,0,104.75,6536.50,0
2,0,0,0,0,72,1,2,2,1,1,1,1,1,1,2,0,1,25.45,1866.45,0
3,1,0,0,0,1,1,0,2,1,1,1,1,1,1,0,0,3,20.05,20.05,0
4,1,0,0,1,52,1,2,0,2,2,0,2,2,2,2,1,3,85.15,4461.85,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,0,0,1,1,70,1,2,0,2,2,2,2,2,0,2,0,2,80.70,5617.95,0
757,0,0,1,0,62,1,2,1,0,0,2,0,2,2,0,1,2,101.35,6164.70,0
758,0,0,0,0,3,1,0,2,1,1,1,1,1,1,0,0,3,20.40,63.15,0
759,1,0,0,0,4,1,0,1,0,0,0,0,2,0,0,0,2,77.85,299.20,1


In [44]:
# Features: every processed column except the target.
x_train = train_processed.drop(columns=["Churn"])
# Target: the Churn column cast to int.
y_train = train_processed["Churn"].astype(int)

In [47]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# Fit on the training features and target; as the last expression of the
# cell, the fitted estimator is also displayed.
model.fit(X=x_train, y=y_train)

In [49]:
# Split the processed validation frame the same way as the training frame.
x_val = val_processed.drop(columns=["Churn"])
y_val = val_processed["Churn"].astype(int)

# Predicted class for each validation row, displayed as the cell output.
predictions = model.predict(x_val)
predictions

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,

In [52]:
from sklearn.metrics import accuracy_score
# Share of validation rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)
print(f"Classification accuracy: {round(accuracy, 3)}")



Classification accuracy: 0.834


In [53]:
# Tabulate the fitted coefficients with one column per feature name.
# NOTE(review): labels are taken from x_val, while the model was fitted on
# x_train — presumably both frames share the same column order; confirm.
pd.DataFrame(model.coef_, columns=x_val.columns)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,-0.057619,0.321891,0.029435,-0.247672,-0.068859,-0.505566,0.096381,0.072848,-0.299781,-0.130891,-0.149119,-0.259175,0.017043,0.032726,-0.751445,0.370455,-0.012495,0.016495,0.000422


In [54]:
import pickle

In [57]:
# Persist the fitted classifier (file opened in binary write mode).
with open("./models/churn_prediction_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Persist the column -> encoder mapping so that the same encoding can be
# re-applied wherever the model is loaded.
with open("./models/churn_prediction_label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(column_mapper, encoder_file)
    
    