In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score, r2_score, max_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import cross_val_score,cross_validate
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn import set_config

In [2]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


This seems like a pretty complete dataset with valuable information we can se for various analysis:
- Clustering for customer segmentation, understanding the different types of user
- Understanding Customer Lifetime Value (CLTV)
- Describing Churn, it actually looks like there is already some analysis of this available in a column labeled "Churn"
- Potentially looking at conversion for things like additional services such as: Movies, protection, phone service, etc..

Let's see what some of these values look like in the dataset:

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


Let's attempt churn prediction first, since this is already clearly set up for this analysis. I will compare logistic regression and decision tree classifier algorithms to find the better model. For first models, I will try to use the numerical features that are available as is, before modifying features for improved accuracy.

In [5]:
#df['Churn'] = df['Churn'].str.replace("No","0")
#df['Churn'] = df['Churn'].str.replace("Yes","1")
#df.head()

df['Churn'] = df['Churn'].replace("No",0)
df['Churn'] = df['Churn'].replace("Yes",1)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [6]:
df_1 = df['Churn'].astype(np.int64)
#df_1.head()
df['Churn'] = df_1
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
df.corr()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn
SeniorCitizen,1.0,0.016567,0.220173,0.150889
tenure,0.016567,1.0,0.2479,-0.352229
MonthlyCharges,0.220173,0.2479,1.0,0.193356
Churn,0.150889,-0.352229,0.193356,1.0


While there seems to be minimal correlation with the numerical features already present, I can establish a baseline model:

In [9]:
df_ch = df.groupby('Churn').count()
#df_ch = df['Churn'].groupby()
df_ch.head()

Unnamed: 0_level_0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174
1,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869


In [10]:
train_cols = ['SeniorCitizen','tenure','MonthlyCharges']
target = ['Churn']

X = df[train_cols]
y = df[target]

# There is a very high ratio of no-churn to churn, so we should use the 'stratify' argument when sampling the data to try to get a more representative sample
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, stratify=y, random_state=42)

In [11]:
# Create the model:
LogReg_model = LogisticRegression(random_state=0)
LogReg_model.fit(X_train,y_train.values.ravel()) 

LogisticRegression(random_state=0)

In [12]:
# Train set metrics:
y_true_train = y_train
y_preds_train = LogReg_model.predict(X_train)

accuracy = accuracy_score(y_true_train,y_preds_train)
precision = precision_score(y_true_train,y_preds_train)
recall = recall_score(y_true_train,y_preds_train)

print("metrics on the train set: accuracy",accuracy,'precision',precision,'recall',recall)

metrics on the train set: accuracy 0.7926725362113036 precision 0.6531531531531531 recall 0.4657387580299786


In [13]:
# Test set metrics:
y_true = y_test
y_preds = LogReg_model.predict(X_test)

accuracy = accuracy_score(y_true,y_preds)
precision = precision_score(y_true,y_preds)
recall = recall_score(y_true,y_preds)

print('metrics on the train set: accuracy',accuracy,'precision',precision,'recall',recall)

metrics on the train set: accuracy 0.7884724588302101 precision 0.6448170731707317 recall 0.45240641711229945


### It looks like the initial Logistic Regression model has about 79% accuracy, and precision and recall could also be improved upon.
### Let's see how encoding / scaling the other features could help:

In [14]:
# Let's encode:
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
df['Partner'] = df['Partner'].map({'No': 0, 'Yes': 1})
df['Dependents'] = df['Dependents'].map({'No': 0, 'Yes': 1})
df['PhoneService'] = df['PhoneService'].map({'No': 0, 'Yes': 1})
df['InternetService'] = df['InternetService'].map({'No': 0, 'DSL': 1, 'Fiber optic': 1, 'Cable': 1})
df['OnlineSecurity'] = df['OnlineSecurity'].map({'No': 0, 'No internet service': 0, 'Yes': 1})
df['OnlineBackup'] = df['OnlineBackup'].map({'No': 0, 'No internet service': 0, 'Yes': 1})
df['DeviceProtection'] = df['DeviceProtection'].map({'No': 0, 'No internet service': 0, 'Yes': 1})
df['TechSupport'] = df['TechSupport'].map({'No': 0, 'No internet service': 0, 'Yes': 1})

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,1,0,1,0,No phone service,1,0,...,0,0,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,No,1,1,...,1,0,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,No,1,1,...,0,0,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,No phone service,1,1,...,1,1,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,1,0,0,0,2,1,No,1,0,...,0,0,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   int64  
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   int64  
 9   OnlineSecurity    7043 non-null   int64  
 10  OnlineBackup      7043 non-null   int64  
 11  DeviceProtection  7043 non-null   int64  
 12  TechSupport       7043 non-null   int64  
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [16]:
## Try converting some cols to int64:
cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']
df[cols] = df[cols].applymap(np.int64)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   int64  
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   int64  
 9   OnlineSecurity    7043 non-null   int64  
 10  OnlineBackup      7043 non-null   int64  
 11  DeviceProtection  7043 non-null   int64  
 12  TechSupport       7043 non-null   int64  
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [17]:
df['MonthlyCharges'] = df['MonthlyCharges'].round()

df_test = df['MonthlyCharges'].astype(np.int64)
df['MonthlyCharges'] = df_test

#Scale MonthlyCharges Column:
df['MonthlyCharges'] = (df['MonthlyCharges'] - df['MonthlyCharges'].mean())/df['MonthlyCharges'].std()

In [18]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,1,0,1,0,No phone service,1,0,...,0,0,No,No,Month-to-month,Yes,Electronic check,-1.155097,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,No,1,1,...,1,0,No,No,One year,No,Mailed check,-0.257838,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,No,1,1,...,0,0,No,No,Month-to-month,Yes,Mailed check,-0.357533,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,No phone service,1,1,...,1,1,No,No,One year,No,Bank transfer (automatic),-0.756315,1840.75,0
4,9237-HQITU,1,0,0,0,2,1,No,1,0,...,0,0,No,No,Month-to-month,Yes,Electronic check,0.207407,151.65,1


In [19]:
#Let's try re-train, test, fit model with these new features:
Encoded_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Dependents', 'gender', 'Partner']
charges_cols = ['MonthlyCharges']
train_cols = charges_cols+Encoded_cols+['SeniorCitizen','tenure']
target = ['Churn']

X = df[train_cols]
y = df[target]

# There is a very high ratio of no-churn to churn, so we should use the 'stratify' argument when sampling the data to try to get a more representative sample
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [20]:
# Create the model:
LogReg_model = LogisticRegression(random_state=0)
#LogReg_model.fit(X_train,y_train) 
LogReg_model.fit(X_train,y_train.values.ravel()) 

LogisticRegression(random_state=0)

In [21]:
# Train set metrics:
y_true_train = y_train
y_preds_train = LogReg_model.predict(X_train)

accuracy = accuracy_score(y_true_train,y_preds_train)
precision = precision_score(y_true_train,y_preds_train)
recall = recall_score(y_true_train,y_preds_train)

print("metrics on the train set: accuracy",accuracy,'precision',precision,'recall',recall)

metrics on the train set: accuracy 0.7963488843813388 precision 0.650197628458498 recall 0.5030581039755352


### It looks like 79% is close to the accuracy limit with these features, let's try to add in the harder to get features:

In [22]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors = 'coerce')

In [29]:
# Now scale 'TotalCharges':
df['TotalCharges'] = (df['TotalCharges'] - df['TotalCharges'].mean())/df['TotalCharges'].std()
# Scale tenure as well:
df['tenure'] = (df['tenure'] - df['tenure'].mean())/df['tenure'].std()
# Scale Internet Service:
df['InternetService'] = (df['InternetService'] - df['InternetService'].mean())/df['InternetService'].std()

# Finally encoding Paperless Billing as a numerical variable:
df['PaperlessBilling'] = df['PaperlessBilling'].map({'No': 0, 'Yes': 1})

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   int64  
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   int64  
 4   Dependents        7032 non-null   int64  
 5   tenure            7032 non-null   float64
 6   PhoneService      7032 non-null   int64  
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   float64
 9   OnlineSecurity    7032 non-null   int64  
 10  OnlineBackup      7032 non-null   int64  
 11  DeviceProtection  7032 non-null   int64  
 12  TechSupport       7032 non-null   int64  
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   int64  


### It looks like there were actually some missing values hiding as strings in the 'TotalCharges' column. I am going to just drop those rows (only 11 rows) and incorporate 'TotalCharges' into the model:

In [31]:
df = df.dropna(how='any',axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   int64  
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   int64  
 4   Dependents        7032 non-null   int64  
 5   tenure            7032 non-null   float64
 6   PhoneService      7032 non-null   int64  
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   float64
 9   OnlineSecurity    7032 non-null   int64  
 10  OnlineBackup      7032 non-null   int64  
 11  DeviceProtection  7032 non-null   int64  
 12  TechSupport       7032 non-null   int64  
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   int64  


In [41]:
#Re-training the model (round 3) with TotalCharges this time. I will use pipelines the next time I work on this..:
Encoded_cols = ['gender', 'PaperlessBilling', 'Dependents', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Partner']
charges_cols = ['TotalCharges', 'MonthlyCharges']  ## added 'TotalCharges'

train_cols = charges_cols + ['tenure', 'SeniorCitizen'] + Encoded_cols
target = ['Churn']

X = df[train_cols]
y = df[target]

# There is a very high ratio of no-churn to churn, so we should use the 'stratify' argument when sampling the data to try to get a more representative sample
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [42]:
# Running round 3 with new train variables:
LogReg_model = LogisticRegression(random_state=0)
LogReg_model.fit(X_train,y_train.values.ravel()) 

LogisticRegression(random_state=0)

In [43]:
# Grabbing metrics for round 3 train predictions: 
# Train set metrics:
y_true_train = y_train
y_preds_train = LogReg_model.predict(X_train)

accuracy = accuracy_score(y_true_train,y_preds_train)
precision = precision_score(y_true_train,y_preds_train)
recall = recall_score(y_true_train,y_preds_train)

print("metrics on the train set: accuracy",accuracy,'precision',precision,'recall',recall)

metrics on the train set: accuracy 0.8000812677773262 precision 0.6519699812382739 recall 0.5313455657492355


In [44]:
# Test set metrics:
y_true = y_test
y_preds = LogReg_model.predict(X_test)

accuracy = accuracy_score(y_true,y_preds)
precision = precision_score(y_true,y_preds)
recall = recall_score(y_true,y_preds)

print('metrics on the test set: accuracy',accuracy,'precision',precision,'recall',recall)

metrics on the test set: accuracy 0.7971563981042654 precision 0.6455142231947484 recall 0.5258467023172906


## Looks like I am getting about ~80% accuracy for this churn prediction model. I would say this was about the 'kitchen sink approach', but there are still more variables I can try throwing in and maybe try to improve precision and recall.