In [56]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

In [14]:
#read in data set  "train"

churn_data = pd.read_csv('Churn_Data.csv')
churn_data.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,...,10,Sci-Fi,2.176498,4,Male,3,No,No,CB6SXPNVZA,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,...,18,Action,3.478632,8,Male,23,No,Yes,S7R2G87O09,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.39516,...,23,Fantasy,4.238824,6,Male,1,Yes,Yes,EASDC20BDT,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,...,30,Drama,4.276013,2,Male,24,Yes,Yes,NPF69NT69N,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,...,20,Comedy,3.61617,4,Female,0,No,No,4LGYPK7VOL,0


In [18]:
# Check data types
churn_data.dtypes

AccountAge                    int64
MonthlyCharges              float64
TotalCharges                float64
SubscriptionType             object
PaymentMethod                object
PaperlessBilling             object
ContentType                  object
MultiDeviceAccess            object
DeviceRegistered             object
ViewingHoursPerWeek         float64
AverageViewingDuration      float64
ContentDownloadsPerMonth      int64
GenrePreference              object
UserRating                  float64
SupportTicketsPerMonth        int64
Gender                       object
WatchlistSize                 int64
ParentalControl              object
SubtitlesEnabled             object
CustomerID                   object
Churn                         int64
dtype: object

In [22]:
# Check for missing values
missing_values = churn_data.isnull().sum()
missing_values

AccountAge                  0
MonthlyCharges              0
TotalCharges                0
SubscriptionType            0
PaymentMethod               0
PaperlessBilling            0
ContentType                 0
MultiDeviceAccess           0
DeviceRegistered            0
ViewingHoursPerWeek         0
AverageViewingDuration      0
ContentDownloadsPerMonth    0
GenrePreference             0
UserRating                  0
SupportTicketsPerMonth      0
Gender                      0
WatchlistSize               0
ParentalControl             0
SubtitlesEnabled            0
CustomerID                  0
Churn                       0
dtype: int64

In [40]:
# Dropped CustomerID and encoded objects to boolean
categorical_features = churn_data.select_dtypes(include = ['object']).columns
churn_data_encoded = churn_data.drop(columns=["CustomerID"])
churn_data_encoded = pd.get_dummies(churn_data_encoded, columns=categorical_features.drop('CustomerID'), drop_first=True)
churn_data_encoded.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,Churn,...,DeviceRegistered_Mobile,DeviceRegistered_TV,DeviceRegistered_Tablet,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Male,ParentalControl_Yes,SubtitlesEnabled_Yes
0,20,11.055215,221.104302,36.758104,63.531377,10,2.176498,4,3,0,...,True,False,False,False,False,False,True,True,False,False
1,57,5.175208,294.986882,32.450568,25.725595,18,3.478632,8,23,0,...,False,False,True,False,False,False,False,True,False,True
2,73,12.106657,883.785952,7.39516,57.364061,23,4.238824,6,1,0,...,False,False,False,False,False,True,False,True,True,True
3,32,7.263743,232.439774,27.960389,131.537507,30,4.276013,2,24,0,...,False,False,True,False,True,False,False,True,True,True
4,57,16.953078,966.325422,20.083397,45.356653,20,3.61617,4,0,0,...,False,True,False,True,False,False,False,False,False,False


In [44]:
# Check data types again
churn_data_encoded.dtypes

AccountAge                          int64
MonthlyCharges                    float64
TotalCharges                      float64
ViewingHoursPerWeek               float64
AverageViewingDuration            float64
ContentDownloadsPerMonth            int64
UserRating                        float64
SupportTicketsPerMonth              int64
WatchlistSize                       int64
Churn                               int64
SubscriptionType_Premium             bool
SubscriptionType_Standard            bool
PaymentMethod_Credit card            bool
PaymentMethod_Electronic check       bool
PaymentMethod_Mailed check           bool
PaperlessBilling_Yes                 bool
ContentType_Movies                   bool
ContentType_TV Shows                 bool
MultiDeviceAccess_Yes                bool
DeviceRegistered_Mobile              bool
DeviceRegistered_TV                  bool
DeviceRegistered_Tablet              bool
GenrePreference_Comedy               bool
GenrePreference_Drama             

In [52]:
# Scale numerical columns
numerical_features = churn_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_features.drop("Churn")

scaler = StandardScaler()
churn_data_encoded[numerical_columns] = scaler.fit_transform(churn_data_encoded[numerical_columns])
churn_data_encoded

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,Churn,...,DeviceRegistered_Mobile,DeviceRegistered_TV,DeviceRegistered_Tablet,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Male,ParentalControl_Yes,SubtitlesEnabled_Yes
0,-1.169131,-0.331703,-1.012550,1.445777,-0.568906,-1.005712,-0.715179,-0.175519,-1.253786,0,...,True,False,False,False,False,False,True,True,False,False
1,-0.089945,-1.690423,-0.871303,1.062671,-1.317459,-0.450971,0.411960,1.216976,1.526687,0,...,False,False,True,False,False,False,False,True,False,True
2,0.376731,-0.088741,0.254353,-1.165718,-0.691019,-0.104258,1.069988,0.520728,-1.531833,0,...,False,False,False,False,False,True,False,True,True,True
3,-0.819125,-1.207816,-0.990879,0.663322,0.777613,0.381141,1.102179,-0.871766,1.665711,0,...,False,False,True,False,True,False,False,True,True,True
4,-0.089945,1.031143,0.412150,-0.037246,-0.928765,-0.312285,0.531014,-0.175519,-1.670857,0,...,False,True,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243782,0.493400,-0.658746,-0.016190,-0.622520,-0.235555,1.559965,0.601371,-1.219890,-0.558668,0,...,False,False,False,False,False,False,True,True,True,False
243783,1.660088,0.129070,1.483585,0.396764,-0.642422,0.727854,-1.344265,-0.175519,1.109616,0,...,False,True,False,True,False,False,False,True,False,False
243784,1.543419,0.467666,1.700349,-0.878129,1.661653,1.351938,0.873836,0.520728,0.136451,0,...,False,True,False,False,False,False,False,True,True,True
243785,-1.548305,1.305539,-1.192488,0.886236,1.210218,0.797197,-0.750417,0.868852,-0.975739,0,...,False,True,False,False,False,True,False,False,False,True


In [64]:
# Split training and test sets (X, y)
X = churn_data_encoded.drop(columns=["Churn"])
y = churn_data_encoded['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [66]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((182840, 28), (60947, 28), (182840,), (60947,))