In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

df_churn = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
df_churn.drop('customerID', axis=1, inplace=True, errors='ignore')

In [5]:
df_churn['TotalCharges'] = pd.to_numeric(df_churn['TotalCharges'], errors='coerce')
df_churn['TotalCharges'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_churn['TotalCharges'].fillna(0, inplace=True)


In [6]:
target = df_churn['Churn']
features = df_churn.drop('Churn', axis=1)

In [7]:
for column in features.columns:
    if features[column].dtype == 'object':
        le = LabelEncoder()
        features[column] = le.fit_transform(features[column].astype(str))

target = LabelEncoder().fit_transform(target)

In [8]:
scaler = StandardScaler()
features[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(features[['MonthlyCharges', 'TotalCharges']])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [10]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [11]:
lr.score(X_test, y_test)

0.8161816891412349

In [12]:
cross_val_score(lr, features, target, cv=3).mean()

np.float64(0.8030670685957909)

In [13]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)

In [15]:
svm.score(X_test, y_test)

0.7955997161107168

In [16]:
cross_val_score(svm, features, target, cv=3).mean()

np.float64(0.7936967873978332)

In [17]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [18]:
dt.score(X_test, y_test)

0.7331440738112136

In [19]:
cross_val_score(svm, features, target, cv=3).mean()

np.float64(0.7936967873978332)

In [20]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [21]:
dt.score(X_test, y_test)

0.7345635202271115

In [22]:
cross_val_score(dt, features, target, cv=3).mean()

np.float64(0.7273896116370725)

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [24]:
rf.score(X_test, y_test)

0.794180269694819

In [25]:
cross_val_score(rf, features, target, cv=3).mean()

np.float64(0.787732874884438)

In [26]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

In [27]:
nb.score(X_test, y_test)

0.7579843860894251

In [28]:
cross_val_score(nb, features, target, cv=3).mean()

np.float64(0.7519532710212538)