In [248]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import warnings
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
pd.set_option('display.max_column' , None)
warnings.simplefilter(action='ignore',category=Warning)
pd.options.display.float_format = '{:,.3f}'.format

In [135]:
df = pd.read_csv("../../datasets/Churn/Telco-Customer-Churn.csv")

In [136]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162,0.369,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371,24.559,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.762,30.09,18.25,35.5,70.35,89.85,118.75


In [40]:
def grab_col_names(data_frame, cat_th=25 , car_th=30): 
    cat_cols = [col for col in data_frame.columns if data_frame[col].dtype == "O"]
    num_but_cat = [col for col in data_frame.columns if data_frame[col].nunique() < car_th and data_frame[col].dtype != "O"]
    cat_but_car = [col for col in data_frame.columns if data_frame[col].nunique() > car_th and data_frame[col].dtype == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in data_frame.columns if data_frame[col].dtype != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {data_frame.shape[0]}")
    print(f"Variables: {data_frame.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"num_but_car: {len(num_but_cat)}")
    return num_cols,cat_cols,cat_but_car

def missing_values_table(data_frame, na_name=False):
    na_columns = [col for col in data_frame.columns if data_frame[col].isnull().sum() > 0]

    n_miss = data_frame[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (data_frame[na_columns].isnull().sum() / data_frame.shape[0] * 100)
    missing_df = pd.concat([n_miss, np.round(ratio,2)],axis=1, keys=["n_miss" , "ratio"])
    print(missing_df,end="\n")

    if na_name:
        return na_columns

def locate_uneven_cols(dataframe, col):
    total_values = dataframe[col].value_counts().sum()
    val_counts = dataframe[col].value_counts() 
    for count in val_counts:
        percentage = (count / total_values) * 100
        if percentage > 95:
            return True
    return False

In [11]:
num_cols , cat_cols, cat_but_car = grab_col_names(df)

Observations: 7043
Variables: 21
cat_cols: 17
cat_but_car: 2
num_cols: 2
num_but_car: 1


In [12]:
cat_cols

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn',
 'SeniorCitizen']

In [121]:
bool_cols = [col for col in df.columns if (df[col].dtype == "O") & (df[col].nunique() == 2)]
bool_cols

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'PaperlessBilling',
 'Churn']

In [24]:
missing_values_table(df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


In [43]:
for col in df.columns:
    if df[col].nunique() == 2:
        print("#######",end="\n\n")
        print(f"{col} = boolean")
    else:
        print("#######",end="\n\n")
        
        print(f"{col} = NOT BOOLEAN , count : {df[col].nunique()}")

#######

customerID = NOT BOOLEAN , count : 7043
#######

gender = boolean
#######

SeniorCitizen = boolean
#######

Partner = boolean
#######

Dependents = boolean
#######

tenure = NOT BOOLEAN , count : 73
#######

PhoneService = boolean
#######

MultipleLines = NOT BOOLEAN , count : 3
#######

InternetService = NOT BOOLEAN , count : 3
#######

OnlineSecurity = NOT BOOLEAN , count : 3
#######

OnlineBackup = NOT BOOLEAN , count : 3
#######

DeviceProtection = NOT BOOLEAN , count : 3
#######

TechSupport = NOT BOOLEAN , count : 3
#######

StreamingTV = NOT BOOLEAN , count : 3
#######

StreamingMovies = NOT BOOLEAN , count : 3
#######

Contract = NOT BOOLEAN , count : 3
#######

PaperlessBilling = boolean
#######

PaymentMethod = NOT BOOLEAN , count : 4
#######

MonthlyCharges = NOT BOOLEAN , count : 1585
#######

TotalCharges = NOT BOOLEAN , count : 6531
#######

Churn = boolean


In [45]:
df["OnlineSecurity"].value_counts()

OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64

In [41]:
uneven_cols = []
for col in df.columns:
    bool = locate_uneven_cols(df,col)
    if bool:
        uneven_cols.append(col)

In [42]:
uneven_cols

[]

In [46]:
df["Churn"].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [48]:
for col in df.columns:
    if df[col].nunique() < 10:
        print("########################", end="\n\n\n")
        print(f"{col} value counts : {df[col].value_counts()} ")

########################


gender value counts : gender
Male      3555
Female    3488
Name: count, dtype: int64 
########################


SeniorCitizen value counts : SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64 
########################


Partner value counts : Partner
No     3641
Yes    3402
Name: count, dtype: int64 
########################


Dependents value counts : Dependents
No     4933
Yes    2110
Name: count, dtype: int64 
########################


PhoneService value counts : PhoneService
Yes    6361
No      682
Name: count, dtype: int64 
########################


MultipleLines value counts : MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64 
########################


InternetService value counts : InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64 
########################


OnlineSecurity value counts : OnlineSecurity
No                    

In [57]:
"No internet service" in df["StreamingMovies"].unique()

True

In [137]:
for col in df.columns:
    if "No internet service" in df[col].unique():
        df[col] = df[col].replace({"No internet service":"No"})  

In [60]:
for col in df.columns:
    if df[col].nunique() < 10:
        print("########################", end="\n\n\n")
        print(f"{col} value counts : {df[col].value_counts()} ")

########################


gender value counts : gender
Male      3555
Female    3488
Name: count, dtype: int64 
########################


SeniorCitizen value counts : SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64 
########################


Partner value counts : Partner
No     3641
Yes    3402
Name: count, dtype: int64 
########################


Dependents value counts : Dependents
No     4933
Yes    2110
Name: count, dtype: int64 
########################


PhoneService value counts : PhoneService
Yes    6361
No      682
Name: count, dtype: int64 
########################


MultipleLines value counts : MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64 
########################


InternetService value counts : InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64 
########################


OnlineSecurity value counts : OnlineSecurity
No     5024
Yes    201

In [138]:
for col in df.columns:
    if "No phone service" in df[col].unique():
        df[col] = df[col].replace({"No phone service":"No"})  

In [146]:
df.drop(["customerID"],axis=1, inplace=True)

In [148]:
num_col, cat_col, cat_but_car = grab_col_names(df)

Observations: 7043
Variables: 20
cat_cols: 17
cat_but_car: 0
num_cols: 3
num_but_car: 1


In [147]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [128]:
num_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [171]:
df["SeniorCitizen"] = df["SeniorCitizen"].map({False: "No", True: "Yes"})

In [120]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [111]:
label_encoder = LabelEncoder()

In [174]:
def label_encode(le, df, col, out=False):
    df[col] = le.fit_transform(df[col])
    if out:
        print(f"{col} label encoded")

In [114]:
def one_hot_encode(data_frame, categorical_cols, drop_first=True):
    data_frame = pd.get_dummies(data_frame, columns=categorical_cols,drop_first=drop_first)
    return data_frame

In [157]:
for col in df.columns:
    print(f"{col} : {df[col].nunique()}")

gender : 2
SeniorCitizen : 2
Partner : 2
Dependents : 2
tenure : 73
PhoneService : 2
MultipleLines : 2
InternetService : 3
OnlineSecurity : 2
OnlineBackup : 2
DeviceProtection : 2
TechSupport : 2
StreamingTV : 2
StreamingMovies : 2
Contract : 3
PaperlessBilling : 2
PaymentMethod : 4
MonthlyCharges : 1585
TotalCharges : 6530
Churn : 2


In [164]:
ohe_cols = []
label_encode_cols = []
numeric_cols = []
for col in df.columns:
    if df[col].nunique() < 3:
       label_encode_cols.append(col)
    elif df[col].nunique() > 2 and df[col].nunique() < 11:
        ohe_cols.append(col)
    else:
        numeric_cols.append(col)

['InternetService', 'Contract', 'PaymentMethod']
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
['tenure', 'MonthlyCharges', 'TotalCharges']


In [165]:
ohe_cols

['InternetService', 'Contract', 'PaymentMethod']

In [169]:
label_encode_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling']

In [167]:
numeric_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [172]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [173]:
dff = df.copy()

In [175]:
for col in dff.columns:
    label_encode(label_encoder, dff, col, True)

gender label encoded
SeniorCitizen label encoded
Partner label encoded
Dependents label encoded
tenure label encoded
PhoneService label encoded
MultipleLines label encoded
InternetService label encoded
OnlineSecurity label encoded
OnlineBackup label encoded
DeviceProtection label encoded
TechSupport label encoded
StreamingTV label encoded
StreamingMovies label encoded
Contract label encoded
PaperlessBilling label encoded
PaymentMethod label encoded
MonthlyCharges label encoded
TotalCharges label encoded
Churn label encoded


In [212]:
dff

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,InternetService_1,InternetService_2,Contract_1,Contract_2,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3
0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,142,74,0,False,False,False,False,False,True,False
1,1,0,0,0,34,1,0,1,0,1,0,0,0,0,498,3624,0,False,False,True,False,False,False,True
2,1,0,0,0,2,1,0,1,1,0,0,0,0,1,436,536,1,False,False,False,False,False,False,True
3,1,0,0,0,45,0,0,1,0,1,1,0,0,0,266,3570,0,False,False,True,False,False,False,False
4,0,0,0,0,2,1,0,0,0,0,0,0,0,1,729,674,1,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,1,1,0,1,1,1,1,1,991,3700,0,False,False,True,False,False,False,True
7039,0,0,1,1,72,1,1,0,1,1,0,1,1,1,1340,6304,0,True,False,True,False,True,False,False
7040,0,0,1,1,11,0,0,1,0,0,0,0,0,1,137,1265,0,False,False,False,False,False,True,False
7041,1,1,1,0,4,1,1,0,0,0,0,0,0,1,795,1157,1,True,False,False,False,False,False,True


In [203]:
dff = one_hot_encode(dff, ohe_cols)

In [179]:
def check_outliers(data_frame, col_name):
    up_limit,low_limit = outliers_treshold(data_frame,col_name)
    if data_frame[(data_frame[col_name] > up_limit) | (data_frame[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

def outliers_treshold(data_frame, col_name, q1=0.25 ,q3=0.75):
    quartile1 = data_frame[col_name].quantile(q1)
    quartile3 = data_frame[col_name].quantile(q3)
    interquartile_range = quartile3-quartile1
    up_limit = quartile3 + 1.5 * interquartile_range
    low_limit = quartile1 - 1.5 * interquartile_range
    return up_limit, low_limit

def grab_outliers(data_frame,col_name,index=False):
    up,low = outliers_treshold(data_frame,col_name)
    # if data_frame[((data_frame[col_name] < low) | (data_frame[col_name] > up))].shape[0] > 10:
    #     print(data_frame[((data_frame[col_name] < low) | (data_frame[col_name] > up))].head())
    # else:
    #     print(data_frame[((data_frame[col_name] < low) | (data_frame[col_name] > up))])
    if index:
        outlier_index = data_frame[((data_frame[col_name] < low) | (data_frame[col_name] > up))].index
        return outlier_index

In [181]:
outliers = []
for col in dff.columns:
    if check_outliers(dff,col):
        print(col)
        outliers.append(col)

SeniorCitizen
PhoneService


In [184]:
ss = StandardScaler()

In [204]:
df3 = dff.copy()

In [205]:
X = df3.drop(["Churn"], axis=1)
y = df3["Churn"]

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [207]:
X_train_scaled = ss.fit_transform(X_train)

In [208]:
X_test_scaled = ss.transform(X_test)

In [209]:
X_train_scaled.shape

(5634, 23)

# Logistic regression Standart scaled

In [230]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

predictions = log_reg.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")


Accuracy: 0.8126330731014905


In [231]:
cv_results = cross_validate(log_reg, X_train_scaled, y_train, cv=10, scoring=["accuracy" , "f1" , "roc_auc"])


In [232]:
print(f"accuracy: {cv_results['test_accuracy'].mean()}")
print(f"f1: {cv_results['test_f1'].mean()}")
print(f"roc_auc: {cv_results['test_roc_auc'].mean()}")

accuracy: 0.8049270624692945
f1: 0.5799320966555255
roc_auc: 0.8432064243087769


In [233]:
rs = RobustScaler()

X_train_robust_scaled = rs.fit_transform(X_train)
X_test_robust_scaled = rs.fit_transform(X_test)

# Logisctic Regression robust scaled

In [234]:
log_reg = LogisticRegression()
log_reg.fit(X_train_robust_scaled, y_train)

predictions = log_reg.predict(X_test_robust_scaled)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")

Accuracy: 0.8133427963094393


In [235]:
cv_results = cross_validate(log_reg, X_train_robust_scaled, y_train, cv=10, scoring=["accuracy" , "f1" , "roc_auc"])
print(f"accuracy: {cv_results['test_accuracy'].mean()}")
print(f"f1: {cv_results['test_f1'].mean()}")
print(f"roc_auc: {cv_results['test_roc_auc'].mean()}")

accuracy: 0.8058135872919895
f1: 0.5829407515490971
roc_auc: 0.8432737672165447
