In [38]:
# Importing packages
import numpy as np
import pandas as pd 

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [39]:
# Load dataset.
telco_data = pd.read_csv('data/telco_customer_churn.csv')

# Checking dataframe.
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [40]:
# Cleaning up columns names and categorical variables. 
columns = telco_data.columns.str.lower().str.replace(' ', '_')

# changeing columns names 
telco_data.columns = columns

# Checking for changes made. 
telco_data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [41]:
# Categorical Features. 
categorical = ['customerid', 'gender', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod',
       'churn']

# Converting to lower case and removing space. 
for c in categorical: 
    telco_data[c] = telco_data[c].str.lower().str.replace(' ', '_')
    print(c)

customerid
gender
partner
dependents
phoneservice
multiplelines
internetservice
onlinesecurity
onlinebackup
deviceprotection
techsupport
streamingtv
streamingmovies
contract
paperlessbilling
paymentmethod
churn


In [42]:
# Checking dataframe.
telco_data.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [43]:
# Converting totalcharge column to number. 
telco_data['totalcharges'] = pd.to_numeric(telco_data['totalcharges'], errors='coerce')

# Filling null values with 0. 
telco_data['totalcharges'] = telco_data['totalcharges'].fillna(0)

# checking for null. 
telco_data['totalcharges'].isna().sum()

0

In [44]:
# converting churn to int
telco_data['churn'] = (telco_data['churn']== 'yes').astype(int)

# Checking churn features.
telco_data.churn.value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [58]:
# Setting up data validation dataset. 
telco_train, telco_test = train_test_split(telco_data, test_size = 0.20, random_state = 1)

# Creating y features. 
y_train = telco_train['churn']
y_test = telco_test['churn']

# Deleting churn from dataset. 
del telco_train['churn']
del telco_test['churn']

In [59]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 5634 entries, 1814 to 5157
Series name: churn
Non-Null Count  Dtype
--------------  -----
5634 non-null   int32
dtypes: int32(1)
memory usage: 66.0 KB


In [60]:
# Defining a training function. 
def train(df_train, df_y, c = 1.0):
    model = LogisticRegression(C= c , max_iter = 1000)
    dv = DictVectorizer(sparse = False)
    
    dict_train = df_train.to_dict(orient = 'records')
    x_train = dv.fit_transform(dict_train)
    
    model.fit(x_train, df_y.values)
    
    return model, dv

In [61]:
# Defining prediction function. 
def predict(df_val, model, dv): 
    x_val = dv.transform(df_val)
    
    y_pred = model.predict_proba(x_val)[:, 1]
    
    return y_pred

In [62]:
# Importing Cross validation package kfold.
from sklearn.model_selection import KFold

In [63]:
# Setting parameters.

# Kfold parameter.
n_splits = 5

# Logistic regression parameter.
c = 0.1

In [64]:
KFold.split?

In [65]:
# Setting up Kfold. 
kf = KFold(n_splits = n_splits, shuffle= True, random_state= 4)

In [66]:
from tqdm.auto import tqdm

In [67]:
score = []

for train_idx, test_idx in tqdm(kf.split(telco_train), total = n_splits):
    df_train = telco_train.iloc[train_idx]
    df_val = telco_train.iloc[test_idx]
    
    y_train = y_train.iloc[train_idx]
    y_val = y_train.iloc[test_idx]
    
    model, dv = train(df_train, y_train,C=c)
    y_pred = predict(df_val, model, dv)
    
    auc = roc_auc_score(y_pred, y_test)
    
    score.append(auc)
    

  0%|          | 0/5 [00:00<?, ?it/s]

IndexError: positional indexers are out-of-bounds