In [1]:
from fastai.tabular import *

## NN with fastai

In [2]:
# Folder holding the Telco churn data; the same `path` is later handed to fastai.
path = Path('.datasets/telco')

# Read the raw customer table and preview its first rows.
csvFile = path / 'Telco-Customer-Churn.csv'
dataSet = pd.read_csv(csvFile)
dataSet.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Structural summary of the raw frame: size, column names, null count, cardinalities.
rowCount, columnCount = dataSet.shape
print("Rows     : ", rowCount)
print("Columns  : ", columnCount)
print("\nFeatures : \n", list(dataSet.columns))

# Total number of null cells over the whole frame.
# Blank strings are not nulls, so the " " entries that a later cell replaces
# in TotalCharges are not counted here.
missingTotal = dataSet.isnull().values.sum()
print("\nMissing values :  ", missingTotal)

# Distinct-value count per column.
print("\nUnique values :  \n", dataSet.nunique())

Rows     :  7043
Columns  :  21

Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [4]:
# Clean the raw frame in place of `dataSet`. Every step is written so that the
# cell can be executed again on its own output without failing or changing it.

# Drop the identifier column. errors='ignore' keeps a second run of this cell
# from raising KeyError once 'customerID' is already gone.
dataSet = dataSet.drop(columns='customerID', errors='ignore')

# Replacing spaces with null values in total charges column, then convert to float type
# (blank entries become NaN; any other non-numeric text still raises).
dataSet['TotalCharges'] = dataSet['TotalCharges'].replace(" ", np.nan).astype(float)

# Dropping null values from total charges column which contain .15% missing data,
# and renumber the rows 0..n-1 so positional and label indices agree again.
hasTotalCharges = dataSet['TotalCharges'].notnull()
dataSet = dataSet[hasTotalCharges].reset_index(drop=True)

In [5]:
# Show the last rows of the cleaned frame; after the index reset the final
# label is 7031, i.e. 7032 rows remain of the original 7043.
dataSet.tail()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7027,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7028,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7029,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7030,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7031,Male,0,No,No,66,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
# Column the model is trained to predict.
dependentVar = 'Churn'

# Columns passed to fastai as `cat_names`.
categoricalNames = [
    # customer profile
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    # phone / internet subscription
    'PhoneService', 'MultipleLines', 'InternetService',
    # internet add-ons
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    # contract and billing
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Columns passed to fastai as `cont_names`.
continuousNames = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Preprocessing steps passed to fastai as `procs`, in this order.
processors = [FillMissing, Categorify, Normalize]

In [7]:
# Unlabelled item list built from a copy of rows 800-999.
# These are the same positions the next cell uses as the validation split.
testRows = dataSet.iloc[800:1000].copy()
test = TabularList.from_df(
    testRows,
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
)

In [8]:
# Assemble the databunch step by step: items -> split -> labels -> test set -> batches.
validationIdx = list(range(800, 1000))  # rows 800-999 are held out for validation

tabularItems = TabularList.from_df(
    dataSet,
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
    procs=processors,
)
splitItems = tabularItems.split_by_idx(validationIdx)
labelledItems = splitItems.label_from_df(cols=dependentVar)
withTestSet = labelledItems.add_test(test, label=0)
data = withTestSet.databunch()

In [9]:
# Preview ten processed rows: the continuous columns (tenure, MonthlyCharges,
# TotalCharges) appear as normalized values and the label is shown as `target`.
data.show_batch(rows=10)

gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,tenure,MonthlyCharges,TotalCharges,target
Male,0,No,No,No,No phone service,DSL,No,No,Yes,No,Yes,No,Month-to-month,Yes,Mailed check,-1.199,-0.8171,-0.9493,Yes
Male,0,Yes,Yes,No,No phone service,DSL,No,No,Yes,Yes,No,Yes,Two year,No,Credit card (automatic),0.8798,-0.6225,0.0638,No
Male,0,Yes,No,Yes,No,DSL,Yes,No,Yes,No,No,No,Month-to-month,No,Mailed check,-1.2805,-0.2998,-0.9824,No
Female,0,Yes,No,Yes,No,DSL,No,Yes,No,No,No,Yes,Two year,Yes,Credit card (automatic),1.4912,-0.1086,0.8179,No
Male,0,No,No,Yes,Yes,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),0.1869,1.1337,0.5283,Yes
Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,No,Electronic check,-1.2805,0.3188,-0.9742,No
Male,0,Yes,Yes,No,No phone service,DSL,Yes,Yes,No,No,Yes,No,One year,No,Mailed check,1.4504,-0.6624,0.311,No
Female,0,No,No,Yes,Yes,DSL,No,No,No,Yes,Yes,No,Month-to-month,No,Mailed check,0.0646,0.0095,-0.054,No
Female,0,No,No,Yes,Yes,DSL,Yes,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,-0.1392,0.0461,-0.1624,No
Male,0,No,No,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,One year,No,Electronic check,-0.18,1.1803,0.1806,No


In [10]:
# Build the tabular model on the databunch above. `layers=[200,100]` is the
# layer configuration handed to fastai; accuracy is the metric reported per epoch.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [11]:
# Train for 2 epochs (one output row per epoch) with 1e-2 as the second
# positional argument — presumably the learning rate; confirm against fastai's Learner.fit.
learn.fit(2,1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.43728,0.419066,0.8,00:00
1,0.429419,0.437677,0.775,00:01


In [12]:
# Inference: score a single customer record with the trained learner.
# NOTE(review): position 46 lies outside the 800-999 validation split, so this
# row was part of the training data.
sampleIdx = 46
row = dataSet.iloc[sampleIdx]
learn.predict(row)

(Category No, tensor(0), tensor([0.6949, 0.3051]))

In [13]:
# Display the record that was just scored, to compare its actual Churn value
# with the predicted category.
dataSet.iloc[46]

gender                        Male
SeniorCitizen                    0
Partner                         No
Dependents                      No
tenure                           2
PhoneService                   Yes
MultipleLines                   No
InternetService                DSL
OnlineSecurity                  No
OnlineBackup                   Yes
DeviceProtection                No
TechSupport                     No
StreamingTV                     No
StreamingMovies                 No
Contract            Month-to-month
PaperlessBilling                No
PaymentMethod         Mailed check
MonthlyCharges               49.25
TotalCharges                    97
Churn                           No
Name: 46, dtype: object

## Improving Data Manipulation

In [14]:
# Start again from the untouched CSV so the improved cleaning is applied to raw data.
rawCsv = path / 'Telco-Customer-Churn.csv'
df = pd.read_csv(rawCsv)

In [15]:
# Clean the reloaded frame. Every step is written so that the cell can be
# executed again on its own output without failing or changing it.

# Drop the identifier column. errors='ignore' keeps a second run of this cell
# from raising KeyError once 'customerID' is already gone.
df = df.drop(columns='customerID', errors='ignore')

# Replacing spaces with null values in total charges column, then convert to float type
# (blank entries become NaN; any other non-numeric text still raises).
df['TotalCharges'] = df['TotalCharges'].replace(" ", np.nan).astype(float)

# Dropping null values from total charges column which contain .15% missing data,
# and renumber the rows 0..n-1 so positional and label indices agree again.
hasTotalCharges = df['TotalCharges'].notnull()
df = df[hasTotalCharges].reset_index(drop=True)

# Replace 'No internet service' with 'No' for the internet add-on columns,
# collapsing each of them to a plain Yes/No value. Done in one vectorised call
# instead of a per-column loop.
colsToReplace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies']
df[colsToReplace] = df[colsToReplace].replace({'No internet service': 'No'})

# Express SeniorCitizen as Yes/No text like the other binary columns.
df["SeniorCitizen"] = df["SeniorCitizen"].replace({1: "Yes", 0: "No"})

In [16]:
# Rebuild the test list and databunch from the re-cleaned frame `df`.
# This rebinds `test` and `data`, replacing the objects built from `dataSet`.
heldOutIdx = list(range(800, 1000))  # rows 800-999 serve as validation and test

test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
)

cleanedItems = TabularList.from_df(
    df,
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
    procs=processors,
)
cleanedSplit = cleanedItems.split_by_idx(heldOutIdx)
cleanedLabelled = cleanedSplit.label_from_df(cols=dependentVar)
data = cleanedLabelled.add_test(test, label=0).databunch()

In [17]:
# Second learner, created with the same arguments as `learn` but on the
# databunch rebuilt from the re-cleaned frame, so the two runs can be compared.
learn2 = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [22]:
# Train the second learner for 1 epoch with the same 1e-2 second argument as before.
# NOTE(review): the execution counter jumps from 17 to 22 here, so this cell was
# probably run several times; if so, the model had already been trained by the
# earlier runs and the accuracy shown is not a single-epoch result — confirm
# with Restart & Run All.
learn2.fit(1,1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.413034,0.412248,0.815,00:00
