In [1]:
from fastai.tabular import *

In [2]:
# Folder holding the Telco churn data; `path` is reused by later cells.
path = Path('.datasets/telco')

# Read the raw customer table from the CSV inside that folder.
csvFile = path / 'Telco-Customer-Churn.csv'
dataSet = pd.read_csv(csvFile)

# Preview the first rows.
dataSet.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Quick structural summary of the loaded frame.
rowCount, columnCount = dataSet.shape
print("Rows     : ", rowCount)
print("Columns  : ", columnCount)

featureNames = dataSet.columns.tolist()
print("\nFeatures : \n", featureNames)

# NOTE(review): isnull() only counts real NaN/None cells; placeholder strings
# such as a blank " " are not counted as missing here.
missingTotal = dataSet.isnull().sum().values.sum()
print("\nMissing values :  ", missingTotal)

# Number of distinct values per column.
uniquePerColumn = dataSet.nunique()
print("\nUnique values :  \n", uniquePerColumn)

Rows     :  7043
Columns  :  21

Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [4]:
# Clean the TotalCharges column so it can be used as a continuous feature.
#
# The column arrives as text because some rows carry a blank placeholder
# instead of a number. pd.to_numeric(errors="coerce") turns every entry that
# cannot be parsed as a number (a single space, an empty string, several
# spaces, ...) into NaN, rather than matching only the exact string " ".
totalCharges = pd.to_numeric(dataSet["TotalCharges"], errors="coerce").astype(float)

# Drop the rows without a usable TotalCharges value (about 0.15% of the data)
# and renumber the index 0..n-1 so positional and label indexing agree again.
dataSet = (
    dataSet
    .assign(TotalCharges=totalCharges)
    .dropna(subset=["TotalCharges"])
    .reset_index(drop=True)
)

In [5]:
# Show the last rows of the cleaned frame to check the index after cleaning.
dataSet.tail()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7027,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7028,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7029,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7030,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7031,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
# Column the model is trained to predict.
dependentVar = 'Churn'

# Columns handled as categorical features (order is significant and kept as-is).
categoricalNames = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Columns handled as continuous (numeric) features.
continuousNames = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

# Preprocessing steps handed to TabularList, applied in this order.
processors = [
    FillMissing,
    Categorify,
    Normalize,
]

In [7]:
# Rows at positions 800..999, copied so the item list owns its own frame.
# NOTE(review): these are the same positions the databunch cell passes to
# split_by_idx for validation, so the "test" set is not extra unseen data.
testFrame = dataSet.iloc[800:1000].copy()

# Unlabeled item list over those rows, using the shared feature definitions.
test = TabularList.from_df(
    testFrame,
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
)

In [8]:
# Build the DataBunch one data-block step at a time.

# 1. Items: every row of the frame, with the preprocessing steps attached.
allItems = TabularList.from_df(
    dataSet,
    path=path,
    cat_names=categoricalNames,
    cont_names=continuousNames,
    procs=processors,
)

# 2. Split: positions 800..999 form the validation set, the rest is training.
validationIdx = list(range(800, 1000))
splitItems = allItems.split_by_idx(validationIdx)

# 3. Label: the target is read from the dependent-variable column.
labelledItems = splitItems.label_from_df(cols=dependentVar)

# 4. Test set: attach the separately built `test` items.
#    label=0 is presumably just a placeholder label for them - TODO confirm.
withTestItems = labelledItems.add_test(test, label=0)

# 5. Wrap everything into a DataBunch with default settings.
data = withTestItems.databunch()

In [9]:
# Preview 10 processed rows from the DataBunch; in the rendered output the
# continuous columns appear as normalized values and the label is shown as `target`.
data.show_batch(rows=10)

gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,tenure,MonthlyCharges,TotalCharges,target
Male,0,No,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,-1.2805,0.3521,-0.9738,Yes
Female,1,No,No,Yes,No,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Electronic check,-1.1174,0.6647,-0.8336,No
Female,0,Yes,No,No,No phone service,DSL,Yes,Yes,Yes,Yes,No,No,Two year,Yes,Bank transfer (automatic),1.5727,-0.5693,0.4851,No
Male,0,Yes,Yes,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Credit card (automatic),-0.2207,-0.3314,-0.3399,No
Male,0,No,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,-1.1174,-1.4723,-0.9598,No
Male,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,-1.1174,0.6581,-0.8066,Yes
Male,0,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Electronic check,0.7575,-1.5172,-0.5618,No
Female,0,Yes,No,Yes,No,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,-0.5468,1.0672,-0.2371,No
Female,1,Yes,No,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),1.1651,1.32,1.8223,Yes
Female,0,No,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),-0.7098,-1.3193,-0.844,No


In [10]:
# Tabular learner over `data`, configured with layer sizes 200 and 100 and
# reporting accuracy as its metric.
hiddenLayerSizes = [200, 100]
learn = tabular_learner(
    data,
    layers=hiddenLayerSizes,
    metrics=accuracy,
)

In [11]:
# Train for a single epoch with a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.435601,0.410086,0.81,00:01


In [13]:
# Inference: run the trained learner on one record taken from the frame.
# NOTE(review): position 46 lies outside the 800..999 validation indices, so
# this record was part of the training split rather than held-out data.
rowPosition = 46
row = dataSet.iloc[rowPosition]

# Result is displayed as (predicted category, class index, class probabilities).
learn.predict(row)

(Category No, tensor(0), tensor([0.8255, 0.1745]))

In [14]:
# Show the full record at position 46 (the one passed to learn.predict) so the
# prediction can be compared with its actual Churn value.
dataSet.iloc[46]

customerID              5948-UJZLF
gender                        Male
SeniorCitizen                    0
Partner                         No
Dependents                      No
tenure                           2
PhoneService                   Yes
MultipleLines                   No
InternetService                DSL
OnlineSecurity                  No
OnlineBackup                   Yes
DeviceProtection                No
TechSupport                     No
StreamingTV                     No
StreamingMovies                 No
Contract            Month-to-month
PaperlessBilling                No
PaymentMethod         Mailed check
MonthlyCharges               49.25
TotalCharges                    97
Churn                           No
Name: 46, dtype: object