In [1]:
import statistics
import numpy as np
import pandas as pd

In [2]:

training_set = pd.read_csv("train.csv")
testing_set = pd.read_csv('test.csv')
columns = training_set.columns
columns


Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
testing_set.isnull().sum()


gender                0
SeniorCitizen         0
Partner               0
Dependents            0
tenure                0
PhoneService          0
MultipleLines         0
InternetService       0
OnlineSecurity        0
OnlineBackup          0
DeviceProtection      0
TechSupport           0
StreamingTV           0
StreamingMovies       0
Contract              0
PaperlessBilling      0
PaymentMethod         0
MonthlyCharges      299
TotalCharges          0
dtype: int64

In [4]:
# preprocess missing values and label encoding..

# replace string labels with integers.
training_set['Churn'] = training_set['Churn'].replace({'Yes': 1, 'No': 0})
#

In [5]:
# # to numerical float64
training_set["tenure"] = training_set["tenure"].astype(np.float64)
testing_set["tenure"] = testing_set["tenure"].astype(np.float64)
#

In [6]:
# # replace missing values by the mean of the column, there might more advanced method but we are running out of time.
training_set["MonthlyCharges"] = training_set["MonthlyCharges"].fillna(round(training_set["MonthlyCharges"].mean(),2))
testing_set["MonthlyCharges"] = testing_set["MonthlyCharges"].fillna(round(testing_set["MonthlyCharges"].mean(),2))

In [7]:
# finding what is the problem with this column, unable to change it automatiaclly to float, it turned out there are empty
# string instead of floats
total_charges = training_set["TotalCharges"].tolist()
values = []
for tc in total_charges:
    if tc is not " ":
      values.append(float(tc))

# replace empty string with floats as the mean of the column:
training_set["TotalCharges"] = training_set["TotalCharges"].replace({' ': round(statistics.mean(values),2)})
training_set["TotalCharges"] = training_set["TotalCharges"].astype(np.float64)
#
#
total_charges_test = testing_set["TotalCharges"].tolist()
testing_values = []
for tc in total_charges_test:
    if tc is not " ":
      testing_values.append(float(tc))


testing_set["TotalCharges"] = testing_set["TotalCharges"].replace({" ": round(statistics.mean(testing_values),2)})
testing_set["TotalCharges"] = testing_set["TotalCharges"].astype(np.float64)


In [8]:
columns = training_set.columns

num_columns = training_set._get_numeric_data().columns

categorical_columns = list(set(columns) - set(num_columns))


# # use get dummies to encode categorical features, there might another approach but this is the simplest,
# # and according to my experience, different encoding strategies does influence the final model in binary classification
# # scenarios.
encoded_data = pd.get_dummies(training_set, columns=categorical_columns)



In [9]:
columns = testing_set.columns
num_columns = testing_set._get_numeric_data().columns
categorical_columns = list(set(columns) - set(num_columns))

encoded_test_data = pd.get_dummies(testing_set,  columns=categorical_columns)


In [10]:
encoded_test_data.describe()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Dependents_No,Dependents_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,OnlineSecurity_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes
count,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,...,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0,2831.0
mean,0.158954,32.980219,64.458244,2309.111243,0.714235,0.285765,0.400565,0.218651,0.380784,0.492052,...,0.230661,0.210173,0.329566,0.229601,0.429177,0.218651,0.352172,0.392794,0.218651,0.388555
std,0.365698,24.785931,28.516302,2280.649254,0.451858,0.451858,0.4901,0.413404,0.485665,0.500025,...,0.42133,0.407503,0.470138,0.42065,0.495046,0.413404,0.477732,0.488458,0.413404,0.487508
min,0.0,0.0,18.8,18.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.0,44.325,402.275,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,30.0,64.46,1411.9,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,56.0,87.775,3886.25,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
max,1.0,72.0,118.6,8684.8,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
encoded_test_data.isnull().sum()

SeniorCitizen                              0
tenure                                     0
MonthlyCharges                             0
TotalCharges                               0
Dependents_No                              0
Dependents_Yes                             0
StreamingTV_No                             0
StreamingTV_No internet service            0
StreamingTV_Yes                            0
OnlineSecurity_No                          0
OnlineSecurity_No internet service         0
OnlineSecurity_Yes                         0
MultipleLines_No                           0
MultipleLines_No phone service             0
MultipleLines_Yes                          0
OnlineBackup_No                            0
OnlineBackup_No internet service           0
OnlineBackup_Yes                           0
PaperlessBilling_No                        0
PaperlessBilling_Yes                       0
Partner_No                                 0
Partner_Yes                                0
TechSuppor

In [12]:
# # store for train models..
# encoded_data.to_csv("preprocessed_data3.csv",index=False)

encoded_test_data.to_csv("preprocessed_test.csv",index=False)
encoded_test_data

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Dependents_No,Dependents_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,OnlineSecurity_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes
0,0,72.0,97.75,6991.60,0,1,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
1,0,57.0,74.60,4368.95,1,0,0,0,1,1,...,1,0,0,0,0,0,1,1,0,0
2,1,46.0,69.10,3255.35,1,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,1
3,0,38.0,101.15,3956.70,0,1,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
4,1,32.0,85.95,2628.60,1,0,0,0,1,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826,0,47.0,100.75,4669.20,1,0,0,0,1,0,...,0,0,0,1,1,0,0,1,0,0
2827,1,47.0,67.45,3252.00,1,0,1,0,0,0,...,0,1,0,0,0,0,1,1,0,0
2828,0,47.0,40.30,1794.80,0,1,1,0,0,0,...,0,0,1,0,0,0,1,1,0,0
2829,0,71.0,47.60,3377.80,1,0,1,0,0,0,...,1,0,0,0,0,0,1,1,0,0


In [13]:
encoded_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'Dependents_No', 'Dependents_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'Partner_No', 'Partner_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'gender_Female', 'gender_Male', 'PhoneService_No', 'PhoneService_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check'

In [14]:
encoded_test_data.columns


Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'Dependents_No', 'Dependents_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'Partner_No', 'Partner_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'gender_Female', 'gender_Male', 'PhoneService_No', 'PhoneService_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'Paymen