In [1]:
# imports
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
import utils

In [2]:
# Reading data
def _equals_flag(positive):
    """Return a converter that maps a raw CSV cell to 1 if it equals `positive`, else 0."""
    return lambda cell: int(cell == positive)


# Columns whose raw value is compared against 'Yes'; any other value becomes 0.
yes_flag_columns = [
    'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn',
]

column_converters = {name: _equals_flag('Yes') for name in yes_flag_columns}
column_converters['gender'] = _equals_flag('Female')  # 'Female' -> 1, anything else -> 0
column_converters['MonthlyCharges'] = float

telco_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',
                         converters=column_converters)

In [3]:
# Preprocessing
# Remove the customerID column before the numeric processing below.
telco_data = telco_data.drop(columns='customerID')
telco_data = telco_data.astype({
    'tenure': int,
    "MonthlyCharges": float,
})

# TotalCharges contains blank (' ') cells, so a plain float cast fails on it.
# Parse it explicitly: unparseable cells become NaN and are then imputed with
# the median of the valid values. The result is assigned back to the frame
# rather than mutated through a chained `inplace=True` call, which is not
# guaranteed to write through to the parent DataFrame.
total_charges = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
totalChargesMedian = total_charges.median()
telco_data['TotalCharges'] = total_charges.fillna(totalChargesMedian)

# Encode the multi-valued categorical columns via the project helper.
# NOTE(review): encode_and_bind lives in utils and is not visible here; it is
# presumably a one-hot style encoder that replaces `column` -- confirm.
columns_to_encode = ['InternetService', 'Contract', 'PaymentMethod']
for column in columns_to_encode:
    telco_data = utils.encode_and_bind(telco_data, column)

# Move the target to the final column for better visualization
# (pop removes it; the assignment re-appends it at the end).
telco_data['Churn'] = telco_data.pop('Churn')

# Rescale every column with MinMaxScaler.
# NOTE(review): the scaler is fit on all rows; if a train/test split happens
# downstream, consider fitting it on the training portion only.
all_columns = list(telco_data.columns)
telco_data[all_columns] = MinMaxScaler().fit_transform(telco_data[all_columns])

In [4]:
# Show a small slice of the processed frame, then persist the whole frame.
preview_rows = telco_data.iloc[487:494]
display(preview_rows)

# NOTE(review): to_csv writes the row index as an unnamed leading column by
# default; readers of telco.csv need index_col=0 (or this call needs
# index=False) -- confirm against whatever consumes the file.
telco_data.to_csv('telco.csv')

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,InternetService_DSL,InternetService_Fiber optic,Contract_Month-to-month,Contract_One year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,Churn
487,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.7,0.713495,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
488,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.341294,0.15909,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
489,0.0,0.0,0.0,1.0,0.013889,1.0,1.0,0.0,0.0,0.0,...,0.558209,0.00641,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
490,0.0,0.0,1.0,1.0,0.875,1.0,1.0,0.0,1.0,1.0,...,0.861194,0.75911,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
491,1.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.0,1.0,0.0,...,0.405473,0.011003,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
492,1.0,0.0,1.0,0.0,0.027778,1.0,0.0,1.0,0.0,0.0,...,0.558706,0.013916,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
493,0.0,1.0,1.0,0.0,0.847222,1.0,1.0,0.0,1.0,0.0,...,0.455721,0.448165,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [5]:
# Sanity check: report the (rows, columns) of the processed frame.
frame_shape = telco_data.shape
print(frame_shape)


(7043, 24)
