In [118]:
# imports
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import entropy
from numpy.random import uniform
import json

In [119]:
# Entropy (H) of the Churn label
def h_func(df):
    """Count churn outcomes in ``df`` and compute the entropy of the label.

    Args:
        df: DataFrame with a ``Churn`` column. Rows whose value equals 1.0
            are counted as positives; every other row counts as a negative.

    Returns:
        Tuple ``(p, n, h)``: the number of positive rows, the number of
        remaining rows, and the base-2 entropy of ``[p/(p+n), n/(p+n)]``.
        An empty frame yields ``(0, 0, 0.0)`` instead of dividing by zero.
    """
    total = df.shape[0]
    if total == 0:
        # No rows means no uncertainty; returning early avoids 0 / 0 below.
        return 0, 0, 0.0
    # Summing the boolean mask counts positives without building a filtered copy.
    p = int((df['Churn'] == 1.0).sum())
    n = total - p
    q = p / total
    h = entropy([q, 1 - q], base=2)
    return p, n, h

In [120]:
# information gain
def info_gain(X, rng=None):
    """Print the information gain of every feature of ``X`` relative to ``Churn``.

    For each column other than ``Churn`` the rows of ``X`` are partitioned and
    the gain is ``H(Churn) - sum(|group| / |X| * H(Churn | group))``:

    * columns with more than two distinct values are split into ``<= t`` and
      ``> t`` at a threshold ``t`` drawn uniformly between the column minimum
      and maximum (so these gains vary between runs unless ``rng`` is seeded);
    * all other columns are partitioned by their distinct values.

    Args:
        X: DataFrame containing a ``Churn`` column plus the feature columns.
            Columns with more than two distinct values must support ``min``,
            ``max`` and ordering comparisons.
        rng: Optional random generator exposing ``uniform(low, high)`` (for
            example ``numpy.random.default_rng(seed)``). When omitted, the
            module-level ``uniform`` imported from ``numpy.random`` is used.

    Returns:
        None. The ``{feature: gain}`` mapping is printed as indented JSON.
    """
    p, n, h_output = h_func(X)
    total = p + n
    draw_threshold = uniform if rng is None else rng.uniform
    feature_gain_map = dict()
    # Iterate over the columns of the frame that was passed in, not a notebook
    # global, so the function works on any frame and on a fresh kernel.
    for feature in X.columns:
        if feature == 'Churn':
            continue
        if len(X[feature].unique()) > 2:
            col_min = X[feature].min()
            col_max = X[feature].max()
            # Random threshold; the column median would be a deterministic
            # alternative.
            divider = draw_threshold(col_min, col_max)
            groups = [X[X[feature] <= divider], X[X[feature] > divider]]
        else:
            # A scalar key keeps group labels scalar and avoids the
            # single-element-list grouper warning of recent pandas versions.
            groups = [group for _, group in X.groupby(feature)]

        remainder = 0
        for group in groups:
            if group.empty:
                # An empty partition has zero weight and contributes nothing.
                continue
            pk, nk, h_k = h_func(group)
            remainder += ((pk + nk) / total) * h_k
        feature_gain_map[feature] = h_output - remainder

    print(json.dumps(feature_gain_map, indent=4))


In [121]:
# One-hot encode
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one column with its dummy/indicator columns.

    The indicator columns come from ``pd.get_dummies``; the last generated
    column is discarded, and the rest are appended after the remaining
    original columns. The input frame is not modified.

    Args:
        original_dataframe: Source DataFrame.
        feature_to_encode: Name of the column to encode.

    Returns:
        A new DataFrame without ``feature_to_encode`` and with the retained
        indicator columns appended on the right.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]]).iloc[:, :-1]
    remaining_columns = original_dataframe.drop([feature_to_encode], axis=1)
    return pd.concat([remaining_columns, indicator_columns], axis=1)
    

In [122]:
# Reading data
# Load the raw CSV, converting text flags to 0/1 integers while parsing.
telco_data = pd.read_csv(
    'WA_Fn-UseC_-Telco-Customer-Churn.csv',
    converters={
        # 'Female' -> 1, any other raw value -> 0.
        'gender': lambda raw: int(raw == 'Female'),
        # 'Yes' -> 1, any other raw value -> 0, for each listed column.
        **{
            flag_column: (lambda raw: int(raw == 'Yes'))
            for flag_column in (
                'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies',
                'PaperlessBilling', 'Churn',
            )
        },
        'MonthlyCharges': float,
    },
)

In [123]:
# Preprocessing
# Drop the customerID column before any feature processing.
telco_data = telco_data.drop(columns='customerID')

# TotalCharges contains blank (' ') entries, so a plain cast to float fails.
# Coerce unparsable entries to NaN, then impute them with the median of the
# valid values. Assigning the column back (instead of a chained in-place
# replace) guarantees the frame itself is updated.
telco_data['TotalCharges'] = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
totalChargesMedian = telco_data['TotalCharges'].median()
telco_data['TotalCharges'] = telco_data['TotalCharges'].fillna(totalChargesMedian)

telco_data = telco_data.astype({
    'tenure': int,
    "MonthlyCharges": float
})

# One-hot encode the multi-valued categorical columns.
columns_to_encode = ['InternetService', 'Contract', 'PaymentMethod']
for column in columns_to_encode:
    telco_data = encode_and_bind(telco_data, column)

# Rescale every column (including Churn) with MinMaxScaler.
all_columns = list(telco_data.columns)
telco_data[all_columns] = MinMaxScaler().fit_transform(telco_data[all_columns])

info_gain(telco_data)

{
    "gender": 5.349933671350282e-05,
    "SeniorCitizen": 0.015259766252591622,
    "Partner": 0.01652413451940382,
    "Dependents": 0.02087184590109281,
    "tenure": 0.07283135493765436,
    "PhoneService": 0.00010410414107420163,
    "MultipleLines": 0.0011555536142534573,
    "OnlineSecurity": 0.02291681174104332,
    "OnlineBackup": 0.0049917440437087235,
    "DeviceProtection": 0.003213641195506378,
    "TechSupport": 0.021075514184494604,
    "StreamingTV": 0.002857839953611774,
    "StreamingMovies": 0.0026948387301600762,
    "PaperlessBilling": 0.027691665182286385,
    "MonthlyCharges": 0.030333913749778896,
    "TotalCharges": 0.02042979061295036,
    "InternetService_DSL": 0.011553763634235659,
    "InternetService_Fiber optic": 0.06897344266913896,
    "Contract_Month-to-month": 0.13256433122708655,
    "Contract_One year": 0.026059526721306625,
    "PaymentMethod_Bank transfer (automatic)": 0.010781510516920889,
    "PaymentMethod_Credit card (automatic)": 0.014187050

In [124]:
# Show a small window of processed rows (positions 487 through 493).
display(telco_data.iloc[487:494])
# Persist the processed frame; the row index is written as the first column.
telco_data.to_csv('telco.csv')

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_DSL,InternetService_Fiber optic,Contract_Month-to-month,Contract_One year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check
487,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.7,0.713495,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
488,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.341294,0.15909,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
489,0.0,0.0,0.0,1.0,0.013889,1.0,1.0,0.0,0.0,0.0,...,0.558209,0.00641,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
490,0.0,0.0,1.0,1.0,0.875,1.0,1.0,0.0,1.0,1.0,...,0.861194,0.75911,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
491,1.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.0,1.0,0.0,...,0.405473,0.011003,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
492,1.0,0.0,1.0,0.0,0.027778,1.0,0.0,1.0,0.0,0.0,...,0.558706,0.013916,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
493,0.0,1.0,1.0,0.0,0.847222,1.0,1.0,0.0,1.0,0.0,...,0.455721,0.448165,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [125]:
# Report the (rows, columns) dimensions of the processed frame.
print(telco_data.shape)


(7043, 24)
