In [29]:
# imports
import math

from pandas import DataFrame
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from IPython.core.display import display
import json

In [30]:
# information gain function
def info_gain(df: DataFrame):
    """Print the mutual information between every feature and the label.

    The last column of ``df`` is used as the class label and all columns
    before it as features. One score per feature is obtained from
    ``mutual_info_classif``; the resulting ``{feature: score}`` mapping is
    printed as JSON with a four-space indent.

    ``df`` is not modified and nothing is returned.
    """
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    scores = mutual_info_classif(features, labels)

    # Pair each feature name with its score, in column order.
    info_gain_map = dict(zip(list(features.columns), scores))

    print(json.dumps(info_gain_map, indent=4))

In [31]:
# One-Hot encoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one categorical column with its one-hot indicator columns.

    Indicators are built with ``pd.get_dummies``; the last indicator column
    is discarded. The remaining indicators are appended to the right of the
    frame and the source column is removed.

    Returns a new DataFrame; ``original_dataframe`` is left untouched.
    """
    indicators = pd.get_dummies(original_dataframe[[feature_to_encode]]).iloc[:, :-1]
    remaining = original_dataframe.drop([feature_to_encode], axis=1)
    return pd.concat([remaining, indicators], axis=1)


In [32]:
# pre-processor 1
def read_data(csv_path='WA_Fn-UseC_-Telco-Customer-Churn.csv'):
    """Load the Telco churn CSV with its two-valued text columns encoded as 0/1.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the file name the notebook
        originally hard-coded, so ``read_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file. ``gender`` is 1 for ``'Female'`` and 0 otherwise;
        ``Partner``, ``Dependents``, ``PhoneService``, ``PaperlessBilling``
        and ``Churn`` are 1 for ``'Yes'`` and 0 otherwise. All other columns
        are left to pandas' default parsing.
    """
    yes_no_columns = ['Partner', 'Dependents', 'PhoneService',
                      'PaperlessBilling', 'Churn']
    converters = {column: (lambda x: int(x == 'Yes')) for column in yes_no_columns}
    converters['gender'] = lambda x: int(x == 'Female')

    return pd.read_csv(csv_path, converters=converters)

def process_data(telco_data):
    """Clean, one-hot encode and min-max scale the raw Telco frame.

    Steps, in order:

    1. Drop ``customerID`` (on a new frame; the caller's frame is not
       modified).
    2. Cast ``tenure`` to int and ``MonthlyCharges`` to float.
    3. Parse ``TotalCharges`` as numbers; entries that cannot be parsed
       (the file marks missing values with a blank string) are replaced by
       the median of the parseable entries.
    4. One-hot encode the multi-valued categorical columns with
       ``encode_and_bind``.
    5. Move ``Churn`` to the last column.
    6. Scale every column, ``Churn`` included, with ``MinMaxScaler``.

    Parameters
    ----------
    telco_data : pandas.DataFrame
        Frame as produced by ``read_data``.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame whose last column is ``Churn``.
    """
    data = telco_data.drop(columns='customerID')
    data = data.astype({
        'tenure': int,
        "MonthlyCharges": float,
    }, errors="ignore")

    # Blank entries cannot be cast with astype(float). Coerce them to NaN so
    # the median is computed on real numbers, then fill the gaps with it.
    total_charges = pd.to_numeric(data['TotalCharges'], errors='coerce')
    data['TotalCharges'] = total_charges.fillna(total_charges.median())

    columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                         'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                         'StreamingMovies', 'Contract', 'PaymentMethod']
    for column in columns_to_encode:
        data = encode_and_bind(data, column)

    # Move the label to the final column for better visualization:
    # pop removes it, the assignment re-appends it on the right.
    data['Churn'] = data.pop('Churn')

    scaled = MinMaxScaler().fit_transform(data)
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)

def preprocess():
    """Run the full preparation pipeline and persist its result.

    Reads the raw CSV with ``read_data``, transforms it with
    ``process_data``, writes the transformed frame to ``telco.csv`` in the
    working directory and returns that frame.
    """
    processed = process_data(read_data())
    processed.to_csv('telco.csv')
    return processed


In [33]:
# loss function
def loss(y_predicted, y_actual, size):
    """Return half the summed squared residual, divided by ``size``.

    Computes ``0.5 * sum((y_actual - y_predicted) ** 2) / size``.
    """
    residual = y_actual - y_predicted
    squared_total = np.sum(residual ** 2)
    return 0.5 * squared_total / size

In [34]:
# accuracy function
def accuracy(y_predicted, y_actual):
    """Return the fraction of entries where prediction and truth are equal.

    The number of element-wise matches is divided by the length of
    ``y_actual`` along its first axis.
    """
    matches = y_actual == y_predicted
    total = y_actual.shape[0]
    return np.sum(matches) / total

In [35]:
# prediction function for determining label of hypothesis
def predict(hypothesis):
    """Turn raw hypothesis values into -1/+1 labels.

    Each entry that is strictly greater than zero maps to ``1.0``; every
    other entry maps to ``-1.0``. The labels are returned as a column vector
    with one row per entry of ``hypothesis``.
    """
    signs = []
    for value in hypothesis:
        if value > 0:
            signs.append(1.0)
        else:
            signs.append(-1.0)

    return np.array(signs).reshape((len(signs), 1))

In [36]:
# logistic regression (tanh activation, squared-error loss, labels in {-1, +1})
def train(x, y, early_terminate_threshold=0.0, learning_rate=0.0001, no_of_iterations=5000):
    """Fit the weights of a single tanh unit by iterative updates.

    Weights start from ``np.random.rand`` and are updated for at most
    ``no_of_iterations`` rounds. In each round the hypothesis
    ``h = tanh(x . w)`` is evaluated and its ``loss`` against ``y`` computed;
    if that loss is below ``early_terminate_threshold`` the loop stops before
    updating. Otherwise ``w`` is moved by
    ``learning_rate * x.T . ((y - h) * (1 - h ** 2))``.

    Parameters
    ----------
    x : array of shape (samples, features)
    y : array of targets, broadcast against ``tanh(x . w)`` (a column vector)
    early_terminate_threshold : float
        Loss value below which training stops.
    learning_rate : float
        Step-size multiplier applied to each update.
    no_of_iterations : int
        Maximum number of update rounds.

    Returns
    -------
    numpy.ndarray of shape (features, 1)
        The learned weights.
    """
    sample_count, feature_count = x.shape
    w = np.random.rand(feature_count, 1)

    for _ in range(no_of_iterations):
        h = np.tanh(np.dot(x, w))
        if loss(h, y, sample_count) < early_terminate_threshold:
            break

        residual = y - h
        tanh_slope = 1 - h ** 2  # derivative of tanh, expressed through h
        w += learning_rate * np.dot(x.T, residual * tanh_slope)

    return w

In [37]:
# resample function for adaboost
def resample(x, y, w):
    """Draw a weighted bootstrap sample of the rows of ``x`` and ``y``.

    As many row indices as ``x`` has rows are drawn with replacement, each
    index picked with the probability given at the same position in ``w``.
    The same indices are applied to both arrays, so rows stay paired.

    Returns the tuple ``(x_sample, y_sample)``.
    """
    row_count = x.shape[0]
    picks = np.random.choice(row_count, row_count, replace=True, p=w)
    return x[picks], y[picks]

In [38]:
# Adaboost
def adaboost(example_x, example_y, k):
    """Boost up to ``k`` tanh learners on weighted resamples of the data.

    Every round draws a sample with ``resample`` using the current weights,
    fits a learner on it with ``train`` and labels the full data set with
    ``predict``. The round's error is the total weight of the mislabelled
    examples. A round whose error exceeds 0.5 is discarded. Otherwise the
    weights of correctly labelled examples are multiplied by
    ``error / (1 - error)``, all weights are renormalised, and the learner is
    kept with vote weight ``log2((1 - error) / error)``.

    Parameters
    ----------
    example_x : array of shape (samples, features)
    example_y : array with one label per sample
    k : int
        Number of boosting rounds to attempt.

    Returns
    -------
    (h, z) : tuple of lists
        ``h`` holds the weight vectors of the kept learners and ``z`` their
        vote weights, in matching order. Both may be shorter than ``k``.
    """
    # Floor for the weighted error. A learner that labels every example
    # correctly has error 0, which would turn the weight update into 0/0 and
    # the vote weight into a division by zero.
    min_error = 1e-10

    no_of_data = example_x.shape[0]
    w = np.full(no_of_data, 1 / no_of_data)
    actual = np.asarray(example_y).ravel()
    h = []
    z = []
    for _ in range(k):
        x_data, y_data = resample(example_x, example_y, w)
        w_learn = train(x_data, y_data, early_terminate_threshold=0.5)
        h_k = predict(np.tanh(np.dot(example_x, w_learn))).ravel()

        correct = h_k == actual
        error = float(np.sum(w[~correct]))

        if error > 0.5:
            continue

        error = max(error, min_error)
        w[correct] *= error / (1 - error)
        w /= np.sum(w)

        h.append(w_learn)
        z.append(math.log((1 - error) / error, 2))

    return h, z


In [45]:
# Build the design matrix: preprocessed columns with a leading column of
# ones (bias term), converted to a plain NumPy array.
data = preprocess()
data.insert(0, 'Ones', 1.0)
data = data.to_numpy()
rows, columns = data.shape

# The last column is the churn label; everything before it is a feature.
data_x, data_y = data[:, :-1], data[:, -1]

# Map the label onto -1/+1 (positive values -> +1) as a column vector.
data_y = np.where(data_y > 0, 1.0, -1.0).reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=10
)

In [46]:
# logistic regression test
# Fit a single learner on the training split and score it on the test split.
w_logi = train(x_train, y_train, early_terminate_threshold=0.5)
h_logi = predict(np.tanh(np.dot(x_test, w_logi)))
logi_accuracy = accuracy(h_logi, y_test)
print('Logistic regression accuracy {}.'.format(logi_accuracy))

Logistic regression accuracy 0.7012065294535131.


In [47]:
# adaboost test
# Train the ensemble: up to 10 boosting rounds on the training split.
h_ada, z_ada = adaboost(x_train, y_train, 10)

# Combine the learners on the test split: each learner's tanh output is
# added with its vote weight.
# NOTE(review): the vote uses the raw tanh outputs, not the -1/+1 labels
# from predict(); a strict weighted-majority vote would add
# `z_k * predict(...)` instead. Behaviour is left as it was - confirm intent.
hypo = np.zeros(y_test.shape)
for w_k, z_k in zip(h_ada, z_ada):
    hypo += z_k * np.tanh(np.dot(x_test, w_k))

# Normalise by the total vote weight. When no learner was kept (or all
# votes are zero) the total is 0; skip the division instead of producing NaN.
total_vote = sum(z_ada)
if total_vote > 0:
    hypo /= total_vote

h_out = predict(hypo)
print('AdaBoost accuracy {}.'.format(accuracy(h_out, y_test)))


Logistic regression accuracy 0.7963094393186657.
