In [1]:
# imports
import math
import random

from pandas import DataFrame
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from IPython.core.display import display
import json

In [2]:
# set random seed
# Seed Python's `random` module and NumPy's global RNG before anything else runs.
# The NumPy seed is the one that matters for the code in this notebook:
# `train` draws its initial weights with np.random.rand and `resample` draws
# row indices with np.random.choice, both from the global NumPy generator.
random.seed(a=2)
np.random.seed(5)

In [3]:
# information gain function
def info_gain(df: DataFrame):
    """Rank the feature columns of ``df`` by mutual information with the label.

    The last column of ``df`` is taken as the class label and every other
    column as a feature.  ``df`` itself is left untouched (a copy is split).

    Returns:
        list: feature names ordered from the highest to the lowest
        ``mutual_info_classif`` score.
    """
    feature_names = list(df.columns)
    label_name = feature_names.pop()

    # Split a copy into features / label so the caller's frame keeps its label column.
    features = df.copy()
    labels = features.pop(label_name)

    scores = mutual_info_classif(features, labels)
    score_by_feature = dict(zip(feature_names, scores))

    ranked = sorted(score_by_feature.items(), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked]

In [4]:
# One-Hot encoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one categorical column with its one-hot indicator columns.

    The indicator columns come from ``pd.get_dummies``; the last of them is
    discarded, so a feature with ``n`` categories contributes ``n - 1``
    columns.  The indicators are appended on the right and the original
    column is removed.  A new DataFrame is returned; the input is not changed.
    """
    indicators = pd.get_dummies(original_dataframe[[feature_to_encode]]).iloc[:, :-1]
    widened = pd.concat([original_dataframe, indicators], axis=1)
    return widened.drop([feature_to_encode], axis=1)

In [5]:
# pre-processor 1: Telco customer churn dataset
def read_telco_data(file_name='WA_Fn-UseC_-Telco-Customer-Churn.csv'):
    """Read the Telco churn CSV, turning its binary text columns into 0/1 ints.

    Args:
        file_name: path of the CSV to read.  Defaults to the original
            hard-coded file so existing zero-argument calls keep working.

    Returns:
        DataFrame in which ``gender`` is 1 for ``'Female'`` and ``Partner``,
        ``Dependents``, ``PhoneService``, ``PaperlessBilling`` and ``Churn``
        are 1 for ``'Yes'``; any other text in those columns maps to 0.
        All remaining columns are parsed by ``pd.read_csv`` defaults.
    """
    def flag_for(positive_text):
        # Factory (rather than a lambda in a loop) so each converter keeps its own target text.
        return lambda raw: int(raw == positive_text)

    converters = {'gender': flag_for('Female')}
    for column in ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'):
        converters[column] = flag_for('Yes')

    return pd.read_csv(file_name, converters=converters)

def process_telco_data(telco_data):
    """Clean, one-hot encode and min-max scale the raw Telco churn frame.

    Steps:
      1. drop the ``customerID`` column;
      2. make ``tenure``, ``MonthlyCharges`` and ``TotalCharges`` numeric,
         filling ``TotalCharges`` entries that cannot be parsed (the raw file
         contains blank ``' '`` strings) with the median of the valid ones;
      3. one-hot encode the multi-valued categorical columns;
      4. move ``Churn`` to the last column;
      5. rescale every column with ``MinMaxScaler``.

    The input frame is not modified; a new all-float DataFrame is returned.
    """
    # Non-inplace drop so the caller's DataFrame keeps its customerID column.
    telco_data = telco_data.drop('customerID', axis=1)
    telco_data = telco_data.astype({'tenure': int, 'MonthlyCharges': float})

    # Blank TotalCharges values become NaN, then take the median of the real
    # numbers.  Assigning the column back avoids the chained in-place replace,
    # which does not reliably write through to the frame.
    total_charges = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
    telco_data['TotalCharges'] = total_charges.fillna(total_charges.median())

    columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                         'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                         'StreamingMovies', 'Contract', 'PaymentMethod']
    for column in columns_to_encode:
        telco_data = encode_and_bind(telco_data, column)

    # Move the label to the last position: downstream cells slice it off with [:, -1].
    telco_data['Churn'] = telco_data.pop('Churn')

    # Build a fresh frame from the scaled matrix so every column ends up float,
    # whatever dtype the indicator columns had.
    scaled = MinMaxScaler().fit_transform(telco_data)
    return pd.DataFrame(scaled, columns=telco_data.columns, index=telco_data.index)

def preprocess_telco_data():
    """Read and process the Telco churn data, save it to ``telco.csv``, and return it."""
    processed = process_telco_data(read_telco_data())
    # Side effect: the processed frame (index included) is written next to the notebook.
    processed.to_csv('telco.csv')
    return processed

In [6]:
# pre-processor 2: Adult (census income) dataset
def read_adult_data(file_name):
    """Read a header-less Adult CSV into a frame with columns ``C0`` .. ``C14``.

    Args:
        file_name: path of the comma-separated file to read.

    Returns:
        DataFrame where ``C9`` is 1.0 for ``'Male'`` (else 0.0) and ``C14`` is
        1.0 for the ``'>50K'`` label (else 0.0).  A trailing period on the
        label and surrounding whitespace are ignored, so ``'>50K.'`` also maps
        to 1.0.  The other columns are parsed by ``pd.read_csv`` defaults.
    """
    def is_male(raw):
        return float(raw.strip() == 'Male')

    def earns_over_50k(raw):
        # NOTE(review): the UCI test split writes labels as '>50K.' / '<=50K.';
        # an exact match against '>50K' would label every such row 0.0.
        return float(raw.strip().rstrip('.') == '>50K')

    column_names = ['C{}'.format(i) for i in range(15)]
    return pd.read_csv(file_name,
                       names=column_names,
                       header=None,
                       # Regex separator: a comma with optional spaces before it
                       # and at least one space after it (needs the python engine).
                       sep=' *, * ',
                       engine="python",
                       converters={'C9': is_male, 'C14': earns_over_50k})

def process_adult_data(adult_data):
    """Impute, one-hot encode and min-max scale an Adult data frame.

    Steps:
      1. in every column containing the ``'?'`` placeholder, replace it with
         the most frequent *real* value of that column;
      2. one-hot encode the categorical columns ``C1, C3, C5, C6, C7, C8, C13``;
      3. move the label column ``C14`` to the last position;
      4. rescale every column with ``MinMaxScaler``.

    The input frame is not modified; a new all-float DataFrame is returned.
    """
    # Copy first: the imputation below assigns columns and must not leak into the caller's frame.
    adult_data = adult_data.copy()

    for column in list(adult_data.columns):
        values = adult_data[column]
        is_missing = values == '?'
        if not is_missing.any():
            continue
        # Take the mode over real values only, and pick one scalar from it:
        # mode() returns a Series and may hold several tied values.
        candidates = values[~is_missing].mode()
        if candidates.empty:
            # Column consists solely of '?'; there is nothing to impute from.
            continue
        adult_data[column] = values.replace('?', candidates.iloc[0])

    columns_to_encode = ['C1', 'C3', 'C5', 'C6', 'C7', 'C8', 'C13']
    for column in columns_to_encode:
        adult_data = encode_and_bind(adult_data, column)

    # Encoding appended indicator columns after the label; put the label last again.
    adult_data['C14'] = adult_data.pop('C14')

    # Build a fresh frame from the scaled matrix so every column ends up float.
    scaled = MinMaxScaler().fit_transform(adult_data)
    return pd.DataFrame(scaled, columns=adult_data.columns, index=adult_data.index)

def preprocess_adult_data():
    """Load, process and save the Adult training and test sets.

    ``adult.csv`` and ``adult.test.csv`` are stacked and processed as a single
    frame, so both parts receive the same encoded columns and the same
    scaling, and are then split back at the original row boundary.

    Side effects: writes ``adult-data.csv`` and ``adult-test.csv``.

    Returns:
        tuple: ``(training_frame, test_frame)``.
    """
    train_raw = read_adult_data('adult.csv')
    test_raw = read_adult_data('adult.test.csv')
    train_rows = train_raw.shape[0]

    combined = process_adult_data(pd.concat([train_raw, test_raw]))

    # The first `train_rows` rows are the training part, the rest the test part.
    adult_data = combined.iloc[:train_rows, :]
    adult_test = combined.iloc[train_rows:, :]

    adult_data.to_csv('adult-data.csv')
    adult_test.to_csv('adult-test.csv')

    return adult_data, adult_test

In [7]:
# pre-processor 3: credit card fraud dataset
def read_cc_data(file_name='creditcard.csv'):
    """Read the credit-card CSV into a DataFrame.

    Args:
        file_name: path of the CSV to read.  Defaults to the original
            hard-coded ``'creditcard.csv'`` so zero-argument calls still work.
    """
    return pd.read_csv(file_name)

def process_cc_data(cc_data: DataFrame, negative_sample_size=20000, random_state=15):
    """Rebalance and min-max scale the credit-card frame.

    Every row with ``Class == 1`` is kept; the ``Class == 0`` rows are
    subsampled without replacement.  The positive rows come first in the
    result, followed by the sampled negatives (the rows are not shuffled).
    Every column, ``Class`` included, is then rescaled with ``MinMaxScaler``.

    Args:
        cc_data: frame with a ``Class`` column holding 0/1 labels.
        negative_sample_size: how many ``Class == 0`` rows to keep.  Capped at
            the number available, so small inputs no longer raise.
        random_state: seed passed to ``DataFrame.sample`` for repeatability.

    Returns:
        A new all-float DataFrame; ``cc_data`` is not modified.
    """
    positive_data = cc_data.loc[cc_data['Class'] == 1]
    negative_data = cc_data.loc[cc_data['Class'] == 0]

    # sample(replace=False) raises when asked for more rows than exist.
    keep = min(negative_sample_size, len(negative_data))
    negative_sub_data = negative_data.sample(n=keep, replace=False, random_state=random_state)

    balanced = pd.concat([positive_data, negative_sub_data])

    scaled = MinMaxScaler().fit_transform(balanced)
    return pd.DataFrame(scaled, columns=balanced.columns, index=balanced.index)

def preprocess_cc_data():
    cc_data = read_cc_data()
    cc_data = process_cc_data(cc_data)
    cc_data.to_csv('cc.csv')
    return cc_data

In [8]:
# loss function
def loss(y_predicted, y_actual, size):
    """Return half the sum of squared errors divided by ``size``."""
    residual = y_actual - y_predicted
    half_squared_error = 0.5 * np.sum(residual ** 2)
    return half_squared_error / size

In [9]:
# accuracy function
def accuracy(y_predicted, y_actual):
    """Return the ``accuracy_score`` of the predicted labels against the actual labels.

    Note the argument order: predictions first, ground truth second (the
    reverse of ``accuracy_score``).
    """
    score = accuracy_score(y_true=y_actual, y_pred=y_predicted)
    return score

In [10]:
# prediction function for determining label of hypothesis
def predict(hypothesis):
    """Turn raw hypothesis values into a column vector of class labels.

    Each entry greater than 0.0 becomes 1.0; everything else (zero included)
    becomes -1.0.  The result always has shape ``(len(hypothesis), 1)``.
    """
    signs = np.fromiter(
        (1.0 if score > 0.0 else -1.0 for score in hypothesis),
        dtype=float,
    )
    return signs.reshape((signs.shape[0], 1))

In [11]:
# logistic regression (tanh activation, labels in {-1, +1}, gradient descent on squared error)
def train(x, y, early_terminate_threshold=0.0, learning_rate=0.0001, no_of_iterations=10000):
    """Fit the weights of a single tanh unit by batch gradient descent.

    Args:
        x: 2-D array of samples (one row per sample).
        y: target values; combined element-wise with ``tanh(x @ w)``.
        early_terminate_threshold: training stops as soon as ``loss`` falls
            below this value (checked before each update).
        learning_rate: step size applied to every update.
        no_of_iterations: maximum number of update steps.

    Returns:
        The learned weights, shape ``(x.shape[1], 1)``.  They start from
        uniform random values drawn from NumPy's global RNG.
    """
    sample_count, feature_count = x.shape
    weights = np.random.rand(feature_count, 1)

    step = 0
    while step < no_of_iterations:
        activation = np.tanh(x @ weights)

        # Stop before updating once the fit is already good enough.
        if loss(activation, y, sample_count) < early_terminate_threshold:
            break

        # (y - h) is the residual; (1 - h^2) is the derivative of tanh.
        residual = y - activation
        tanh_slope = 1 - activation ** 2
        weights += learning_rate * (x.T @ (residual * tanh_slope))
        step += 1

    return weights

In [12]:
# resample function for adaboost
def resample(x, y, w):
    """Draw a bootstrap sample of the rows of ``x`` and ``y``.

    As many rows as ``x`` has are drawn with replacement, row ``i`` being
    picked with probability ``w[i]``.  The same indices are applied to both
    arrays, so samples and labels stay paired.  Uses NumPy's global RNG.
    """
    row_count = x.shape[0]
    picks = np.random.choice(row_count, size=row_count, replace=True, p=w)
    return x[picks], y[picks]

In [13]:
# Adaboost
def adaboost(example_x, example_y, k):
    """Run up to ``k`` boosting rounds with ``train`` as the weak learner.

    Each round draws a weighted bootstrap sample, fits a learner on it, and
    measures that learner's weighted error on the full data.  Rounds whose
    error exceeds 0.5 are discarded.  Otherwise the weights of correctly
    classified rows are scaled by ``error / (1 - error)``, the weights are
    renormalised, and the learner is kept with vote weight
    ``log((1 - error) / error)``.

    Args:
        example_x: 2-D array of samples (one row per sample).
        example_y: labels, one per row of ``example_x``; compared element-wise
            with the output of ``predict``.
        k: number of boosting rounds to attempt.

    Returns:
        tuple: ``(h, z)`` with the kept weight vectors and their vote weights.
        Both lists can be shorter than ``k``, or empty.
    """
    no_of_data = example_x.shape[0]
    # Flatten the labels once so they line up with the flattened predictions
    # whether they arrive as (n,) or (n, 1).
    actual = np.asarray(example_y).reshape(-1)
    w = np.full(no_of_data, 1 / no_of_data)
    h = []
    z = []
    # Floor for the weighted error: a learner that classifies every row
    # correctly would otherwise zero all weights (NaN after normalising) and
    # divide by zero in the vote weight.
    min_error = np.finfo(float).eps

    for _ in range(k):
        x_data, y_data = resample(example_x, example_y, w)
        w_learn = train(x_data, y_data, early_terminate_threshold=0.5)
        predicted = predict(np.tanh(np.dot(example_x, w_learn))).reshape(-1)

        is_correct = predicted == actual
        error = w[~is_correct].sum()
        if error > 0.5:
            # Worse than chance on the current weighting: drop this learner.
            continue
        error = max(error, min_error)

        # Shrink the weight of rows this learner already gets right, so the
        # next round concentrates on the ones it got wrong.
        w[is_correct] *= error / (1 - error)
        w /= np.sum(w)

        h.append(w_learn)
        z.append(np.log((1 - error) / error))

    return h, z


In [14]:
def logistic_regression_test(training_x, training_y, test_x, test_y, threshold, learning_rate=0.0001):
    """Train one model on the training split and print its accuracy on the test split.

    ``threshold`` is forwarded to ``train`` as its early-termination loss
    threshold.  Nothing is returned; the accuracy is only printed.
    """
    weights = train(
        training_x,
        training_y,
        early_terminate_threshold=threshold,
        learning_rate=learning_rate,
    )
    predicted_labels = predict(np.tanh(np.dot(test_x, weights)))
    score = accuracy(predicted_labels, test_y)
    print('Logistic regression accuracy {}.'.format(score))

In [15]:
def adaboost_test(training_x, training_y, test_x, test_y, k):
    """Boost ``k`` rounds on the training split and print the ensemble's test accuracy.

    The ensemble score for each test row is the sum, over the kept learners,
    of the learner's vote weight times its tanh output on that row; the sign
    of that sum (via ``predict``) is the predicted label.  Nothing is returned.
    """
    learners, vote_weights = adaboost(training_x, training_y, k)

    combined = np.zeros(test_y.shape)
    for learner_weights, vote in zip(learners, vote_weights):
        combined += vote * np.tanh(np.dot(test_x, learner_weights))

    predicted_labels = predict(combined)
    print('Adaboost accuracy for k = {} is {}.'.format(k, accuracy(predicted_labels, test_y)))

In [16]:
# Read, clean, encode and scale the Telco churn data (also writes telco.csv as a side effect).
data = preprocess_telco_data()

In [17]:
# Inspect the processed Telco frame; Churn is the last column.
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingTV_No,StreamingTV_No internet service,StreamingMovies_No,StreamingMovies_No internet service,Contract_Month-to-month,Contract_One year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,Churn
0,1.0,0.0,1.0,0.0,0.013889,0.0,1.0,0.115423,0.001275,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.472222,1.0,0.0,0.385075,0.215867,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.354229,0.010310,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.625000,0.0,0.0,0.239303,0.210241,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.521891,0.015330,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.0,1.0,1.0,0.333333,1.0,1.0,0.662189,0.227521,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7039,1.0,0.0,1.0,1.0,1.000000,1.0,1.0,0.845274,0.847461,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,1.0,0.0,1.0,1.0,0.152778,0.0,1.0,0.112935,0.037809,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7041,0.0,1.0,1.0,0.0,0.055556,1.0,1.0,0.558706,0.033210,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# Churn data full
# Build the arrays without touching `data`: inserting the bias column into the
# frame and rebinding `data` to an ndarray made this cell fail on a second run.
telco_matrix = data.to_numpy(dtype=float)

# Features are every column except the last, with a bias column of ones prepended.
data_x = np.hstack([np.ones((telco_matrix.shape[0], 1)), telco_matrix[:, :-1]])

# The last column is the label; map it to {-1.0, +1.0} as a column vector.
data_y = np.where(telco_matrix[:, -1] > 0, 1.0, -1.0).reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=10)

In [19]:
# data = preprocess_cc_data()
# final_column = data.columns[-1]
# train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=10)
#
# columns = info_gain(train_dataset)
# feature_cutoff = 70
# columns_to_use = columns[0:feature_cutoff]
# columns_to_use.append(final_column)
# reduced_training, reduced_test = train_dataset[columns_to_use], test_dataset[columns_to_use]
#
# reduced_training.insert(0, 'Ones', 1.0)
# reduced_training = reduced_training.to_numpy()
#
# reduced_test.insert(0, 'Ones', 1.0)
# reduced_test = reduced_test.to_numpy()
#
# x_train = reduced_training[:, :-1]
# y_train = reduced_training[:, -1]
# y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# y_train = y_train.reshape((y_train.shape[0], 1))
#
# x_test = reduced_test[:, :-1]
# y_test = reduced_test[:, -1]
# y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# y_test = y_test.reshape((y_test.shape[0], 1))

In [20]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

Logistic regression accuracy 0.6763662171753017.


In [21]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)

Adaboost accuracy for k = 5 is 0.7750177430801988.
Adaboost accuracy for k = 10 is 0.7892122072391767.
Adaboost accuracy for k = 15 is 0.7863733144073811.



KeyboardInterrupt



In [None]:
# Read and process both Adult files (also writes adult-data.csv and adult-test.csv).
training_set, test_set = preprocess_adult_data()

In [None]:
# Inspect the processed Adult training frame; the label C14 is the last column.
training_set

In [None]:
# Inspect the processed Adult test frame; the label C14 is the last column.
test_set

In [None]:
# training_set.insert(0, 'Ones', 1.0)
# training_set = training_set.to_numpy()
# #
# test_set.insert(0, 'Ones', 1.0)
# test_set = test_set.to_numpy()
#
# x_train = training_set[:, :-1]
# y_train = training_set[:, -1]
# #
# x_test = test_set[:, :-1]
# y_test = test_set[:, -1]
# #
# y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# y_train = y_train.reshape((y_train.shape[0], 1))
# #
# y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# y_test = y_test.reshape((y_test.shape[0], 1))

In [None]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

In [None]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)

In [None]:
# Read, subsample and scale the credit-card data (also writes cc.csv).
# This rebinds `data`, replacing whatever the Telco cells left in it.
data = preprocess_cc_data()

In [None]:
# Inspect the processed credit-card frame.
data

In [None]:
# Credit card data full
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0.0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=10)

In [None]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5, learning_rate=0.0001)

In [None]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)

In [None]:
# data = preprocess_telco_data()
# # data = preprocess_cc_data()
# # final_column = data.columns[-1]
# # train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=10)
# #
# # columns = info_gain(train_dataset)
# # feature_cutoff = 70
# # columns_to_use = columns[0:feature_cutoff]
# # columns_to_use.append(final_column)
# # reduced_training, reduced_test = train_dataset[columns_to_use], test_dataset[columns_to_use]
# #
# # reduced_training.insert(0, 'Ones', 1.0)
# # reduced_training = reduced_training.to_numpy()
# #
# # reduced_test.insert(0, 'Ones', 1.0)
# # reduced_test = reduced_test.to_numpy()
# #
# # x_train = reduced_training[:, :-1]
# # y_train = reduced_training[:, -1]
# # y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# # y_train = y_train.reshape((y_train.shape[0], 1))
# #
# # x_test = reduced_test[:, :-1]
# # y_test = reduced_test[:, -1]
# # y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# # y_test = y_test.reshape((y_test.shape[0], 1))
#
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
# print(data.shape)
# # rows, columns = data.shape
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=10)
# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

In [None]:
# preprocess_adult_data()

In [None]:
# training_set, test_set = preprocess_adult_data()

In [None]:
# training_set

In [None]:
# test_set

In [None]:
# training_set.insert(0, 'Ones', 1.0)
# training_set = training_set.to_numpy()
#
# test_set.insert(0, 'Ones', 1.0)
# test_set = test_set.to_numpy()
# # # # rows, columns = training_set.shape
# # #
# x_train = training_set[:, :-1]
# y_train = training_set[:, -1]
#
# x_test = test_set[:, :-1]
# y_test = test_set[:, -1]
#
# y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# y_train = y_train.reshape((y_train.shape[0], 1))
#
# y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# y_test = y_test.reshape((y_test.shape[0], 1))


In [None]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

In [None]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)