In [43]:
# imports
import math
import random

from pandas import DataFrame
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from IPython.core.display import display
import json

In [44]:
# set random seed
# Seed Python's `random` module and NumPy's legacy global generator so the
# np.random.rand / np.random.choice draws made later in this notebook
# (weight initialisation in train(), bootstrap sampling in resample())
# are repeatable from a fresh kernel.
random.seed(a=2)
np.random.seed(5)

In [45]:
# information gain function
def info_gain(df: DataFrame):
    """Rank the feature columns of ``df`` by mutual information with the label.

    The last column of ``df`` is taken as the class label and every other
    column as a feature. ``df`` itself is not modified (a copy is split).

    Returns:
        list: feature names ordered from the highest to the lowest score
        returned by ``mutual_info_classif``.
    """
    feature_names = list(df.columns)
    label_name = feature_names.pop()

    features = df.copy()
    labels = features.pop(label_name)

    scores = dict(zip(feature_names, mutual_info_classif(features, labels)))

    # sorted() is stable, so equally scored features keep their column order.
    return sorted(scores, key=scores.get, reverse=True)

In [46]:
# One-Hot encoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode one or more columns and return the widened frame.

    Args:
        original_dataframe: frame holding the columns to encode; not modified.
        feature_to_encode: a single column name or a list-like of column
            names. ``pd.get_dummies`` only accepts a list-like for
            ``columns``, so a lone name is wrapped in a list here.

    Returns:
        DataFrame: the untouched columns followed by the indicator columns.
        The first level of each encoded column is dropped (``drop_first=True``).
    """
    if feature_to_encode is not None and not pd.api.types.is_list_like(feature_to_encode):
        feature_to_encode = [feature_to_encode]

    return pd.get_dummies(
        original_dataframe, columns=feature_to_encode, drop_first=True
    )

In [47]:
# pre-processor 1
def read_telco_data(file_name='WA_Fn-UseC_-Telco-Customer-Churn.csv'):
    """Read the telco churn CSV, turning its binary text columns into 0/1 ints.

    Args:
        file_name: path of the CSV to read. Defaults to the file name this
            notebook has always used, so existing calls are unaffected.

    Returns:
        DataFrame: the raw table in which ``gender`` is 1 for 'Female' and
        ``Partner``, ``Dependents``, ``PhoneService``, ``PaperlessBilling``
        and ``Churn`` are 1 for 'Yes'; any other text in those columns maps to 0.
    """
    def is_yes(value):
        return int(value == 'Yes')

    yes_no_columns = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
    converters = {'gender': lambda value: int(value == 'Female')}
    converters.update({column: is_yes for column in yes_no_columns})

    return pd.read_csv(file_name, converters=converters)

def process_telco_data(telco_data):
    """Clean, one-hot encode and min-max scale the raw telco churn frame.

    Steps:
      1. drop the ``customerID`` column;
      2. convert ``TotalCharges`` to numbers; entries that cannot be parsed
         (such as the ' ' placeholder) are filled with the median of the
         parsable values;
      3. one-hot encode the multi-valued categorical columns;
      4. move ``Churn`` to the last position;
      5. scale every column with ``MinMaxScaler``.

    Args:
        telco_data: frame as produced by ``read_telco_data``. It is not
            modified; all work happens on new frames.

    Returns:
        DataFrame: a new all-numeric frame whose last column is ``Churn``.
    """
    telco_data = telco_data.drop(columns='customerID')
    telco_data = telco_data.astype({
        'tenure': int,
        "MonthlyCharges": float,
    }, errors="ignore")

    # astype(float) cannot parse the blank entries, which previously left the
    # whole column as strings. Coerce instead, then impute the resulting NaNs.
    total_charges = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
    telco_data['TotalCharges'] = total_charges.fillna(total_charges.median())

    columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                         'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                         'StreamingMovies', 'Contract', 'PaymentMethod']
    telco_data = encode_and_bind(telco_data, columns_to_encode)

    # Put the label last: downstream cells treat the final column as the target.
    ordered_columns = [name for name in telco_data.columns if name != 'Churn'] + ['Churn']
    telco_data = telco_data[ordered_columns]

    scaled_values = MinMaxScaler().fit_transform(telco_data)
    return pd.DataFrame(scaled_values, columns=telco_data.columns, index=telco_data.index)

def preprocess_telco_data():
    """Read and process the telco data, save it to 'telco.csv' and return it."""
    processed = process_telco_data(read_telco_data())
    processed.to_csv('telco.csv')
    return processed

In [48]:
# pre processor 2
def read_adult_data(file_name, pos):
    """Read a header-less adult-census CSV into columns named 'C0' .. 'C14'.

    Args:
        file_name: path of the comma-separated file to read.
        pos: text of the positive class in the last column ('C14'); rows whose
            label equals it get 1.0, all others 0.0.

    Returns:
        DataFrame: 15 columns, with 'C9' converted to 1.0 for 'Male'
        (0.0 otherwise) and 'C14' converted as described above.
    """
    column_names = ['C' + str(i) for i in range(15)]
    return pd.read_csv(
        file_name,
        names=column_names,
        header=None,
        # A comma with optional spaces on either side. The earlier pattern
        # ' *, * ' demanded a space after every comma, so compact "a,b" rows
        # were never split.
        sep=r' *, *',
        engine="python",
        converters={
            'C9': lambda x: float(x == 'Male'),
            'C14': lambda x: float(x == pos),
        },
    )

def process_adult_data(adult_data):
    """Impute, one-hot encode and min-max scale the adult census frame.

    Steps:
      1. in every column containing the '?' placeholder, replace it with the
         most frequent value among the other entries (the first one, in
         sorted order, when several values tie);
      2. one-hot encode the categorical columns;
      3. move the label column 'C14' to the last position;
      4. scale every column with ``MinMaxScaler``.

    Args:
        adult_data: frame as produced by ``read_adult_data`` (or several of
            them concatenated). It is not modified; a copy is processed.

    Returns:
        DataFrame: a new all-numeric frame with the same index as the input
        and 'C14' as its last column.
    """
    adult_data = adult_data.copy()

    for column in adult_data.columns:
        # Positional numpy mask: safe even when the index has duplicate labels
        # (e.g. after concatenating train and test frames).
        is_missing = (adult_data[column] == '?').to_numpy()
        if not is_missing.any():
            continue
        observed_modes = adult_data.loc[~is_missing, column].mode()
        if observed_modes.empty:
            # Every entry is '?': there is nothing to impute from.
            continue
        adult_data.loc[is_missing, column] = observed_modes.iloc[0]

    columns_to_encode = ['C1', 'C3', 'C5', 'C6', 'C7', 'C8', 'C13']
    adult_data = encode_and_bind(adult_data, columns_to_encode)

    # Put the label last: downstream cells treat the final column as the target.
    ordered_columns = [name for name in adult_data.columns if name != 'C14'] + ['C14']
    adult_data = adult_data[ordered_columns]

    scaled_values = MinMaxScaler().fit_transform(adult_data)
    return pd.DataFrame(scaled_values, columns=adult_data.columns, index=adult_data.index)

def preprocess_adult_data():
    """Read, jointly process and save the adult train and test sets.

    Both files are concatenated before processing so that they share one set
    of encoded columns and one scaling, then split back at the original row
    boundary. The two parts are written to 'adult-data.csv' and
    'adult-test.csv'.

    Returns:
        tuple: ``(adult_data, adult_test)`` processed frames.
    """
    train_raw = read_adult_data('adult.csv', '>50K')
    test_raw = read_adult_data('adult.test.csv', '>50K.')
    train_rows = train_raw.shape[0]

    combined = process_adult_data(pd.concat([train_raw, test_raw]))

    # The first `train_rows` rows came from the training file, the rest from the test file.
    adult_data = combined.iloc[:train_rows, :]
    adult_test = combined.iloc[train_rows:, :]

    adult_data.to_csv('adult-data.csv')
    adult_test.to_csv('adult-test.csv')

    return adult_data, adult_test

In [49]:
# pre processor 3
def read_cc_data(file_name='creditcard.csv'):
    """Read the credit-card transactions CSV.

    Args:
        file_name: path of the CSV to read. Defaults to 'creditcard.csv',
            the name this notebook has always used.

    Returns:
        DataFrame: the table exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(file_name)

def process_cc_data(cc_data: DataFrame, negative_sample_size=20000, random_state=5):
    """Down-sample the negative class and min-max scale the credit-card frame.

    Every row with ``Class == 1`` is kept; rows with ``Class == 0`` are
    randomly sub-sampled without replacement. The result is re-indexed from 0
    and every column is scaled with ``MinMaxScaler``.

    Args:
        cc_data: frame with a ``Class`` column; not modified.
        negative_sample_size: how many ``Class == 0`` rows to keep. Capped at
            the number available, so a smaller data set no longer raises.
        random_state: seed passed to ``DataFrame.sample``.

    Returns:
        DataFrame: positives followed by the sampled negatives, all columns scaled.
    """
    positive_data = cc_data.loc[cc_data['Class'] == 1]
    negative_data = cc_data.loc[cc_data['Class'] == 0]

    # sample(replace=False) raises if asked for more rows than exist.
    sample_size = min(negative_sample_size, len(negative_data))
    negative_sub_data = negative_data.sample(
        n=sample_size, replace=False, random_state=random_state
    )

    balanced = pd.concat([positive_data, negative_sub_data]).reset_index(drop=True)

    scaled_values = MinMaxScaler().fit_transform(balanced)
    return pd.DataFrame(scaled_values, columns=balanced.columns, index=balanced.index)

def preprocess_cc_data():
    """Read and process the credit-card data, save it to 'cc.csv' and return it."""
    processed = process_cc_data(read_cc_data())
    processed.to_csv('cc.csv')
    return processed

In [50]:
# loss function
def loss(y_predicted, y_actual, size):
    """Return the sum of squared differences between targets and predictions, divided by ``size``."""
    residual = y_actual - y_predicted
    return np.square(residual).sum() / size

In [51]:
def statistics(y_predict, y_actual):
    """Print accuracy and confusion-matrix based metrics for a set of predictions.

    The flattened confusion matrix is unpacked as (tn, fp, fn, tp), so the
    matrix must have exactly four cells.
    """
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_actual, y_predict).ravel()

    print('Accuracy is {}'.format(accuracy(y_predict, y_actual)))

    actual_positives = true_pos + false_neg
    print('Sensitivity is {}'.format(true_pos / actual_positives))

    actual_negatives = true_neg + false_pos
    print('Specificity is {}'.format(true_neg / actual_negatives))

    print('Precision is {}'.format(true_pos / (true_pos + false_pos)))
    print('False discovery rate {}'.format(false_pos / (false_pos + true_pos)))

    f1_denominator = 2 * true_pos + false_pos + false_neg
    print('F1 score {}'.format(2 * true_pos / f1_denominator))

In [52]:
# accuracy function
def accuracy(y_predicted, y_actual):
    """Return ``accuracy_score`` for the predictions.

    The argument order here is (predicted, actual), the reverse of sklearn's
    (y_true, y_pred).
    """
    return accuracy_score(y_true=y_actual, y_pred=y_predicted)

In [53]:
# prediction function for determining label of hypothesis
def predict(hypothesis):
    """Turn real-valued hypothesis outputs into -1.0 / +1.0 labels.

    Values strictly greater than 0.0 become 1.0, everything else becomes -1.0.

    Returns:
        ndarray: column vector with one row per entry along the first axis
        of ``hypothesis``.
    """
    scores = np.asarray(hypothesis)
    labels = np.where(scores > 0.0, 1.0, -1.0)
    return labels.reshape((scores.shape[0], 1))

In [54]:
# logistic regression
def train(x, y, early_terminate_threshold=0.0, learning_rate=0.0001, no_of_iterations=10000):
    """Fit a single tanh unit to -1/+1 style targets by gradient descent on squared error.

    Args:
        x: 2-D array-like of shape (samples, features).
        y: targets, one per sample, as a column vector or a flat 1-D sequence.
            A flat sequence is reshaped to a column; previously it broadcast
            against the (samples, 1) predictions into a (samples, samples) matrix.
        early_terminate_threshold: stop as soon as the mean squared error
            falls below this value.
        learning_rate: step size applied to the averaged gradient.
        no_of_iterations: maximum number of update steps.

    Returns:
        ndarray: weight column vector of shape (features, 1). It is
        initialised from ``np.random.rand``, so results depend on NumPy's
        global random state.

    Raises:
        ValueError: if ``y`` does not contain one target per row of ``x``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    no_of_data, no_of_features = x.shape
    if y.shape[0] != no_of_data:
        raise ValueError(
            'x has {} rows but y has {} targets'.format(no_of_data, y.shape[0])
        )

    w = np.random.rand(no_of_features, 1)
    for _ in range(no_of_iterations):
        h = np.tanh(x @ w)
        residual = y - h
        # Mean squared error of the current predictions (same formula as loss()).
        error = np.sum(residual ** 2) / no_of_data
        if error < early_terminate_threshold:
            break
        # d tanh(z)/dz = 1 - tanh(z)^2; stepping along x.T @ (...) lowers the squared error.
        gradient = x.T @ (residual * (1 - h ** 2))
        w += learning_rate * gradient / no_of_data

    return w

In [55]:
# resample function for adaboost
def resample(x, y, w):
    """Draw a bootstrap sample of the rows of ``x`` and ``y``.

    As many rows as ``x`` has are drawn with replacement, row ``j`` being
    picked with probability ``w[j]``. The same indices are used for both
    arrays, so pairs stay aligned. Uses NumPy's global random state.

    Returns:
        tuple: ``(x_sample, y_sample)``.
    """
    row_count = x.shape[0]
    picked = np.random.choice(row_count, size=row_count, replace=True, p=w)
    return x[picked], y[picked]

In [56]:
# Adaboost
def adaboost(example_x, example_y, k):
    """Run up to ``k`` boosting rounds and return the accepted learners.

    Each round draws a weighted bootstrap sample, trains a learner on it with
    ``train`` and scores that learner on the full example set. A learner whose
    weighted error exceeds 0.5 is discarded; otherwise the weights of the
    correctly classified examples are multiplied by ``error / (1 - error)``
    and the weights are renormalised.

    Args:
        example_x: feature matrix, one row per example.
        example_y: labels, one per example (column vector or flat).
        k: number of boosting rounds to attempt.

    Returns:
        tuple: ``(h, z)`` where ``h`` lists the accepted weight vectors and
        ``z`` the matching ``log((1 - error) / error)`` vote weights. Both
        can be shorter than ``k`` (or empty) when rounds are discarded.
    """
    no_of_data = example_x.shape[0]
    targets = np.asarray(example_y).reshape(-1)
    w = np.full(no_of_data, 1 / no_of_data)
    # Lower bound for the weighted error. A perfect learner (error == 0) used
    # to zero every weight, making the renormalisation NaN and the vote infinite.
    min_error = np.finfo(float).eps

    h = []
    z = []
    for _ in range(k):
        x_data, y_data = resample(example_x, example_y, w)
        w_learn = train(x_data, y_data, early_terminate_threshold=0.8)
        h_k = predict(np.tanh(np.dot(example_x, w_learn))).reshape(-1)

        correct = h_k == targets
        error = w[~correct].sum()
        if error > 0.5:
            continue
        error = max(error, min_error)

        # Shrink the weight of examples this learner already gets right.
        w[correct] *= error / (1 - error)
        w /= np.sum(w)

        h.append(w_learn)
        z.append(np.log((1 - error) / error))

    return h, z


In [57]:
def logistic_regression_test(training_x, training_y, test_x, test_y, threshold, learning_rate=0.01, no_of_iterations=10000):
    """Train one learner with ``train`` and print its test and training accuracy.

    Previously the predictions were computed and then thrown away because
    every reporting line was commented out. The accuracies are now printed in
    the same style as ``adaboost_test``.

    Args:
        training_x, training_y: data used to fit the weights.
        test_x, test_y: held-out data used for the test accuracy.
        threshold: passed to ``train`` as ``early_terminate_threshold``.
        learning_rate: passed through to ``train``.
        no_of_iterations: passed through to ``train``.
    """
    w_logi = train(
        training_x,
        training_y,
        early_terminate_threshold=threshold,
        learning_rate=learning_rate,
        no_of_iterations=no_of_iterations,
    )

    h_logi = predict(np.tanh(np.dot(test_x, w_logi)))
    h_train = predict(np.tanh(np.dot(training_x, w_logi)))

    print('Logistic regression accuracy for test set is {}.'.format(accuracy(h_logi, test_y)))
    print('Logistic regression accuracy for training set is {}.'.format(accuracy(h_train, training_y)))

In [58]:
def adaboost_test(training_x, training_y, test_x, test_y, k):
    """Boost for ``k`` rounds and print test and training accuracy.

    The ensemble score of a sample is the sum over accepted learners of
    ``vote_weight * tanh(x @ learner_weights)``; ``predict`` turns the score
    into a -1/+1 label.
    """
    learners, vote_weights = adaboost(training_x, training_y, k)

    def ensemble_labels(features, label_shape):
        # Accumulate the weighted tanh outputs in learner order, then threshold.
        score = np.zeros(label_shape)
        for learner, vote in zip(learners, vote_weights):
            score += vote * np.tanh(np.dot(features, learner))
        return predict(score)

    test_labels = ensemble_labels(test_x, test_y.shape)
    train_labels = ensemble_labels(training_x, training_y.shape)

    print('Adaboost accuracy for test set k = {} is {}.'.format(k, accuracy(test_labels, test_y)))
    print('Adaboost accuracy for training set k = {} is {}.'.format(k, accuracy(train_labels, training_y)))

In [59]:
# telco data
# Read, clean, one-hot encode and min-max scale the churn CSV.
# preprocess_telco_data() also writes the processed frame to 'telco.csv'.
data = preprocess_telco_data()

In [60]:
# Display the processed telco frame returned by preprocess_telco_data().
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,1.0,0.0,1.0,0.0,0.013889,0.0,1.0,0.115423,0.001275,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.472222,1.0,0.0,0.385075,0.215867,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.354229,0.010310,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.625000,0.0,0.0,0.239303,0.210241,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.521891,0.015330,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.0,1.0,1.0,0.333333,1.0,1.0,0.662189,0.227521,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
7039,1.0,0.0,1.0,1.0,1.000000,1.0,1.0,0.845274,0.847461,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
7040,1.0,0.0,1.0,1.0,0.152778,0.0,1.0,0.112935,0.037809,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7041,0.0,1.0,1.0,0.0,0.055556,1.0,1.0,0.558706,0.033210,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [61]:
# Churn data full
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=10)

In [62]:
# Hold out 20% of the rows; features are ranked on the training part only.
final_column = data.columns[-1]
train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=10)

# Keep the `feature_cutoff` highest-ranked features, then the label column last.
columns = info_gain(train_dataset)
feature_cutoff = 10
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

reduced_training = train_dataset[columns_to_use]
reduced_test = test_dataset[columns_to_use]

# Prepend a constant 'Ones' (bias) column and switch to plain arrays.
reduced_training.insert(0, 'Ones', 1.0)
reduced_test.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()
reduced_test = reduced_test.to_numpy()

# Everything but the last column is a feature. Positive label values become
# +1.0 and all others -1.0, stored as a column vector.
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape((-1, 1))

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape((-1, 1))

['Contract_Two year', 'tenure', 'MonthlyCharges', 'InternetService_Fiber optic', 'DeviceProtection_No internet service', 'PaymentMethod_Electronic check', 'StreamingTV_No internet service', 'TotalCharges', 'StreamingMovies_No internet service', 'OnlineBackup_No internet service']


In [63]:
# Training feature matrix; the first column is the constant 'Ones' bias term.
x_train

array([[1.        , 0.        , 0.79166667, ..., 0.3666282 , 0.        ,
        0.        ],
       [1.        , 0.        , 0.20833333, ..., 0.07989268, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01587237, 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.38888889, ..., 0.06280291, 1.        ,
        1.        ],
       [1.        , 0.        , 0.95833333, ..., 0.73427187, 0.        ,
        0.        ],
       [1.        , 0.        , 0.875     , ..., 0.47225363, 0.        ,
        0.        ]])

In [64]:
# Training labels as a column vector of -1.0 / +1.0 values.
y_train

array([[-1.],
       [-1.],
       [ 1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [65]:
# Test feature matrix; same column layout as x_train.
x_test

array([[1.        , 0.        , 0.43055556, ..., 0.28446804, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01623009, 0.        ,
        0.        ],
       [1.        , 0.        , 0.83333333, ..., 0.67484422, 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.61111111, ..., 0.47358643, 0.        ,
        0.        ],
       [1.        , 1.        , 0.98611111, ..., 0.92999077, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01851489, 0.        ,
        0.        ]])

In [66]:
# Test labels as a column vector of -1.0 / +1.0 values.
y_test

array([[-1.],
       [ 1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [ 1.]])

In [67]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

In [68]:
# Evaluate AdaBoost on the telco split with k = 5, 10, 15 and 20 boosting rounds.
for i in range(1, 5):
    adaboost_test(x_train, y_train, x_test, y_test, i*5)

Adaboost accuracy for test set k = 5 is 0.7565649396735273.
Adaboost accuracy for training set k = 5 is 0.729144479943202.
Adaboost accuracy for test set k = 10 is 0.7565649396735273.
Adaboost accuracy for training set k = 10 is 0.729144479943202.
Adaboost accuracy for test set k = 15 is 0.7565649396735273.
Adaboost accuracy for training set k = 15 is 0.729144479943202.
Adaboost accuracy for test set k = 20 is 0.7565649396735273.
Adaboost accuracy for training set k = 20 is 0.729144479943202.


In [69]:
# adult data
# Read the adult train and test files, process them jointly and split them back.
# preprocess_adult_data() also writes 'adult-data.csv' and 'adult-test.csv'.
training_set, test_set = preprocess_adult_data()

In [70]:
# Display the processed adult training frame.
training_set

Unnamed: 0,C0,C2,C4,C9,C10,C11,C12,C1_Local-gov,C1_Never-worked,C1_Private,...,C13_Puerto-Rico,C13_Scotland,C13_South,C13_Taiwan,C13_Thailand,C13_Trinadad&Tobago,C13_United-States,C13_Vietnam,C13_Yugoslavia,C14
0,0.301370,0.044131,0.800000,1.0,0.021740,0.0,0.397959,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.452055,0.048052,0.800000,1.0,0.000000,0.0,0.122449,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.287671,0.137581,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.493151,0.150486,0.400000,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.150685,0.220635,0.800000,0.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.165763,0.733333,0.0,0.000000,0.0,0.377551,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32557,0.315068,0.096129,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
32558,0.561644,0.094462,0.533333,0.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32559,0.068493,0.128004,0.533333,1.0,0.000000,0.0,0.193878,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [71]:
# Display the processed adult test frame.
test_set

Unnamed: 0,C0,C2,C4,C9,C10,C11,C12,C1_Local-gov,C1_Never-worked,C1_Private,...,C13_Puerto-Rico,C13_Scotland,C13_South,C13_Taiwan,C13_Thailand,C13_Trinadad&Tobago,C13_United-States,C13_Vietnam,C13_Yugoslavia,C14
0,0.109589,0.145129,0.400000,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.287671,0.052451,0.533333,1.0,0.000000,0.0,0.500000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.150685,0.219649,0.733333,1.0,0.000000,0.0,0.397959,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.369863,0.100153,0.600000,1.0,0.076881,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.013699,0.061708,0.600000,0.0,0.000000,0.0,0.295918,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.301370,0.137428,0.800000,0.0,0.000000,0.0,0.357143,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16277,0.643836,0.209130,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16278,0.287671,0.245379,0.800000,1.0,0.000000,0.0,0.500000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16279,0.369863,0.048444,0.800000,1.0,0.054551,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [72]:
# training_set.insert(0, 'Ones', 1.0)
# training_set = training_set.to_numpy()
# #
# test_set.insert(0, 'Ones', 1.0)
# test_set = test_set.to_numpy()
#
# x_train = training_set[:, :-1]
# y_train = training_set[:, -1]
# #
# x_test = test_set[:, :-1]
# y_test = test_set[:, -1]
# #
# y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# y_train = y_train.reshape((y_train.shape[0], 1))
# #
# y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# y_test = y_test.reshape((y_test.shape[0], 1))

In [73]:
# The adult data arrives already split, so features are ranked on training_set directly.
final_column = training_set.columns[-1]

# Keep the `feature_cutoff` highest-ranked features, then the label column last.
# (columns[-feature_cutoff:] would select the lowest-ranked ones instead.)
columns = info_gain(training_set)
feature_cutoff = 35
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

reduced_training = training_set[columns_to_use]
reduced_test = test_set[columns_to_use]

# Prepend a constant 'Ones' (bias) column and switch to plain arrays.
reduced_training.insert(0, 'Ones', 1.0)
reduced_test.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()
reduced_test = reduced_test.to_numpy()

# Everything but the last column is a feature. Positive label values become
# +1.0 and all others -1.0, stored as a column vector.
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape((-1, 1))

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape((-1, 1))

['C5_Married-civ-spouse', 'C10', 'C0', 'C4', 'C5_Never-married', 'C7_Own-child', 'C12', 'C2', 'C11', 'C9', 'C7_Not-in-family', 'C6_Exec-managerial', 'C3_Bachelors', 'C3_Masters', 'C6_Other-service', 'C3_HS-grad', 'C1_Private', 'C6_Handlers-cleaners', 'C3_Prof-school', 'C7_Unmarried', 'C1_Self-emp-inc', 'C8_White', 'C6_Craft-repair', 'C6_Prof-specialty', 'C3_Doctorate', 'C6_Armed-Forces', 'C7_Wife', 'C13_Italy', 'C13_Mexico', 'C3_11th', 'C13_China', 'C8_Black', 'C13_Cuba', 'C13_Philippines', 'C1_Local-gov']


In [74]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

In [None]:
# Evaluate AdaBoost on the adult split with k = 5, 10, 15 and 20 boosting rounds.
for i in range(1, 5):
    adaboost_test(x_train, y_train, x_test, y_test, i*5)

In [None]:
# credit card data
# Read the transactions, down-sample the negative class and min-max scale.
# preprocess_cc_data() also writes the processed frame to 'cc.csv'.
data = preprocess_cc_data()

In [None]:
# Display the processed credit-card frame returned by preprocess_cc_data().
data

In [None]:
# credit card data full
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0.0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=40)

In [None]:
# Hold out 20% of the rows; features are ranked on the training part only.
final_column = data.columns[-1]
train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=10)

# Keep the `feature_cutoff` highest-ranked features, then the label column last.
columns = info_gain(train_dataset)
feature_cutoff = 10
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

reduced_training = train_dataset[columns_to_use]
reduced_test = test_dataset[columns_to_use]

# Prepend a constant 'Ones' (bias) column and switch to plain arrays.
reduced_training.insert(0, 'Ones', 1.0)
reduced_test.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()
reduced_test = reduced_test.to_numpy()

# Everything but the last column is a feature. Positive label values become
# +1.0 and all others -1.0, stored as a column vector.
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape((-1, 1))

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape((-1, 1))

In [None]:
# Training feature matrix; the first column is the constant 'Ones' bias term.
x_train

In [None]:
# Test feature matrix; same column layout as x_train.
x_test

In [None]:
# Training labels as a column vector of -1.0 / +1.0 values.
y_train

In [None]:
# Test labels as a column vector of -1.0 / +1.0 values.
y_test

In [None]:
# logistic_regression_test(x_train, y_train, x_test, y_test, 0.5, no_of_iterations=50000)

In [None]:
# Evaluate AdaBoost on the credit-card split with k = 5, 10, 15 and 20 boosting rounds.
for i in range(1, 5):
    adaboost_test(x_train, y_train, x_test, y_test, i*5)