In [69]:
# imports
import math
import random

from pandas import DataFrame
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.core.display import display
import json

In [70]:
# set random seed
# Seed both global generators so sampling and weight initialisation repeat across runs.
random.seed(2)          # Python's built-in `random` module
np.random.seed(seed=5)  # NumPy's legacy global generator (np.random.rand / np.random.choice)

In [71]:
# information gain function
def info_gain(df: DataFrame):
    """Rank the feature columns of ``df`` by mutual information with the label.

    The last column of ``df`` is used as the class label and every other
    column as a feature. ``df`` itself is not modified (a copy is split).

    Returns:
        list: feature names ordered from the highest to the lowest score
        returned by ``mutual_info_classif``.
    """
    feature_names = list(df.columns)
    target_name = feature_names.pop()

    features = df.copy()
    target = features.pop(target_name)
    scores = mutual_info_classif(features, target)

    score_by_feature = dict(zip(feature_names, scores))
    # sorted() is stable, so features with equal scores keep their column order.
    return sorted(score_by_feature, key=score_by_feature.get, reverse=True)

In [72]:
# One-Hot encoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a copy of the frame with the given columns one-hot encoded.

    Args:
        original_dataframe: frame holding the categorical columns.
        feature_to_encode: list of column names to expand into indicator columns.

    The first level of each encoded column is dropped (``drop_first=True``),
    so a column with k levels produces k - 1 indicator columns.
    """
    dummy_options = {'columns': feature_to_encode, 'drop_first': True}
    encoded = pd.get_dummies(original_dataframe, **dummy_options)
    return encoded

In [73]:
# pre-processor 1
def read_telco_data():
    """Load the raw Telco churn CSV, turning its binary text columns into 0/1 ints.

    ``gender`` becomes 1 for 'Female'; 'Partner', 'Dependents', 'PhoneService',
    'PaperlessBilling' and 'Churn' become 1 for 'Yes'. Any other text maps to 0.
    The file is read from the current working directory.
    """
    def flag_for(positive_value):
        # Build a converter that yields 1 only when the cell equals positive_value.
        return lambda raw: int(raw == positive_value)

    yes_no_columns = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
    converters = {'gender': flag_for('Female')}
    converters.update({column: flag_for('Yes') for column in yes_no_columns})

    return pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', converters=converters)

def process_telco_data(telco_data):
    """Clean, one-hot encode and min-max scale the raw Telco churn frame.

    Steps:
      1. drop the 'customerID' column;
      2. make 'tenure', 'MonthlyCharges' and 'TotalCharges' numeric, filling
         unparseable 'TotalCharges' cells (the blank ' ' entries) with the
         median of the remaining values;
      3. one-hot encode the multi-level categorical columns;
      4. move 'Churn' to the last column;
      5. scale every column (label included) to [0, 1] with ``MinMaxScaler``.

    Args:
        telco_data: frame as returned by ``read_telco_data``. It is not
            modified; all work happens on a copy.

    Returns:
        DataFrame: all-float frame with 'Churn' as the final column.

    NOTE(review): the scaler is fitted on every row passed in. The notebook
    splits train/test only afterwards, so test rows influence the scaling.
    """
    # drop() returns a new frame, so the caller's data stays intact and the
    # cell can be re-run without a KeyError on the already-removed column.
    telco_data = telco_data.drop(columns='customerID')

    # Coerce instead of astype(errors="ignore"): blanks become NaN and the
    # column really is numeric before the median is taken.
    for column in ('tenure', 'MonthlyCharges', 'TotalCharges'):
        telco_data[column] = pd.to_numeric(telco_data[column], errors='coerce')

    total_charges = telco_data['TotalCharges']
    telco_data['TotalCharges'] = total_charges.fillna(total_charges.median())

    columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                         'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                         'StreamingMovies', 'Contract', 'PaymentMethod']
    telco_data = encode_and_bind(telco_data, columns_to_encode)

    # Re-append the label so it is the last column (the encoded columns were
    # added after it by get_dummies).
    telco_data['Churn'] = telco_data.pop('Churn')

    scaled = MinMaxScaler().fit_transform(telco_data)
    return pd.DataFrame(scaled, columns=telco_data.columns, index=telco_data.index)

def preprocess_telco_data():
    """Run the Telco pipeline end to end: load, process, save, return.

    The processed frame is also written to 'telco.csv' (index included) in
    the current working directory.
    """
    processed = process_telco_data(read_telco_data())
    processed.to_csv('telco.csv')
    return processed

In [74]:
# pre processor 2
def read_adult_data(file_name, pos):
    """Read a header-less adult CSV into a frame with columns 'C0' .. 'C14'.

    Args:
        file_name: path of the CSV file.
        pos: exact text of the positive label in the last column; the
            notebook passes '>50K' for the training file and '>50K.' for
            the test file.

    Returns:
        DataFrame: 15 columns where 'C9' is 1.0 for 'Male' (else 0.0) and
        'C14' is 1.0 when the cell equals ``pos`` (else 0.0).
    """
    column_names = ['C' + str(i) for i in range(15)]
    return pd.read_csv(
        file_name,
        names=column_names,
        header=None,
        # Comma with optional surrounding whitespace. The previous pattern
        # ' *, * ' demanded a space after every comma, so "a,b" stayed one field.
        sep=r'\s*,\s*',
        engine="python",  # regex separators are only supported by the python engine
        converters={
            'C9': lambda x: float(x == 'Male'),
            'C14': lambda x: float(x == pos),
        },
    )

def process_adult_data(adult_data):
    """Impute, one-hot encode and min-max scale the adult frame.

    Steps:
      1. in every column containing '?', replace '?' with the most frequent
         observed (non-'?') value of that column;
      2. one-hot encode the categorical columns;
      3. move the label 'C14' to the last column;
      4. scale every column (label included) to [0, 1] with ``MinMaxScaler``.

    Args:
        adult_data: frame as returned by ``read_adult_data`` (possibly several
            such frames concatenated). It is not modified.

    Returns:
        DataFrame: all-float frame with 'C14' last and the input's index.
    """
    adult_data = adult_data.copy()

    for column in adult_data.columns:
        column_values = adult_data[column]
        # Work with a plain boolean array so a duplicated index (the caller
        # concatenates two frames) never triggers label alignment.
        is_missing = (column_values == '?').to_numpy()
        if not is_missing.any():
            continue
        observed_modes = column_values[~is_missing].mode()
        if observed_modes.empty:
            continue  # column holds only '?': nothing to impute from
        # mode() returns a Series (ties give several rows); use its first
        # entry as a scalar replacement.
        adult_data[column] = column_values.replace('?', observed_modes.iloc[0])

    columns_to_encode = ['C1', 'C3', 'C5', 'C6', 'C7', 'C8', 'C13']
    adult_data = encode_and_bind(adult_data, columns_to_encode)

    # Re-append the label so it ends up as the final column.
    adult_data['C14'] = adult_data.pop('C14')

    scaled = MinMaxScaler().fit_transform(adult_data)
    return pd.DataFrame(scaled, columns=adult_data.columns, index=adult_data.index)

def preprocess_adult_data():
    """Load, process and save the adult training and test sets.

    Both files are concatenated and processed as one frame, then split back
    by row position -- presumably so both parts share the same encoded
    columns and scaling. The parts are written to 'adult-data.csv' and
    'adult-test.csv'.

    Returns:
        tuple: (processed training frame, processed test frame).
    """
    train_raw = read_adult_data('adult.csv', '>50K')
    test_raw = read_adult_data('adult.test.csv', '>50K.')
    train_rows = train_raw.shape[0]

    combined = process_adult_data(pd.concat([train_raw, test_raw]))

    # The first train_rows rows came from the training file, the rest from the test file.
    adult_data = combined.iloc[:train_rows, :]
    adult_test = combined.iloc[train_rows:, :]

    adult_data.to_csv('adult-data.csv')
    adult_test.to_csv('adult-test.csv')

    return adult_data, adult_test

In [75]:
# pre processor 3
def read_cc_data():
    """Load 'creditcard.csv' from the current working directory as a DataFrame."""
    raw_frame = pd.read_csv('creditcard.csv')
    return raw_frame

def process_cc_data(cc_data: DataFrame, negative_sample_size=20000, random_state=5):
    """Sub-sample the negative class and min-max scale the credit card frame.

    All rows with ``Class == 1`` are kept. From the ``Class == 0`` rows a
    random subset is drawn without replacement. The two parts are stacked
    (positives first), re-indexed from 0 and every column is scaled to [0, 1].

    Args:
        cc_data: raw frame containing a 'Class' column.
        negative_sample_size: number of ``Class == 0`` rows to keep. If fewer
            rows are available, all of them are used instead of raising.
        random_state: seed for the sub-sampling.

    Returns:
        DataFrame: all-float frame with a fresh 0..n-1 index.
    """
    positive_rows = cc_data.loc[cc_data['Class'] == 1]
    negative_rows = cc_data.loc[cc_data['Class'] == 0]

    # sample(replace=False) raises when n exceeds the population, so cap it.
    sample_size = min(negative_sample_size, len(negative_rows))
    negative_subset = negative_rows.sample(n=sample_size, replace=False, random_state=random_state)

    combined = pd.concat([positive_rows, negative_subset]).reset_index(drop=True)

    scaled = MinMaxScaler().fit_transform(combined)
    return pd.DataFrame(scaled, columns=combined.columns, index=combined.index)

def preprocess_cc_data():
    """Run the credit card pipeline end to end: load, process, save, return.

    The processed frame is also written to 'cc.csv' (index included) in the
    current working directory.
    """
    processed = process_cc_data(read_cc_data())
    processed.to_csv('cc.csv')
    return processed

In [76]:
# loss function
def loss(y_predicted, y_actual, size):
    """Mean squared error between predictions and targets.

    ``size`` is accepted for call compatibility but is not used;
    ``mean_squared_error`` does the averaging itself.
    """
    mse = mean_squared_error(y_true=y_actual, y_pred=y_predicted)
    return mse

In [77]:
def statistics(y_predict, y_actual):
    """Print accuracy, sensitivity, specificity, precision, FDR and F1 score.

    Args:
        y_predict: predicted binary labels.
        y_actual: true binary labels.

    Raises:
        ValueError: if the confusion matrix is not 2x2, i.e. the inputs do
            not contain exactly two distinct labels between them.

    A ratio whose denominator is zero (for example precision when nothing is
    predicted positive) is printed as ``nan`` without a division warning.
    """
    matrix = confusion_matrix(y_actual, y_predict)
    if matrix.shape != (2, 2):
        raise ValueError(
            'statistics() needs exactly two classes, got a confusion matrix of shape {}'.format(matrix.shape)
        )
    # Rows are true labels, columns are predicted labels (both sorted), so the
    # flattened order is tn, fp, fn, tp.
    tn, fp, fn, tp = (int(count) for count in matrix.ravel())

    def ratio(numerator, denominator):
        # Undefined ratios become nan instead of raising or warning.
        return numerator / denominator if denominator else float('nan')

    print('Accuracy is {}'.format(accuracy(y_predict, y_actual)))
    print('Sensitivity is {}'.format(ratio(tp, tp + fn)))
    print('Specificity is {}'.format(ratio(tn, tn + fp)))
    print('Precision is {}'.format(ratio(tp, tp + fp)))
    print('False discovery rate {}'.format(ratio(fp, fp + tp)))
    print('F1 score {}'.format(ratio(2 * tp, 2 * tp + fp + fn)))

In [78]:
# accuracy function
def accuracy(y_predicted, y_actual):
    """Return the fraction of predictions that equal the true labels."""
    score = accuracy_score(y_true=y_actual, y_pred=y_predicted)
    return score

In [79]:
# prediction function for determining label of hypothesis
def predict(hypothesis):
    """Turn real-valued scores into class labels.

    Scores strictly greater than 0.0 map to 1.0, everything else (including
    exactly 0.0) maps to -1.0.

    Returns:
        ndarray: labels as a column of shape (n, 1).
    """
    scores = np.asarray(hypothesis)
    labels = np.where(scores > 0.0, 1.0, -1.0)
    return labels.reshape((labels.shape[0], 1))

In [80]:
# logistic regression
def train(x, y, early_terminate_threshold=0.0, learning_rate=0.0001, no_of_iterations=10000):
    """Fit a single-layer tanh model by batch gradient steps.

    Args:
        x: feature matrix of shape (n_samples, n_features).
        y: targets, multiplied element-wise with the (n_samples, 1) activations.
        early_terminate_threshold: stop as soon as the loss drops below this
            value; with the default 0.0 the loop never stops early.
        learning_rate: step size of each update.
        no_of_iterations: maximum number of updates.

    Returns:
        ndarray: learned weights of shape (n_features, 1).

    Weights start from ``np.random.rand``, so results depend on NumPy's
    global random state.
    """
    sample_count, feature_count = x.shape
    weights = np.random.rand(feature_count, 1)

    iteration = 0
    while iteration < no_of_iterations:
        activation = np.tanh(x @ weights)
        if loss(activation, y, sample_count) < early_terminate_threshold:
            break
        # (1 - tanh^2) is the derivative of tanh; scale the residual by it.
        gradient = x.T @ ((y - activation) * (1 - activation ** 2))
        weights += learning_rate * gradient / sample_count
        iteration += 1

    return weights

In [81]:
# resample function for adaboost
def resample(x, y, w):
    """Draw a same-size sample of rows with replacement, row i chosen with probability w[i].

    Returns:
        tuple: (sampled rows of x, matching rows of y).
    """
    row_count = x.shape[0]
    picked = np.random.choice(row_count, size=row_count, replace=True, p=w)
    return x[picked], y[picked]

In [82]:
# Adaboost
def adaboost(example_x, example_y, k):
    """Train up to ``k`` boosted tanh learners on weighted resamples.

    Each round draws a weighted bootstrap sample, trains a learner on it and
    measures its weighted error on the full data. Rounds whose error exceeds
    0.5 are skipped. Otherwise the weights of correctly classified rows are
    scaled by error / (1 - error), all weights are re-normalised, and the
    learner is kept with vote ln((1 - error) / error).

    Args:
        example_x: feature matrix of shape (n_samples, n_features).
        example_y: labels, one per row (column vector or flat).
        k: number of boosting rounds to attempt.

    Returns:
        tuple: (list of learner weight vectors, list of their vote weights).
        The lists hold fewer than ``k`` entries when rounds are skipped.
    """
    no_of_data = example_x.shape[0]
    w = np.full(no_of_data, 1 / no_of_data)
    actual = np.ravel(example_y)
    # A perfect learner has error 0, which would zero every weight (NaN after
    # normalising) and give an infinite vote; clamp to a tiny positive value.
    epsilon = 1e-10
    h = []
    z = []
    for _ in range(k):
        x_data, y_data = resample(example_x, example_y, w)
        w_learn = train(x_data, y_data, early_terminate_threshold=0.8, learning_rate=0.01, no_of_iterations=10000)
        # Flatten both sides so the comparison is row-by-row whatever the label shape.
        predicted = np.ravel(predict(np.tanh(np.dot(example_x, w_learn))))
        correct = predicted == actual
        error = w[~correct].sum()
        if error > 0.5:
            continue

        error = max(error, epsilon)
        w[correct] *= error / (1 - error)
        w /= np.sum(w)
        h.append(w_learn)
        z.append(np.log((1 - error) / error))

    return h, z


In [83]:
def logistic_regression_test(training_x, training_y, test_x, test_y, threshold, learning_rate=0.01, no_of_iterations = 10000):
    """Train one model and print its statistics on the test and training sets.

    Args:
        training_x, training_y: data used for fitting.
        test_x, test_y: held-out data used for evaluation.
        threshold: early-termination loss threshold passed to ``train``.
        learning_rate: step size passed to ``train``.
        no_of_iterations: iteration cap passed to ``train``.
    """
    weights = train(
        training_x,
        training_y,
        early_terminate_threshold=threshold,
        learning_rate=learning_rate,
        no_of_iterations=no_of_iterations,
    )

    def classify(features):
        # Score with the trained weights, then threshold into +1.0 / -1.0 labels.
        return predict(np.tanh(np.dot(features, weights)))

    test_labels = classify(test_x)
    train_labels = classify(training_x)

    print('Test set stats = ')
    statistics(y_predict=test_labels, y_actual=test_y)
    print('Train set stats = ')
    statistics(y_predict=train_labels, y_actual=training_y)

In [84]:
def adaboost_test(training_x, training_y, test_x, test_y, k):
    """Train a ``k``-round AdaBoost ensemble and print test and training accuracy.

    The ensemble score of a row is the vote-weighted sum of each learner's
    tanh output; its sign gives the predicted label.
    """
    learners, votes = adaboost(training_x, training_y, k)

    test_score = np.zeros(test_y.shape)
    train_score = np.zeros(training_y.shape)
    for learner_weights, vote in zip(learners, votes):
        test_score += vote * np.tanh(np.dot(test_x, learner_weights))
        train_score += vote * np.tanh(np.dot(training_x, learner_weights))

    test_labels = predict(test_score)
    train_labels = predict(train_score)
    print('Adaboost accuracy for test set k = {} is {}.'.format(k, accuracy(test_labels, test_y)))
    print('Adaboost accuracy for training set k = {} is {}.'.format(k, accuracy(train_labels, training_y)))

In [85]:
# Telco data: load the raw CSV, clean/encode/scale it and also save the result to 'telco.csv'.
data = preprocess_telco_data()

In [86]:
# Display the processed telco frame (one-hot encoded, min-max scaled, 'Churn' as the last column).
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,1.0,0.0,1.0,0.0,0.013889,0.0,1.0,0.115423,0.001275,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.472222,1.0,0.0,0.385075,0.215867,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.354229,0.010310,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.625000,0.0,0.0,0.239303,0.210241,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.027778,1.0,1.0,0.521891,0.015330,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.0,1.0,1.0,0.333333,1.0,1.0,0.662189,0.227521,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
7039,1.0,0.0,1.0,1.0,1.000000,1.0,1.0,0.845274,0.847461,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
7040,1.0,0.0,1.0,1.0,0.152778,0.0,1.0,0.112935,0.037809,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7041,0.0,1.0,1.0,0.0,0.055556,1.0,1.0,0.558706,0.033210,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [87]:
# Churn data full
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=10)

In [88]:
# Hold out 20% of the telco rows; features are ranked on the training part only.
final_column = data.columns[-1]
train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=10)

# Keep the `feature_cutoff` highest-ranked features plus the label column.
feature_cutoff = 10
columns = info_gain(train_dataset)
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

# Prepend a constant 'Ones' column (acts as the intercept in x @ w) and switch to arrays.
reduced_training = train_dataset[columns_to_use]
reduced_training.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()

reduced_test = test_dataset[columns_to_use]
reduced_test.insert(0, 'Ones', 1.0)
reduced_test = reduced_test.to_numpy()

# The last array column is the label: recode > 0 as +1.0, otherwise -1.0, shaped (n, 1).
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape(-1, 1)

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape(-1, 1)

['Contract_Two year', 'tenure', 'MonthlyCharges', 'InternetService_Fiber optic', 'DeviceProtection_No internet service', 'PaymentMethod_Electronic check', 'StreamingTV_No internet service', 'TotalCharges', 'StreamingMovies_No internet service', 'OnlineBackup_No internet service']


In [89]:
# Telco training features: a leading column of ones followed by the selected feature columns.
x_train

array([[1.        , 0.        , 0.79166667, ..., 0.3666282 , 0.        ,
        0.        ],
       [1.        , 0.        , 0.20833333, ..., 0.07989268, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01587237, 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.38888889, ..., 0.06280291, 1.        ,
        1.        ],
       [1.        , 0.        , 0.95833333, ..., 0.73427187, 0.        ,
        0.        ],
       [1.        , 0.        , 0.875     , ..., 0.47225363, 0.        ,
        0.        ]])

In [90]:
# Telco training labels as an (n, 1) column of +1.0 / -1.0.
y_train

array([[-1.],
       [-1.],
       [ 1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [91]:
# Telco test features, laid out like x_train (ones column first).
x_test

array([[1.        , 0.        , 0.43055556, ..., 0.28446804, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01623009, 0.        ,
        0.        ],
       [1.        , 0.        , 0.83333333, ..., 0.67484422, 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.61111111, ..., 0.47358643, 0.        ,
        0.        ],
       [1.        , 1.        , 0.98611111, ..., 0.92999077, 0.        ,
        0.        ],
       [1.        , 0.        , 0.02777778, ..., 0.01851489, 0.        ,
        0.        ]])

In [92]:
# Telco test labels as an (n, 1) column of +1.0 / -1.0.
y_test

array([[-1.],
       [ 1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [ 1.]])

In [93]:
# Train on the telco split and print test/train statistics; 0.5 is the early-termination loss threshold.
logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

Test set stats = 
Accuracy is 0.7913413768630234
Sensitivity is 0.46355685131195334
Specificity is 0.8968105065666041
Precision is 0.5910780669144982
False discovery rate 0.40892193308550184
F1 score 0.5196078431372549
Train set stats = 
Accuracy is 0.7971246006389776
Sensitivity is 0.49344692005242463
Specificity is 0.9099318403115871
Precision is 0.6705253784505788
False discovery rate 0.3294746215494212
F1 score 0.5685164212910532


In [94]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)

In [95]:
# Adult data: load both files, process them together and get back the (training, test) frames.
# Also writes 'adult-data.csv' and 'adult-test.csv'.
training_set, test_set = preprocess_adult_data()

In [96]:
# Display the processed adult training frame (label 'C14' is the last column).
training_set

Unnamed: 0,C0,C2,C4,C9,C10,C11,C12,C1_Local-gov,C1_Never-worked,C1_Private,...,C13_Puerto-Rico,C13_Scotland,C13_South,C13_Taiwan,C13_Thailand,C13_Trinadad&Tobago,C13_United-States,C13_Vietnam,C13_Yugoslavia,C14
0,0.301370,0.044131,0.800000,1.0,0.021740,0.0,0.397959,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.452055,0.048052,0.800000,1.0,0.000000,0.0,0.122449,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.287671,0.137581,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.493151,0.150486,0.400000,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.150685,0.220635,0.800000,0.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.165763,0.733333,0.0,0.000000,0.0,0.377551,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32557,0.315068,0.096129,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
32558,0.561644,0.094462,0.533333,0.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32559,0.068493,0.128004,0.533333,1.0,0.000000,0.0,0.193878,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [97]:
# Display the processed adult test frame (same columns as training_set).
test_set

Unnamed: 0,C0,C2,C4,C9,C10,C11,C12,C1_Local-gov,C1_Never-worked,C1_Private,...,C13_Puerto-Rico,C13_Scotland,C13_South,C13_Taiwan,C13_Thailand,C13_Trinadad&Tobago,C13_United-States,C13_Vietnam,C13_Yugoslavia,C14
0,0.109589,0.145129,0.400000,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.287671,0.052451,0.533333,1.0,0.000000,0.0,0.500000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.150685,0.219649,0.733333,1.0,0.000000,0.0,0.397959,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.369863,0.100153,0.600000,1.0,0.076881,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.013699,0.061708,0.600000,0.0,0.000000,0.0,0.295918,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.301370,0.137428,0.800000,0.0,0.000000,0.0,0.357143,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16277,0.643836,0.209130,0.533333,1.0,0.000000,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16278,0.287671,0.245379,0.800000,1.0,0.000000,0.0,0.500000,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16279,0.369863,0.048444,0.800000,1.0,0.054551,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [98]:
# training_set.insert(0, 'Ones', 1.0)
# training_set = training_set.to_numpy()
# #
# test_set.insert(0, 'Ones', 1.0)
# test_set = test_set.to_numpy()
#
# x_train = training_set[:, :-1]
# y_train = training_set[:, -1]
# #
# x_test = test_set[:, :-1]
# y_test = test_set[:, -1]
# #
# y_train = np.array([1.0 if it > 0 else -1.0 for it in y_train])
# y_train = y_train.reshape((y_train.shape[0], 1))
# #
# y_test = np.array([1.0 if it > 0 else -1.0 for it in y_test])
# y_test = y_test.reshape((y_test.shape[0], 1))

In [99]:
# The adult data already comes as separate training and test frames, so no split is needed.
final_column = training_set.columns[-1]

# Keep the `feature_cutoff` highest-ranked features plus the label column.
# (Alternative kept for reference: columns[-feature_cutoff:] takes the lowest-ranked ones.)
feature_cutoff = 35
columns = info_gain(training_set)
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

# Prepend a constant 'Ones' column (acts as the intercept in x @ w) and switch to arrays.
reduced_training = training_set[columns_to_use]
reduced_training.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()

reduced_test = test_set[columns_to_use]
reduced_test.insert(0, 'Ones', 1.0)
reduced_test = reduced_test.to_numpy()

# The last array column is the label: recode > 0 as +1.0, otherwise -1.0, shaped (n, 1).
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape(-1, 1)

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape(-1, 1)

['C5_Married-civ-spouse', 'C10', 'C0', 'C5_Never-married', 'C4', 'C12', 'C7_Own-child', 'C11', 'C2', 'C9', 'C6_Exec-managerial', 'C7_Not-in-family', 'C6_Other-service', 'C3_Bachelors', 'C7_Unmarried', 'C1_Private', 'C3_Masters', 'C3_HS-grad', 'C6_Prof-specialty', 'C3_Prof-school', 'C3_Some-college', 'C7_Other-relative', 'C1_Self-emp-inc', 'C3_Doctorate', 'C13_Philippines', 'C7_Wife', 'C6_Machine-op-inspct', 'C8_White', 'C8_Black', 'C5_Separated', 'C1_Without-pay', 'C13_United-States', 'C1_Never-worked', 'C3_11th', 'C13_Haiti']


In [100]:
# Train on the adult training set and print test/train statistics; 0.5 is the early-termination loss threshold.
logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

Test set stats = 
Accuracy is 0.8256863829003133
Sensitivity is 0.4877795111804472
Specificity is 0.9301970245275433
Precision is 0.6836734693877551
False discovery rate 0.3163265306122449
F1 score 0.5693474962063733
Train set stats = 
Accuracy is 0.8229784097540002
Sensitivity is 0.4865450835352634
Specificity is 0.9296925566343042
Precision is 0.6870160273725914
False discovery rate 0.3129839726274086
F1 score 0.5696580558459011


In [101]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)

In [102]:
# Credit card data: load, sub-sample the Class == 0 rows, scale, and also save to 'cc.csv'.
# NOTE: `data` is rebound here and no longer refers to the telco frame.
data = preprocess_cc_data()

In [103]:
# Display the processed credit card frame (Class == 1 rows first, then the sampled Class == 0 rows).
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.002194,0.871937,0.756360,0.839488,0.422320,0.453419,0.534553,0.603411,0.695084,0.447977,...,0.466293,0.529878,0.733354,0.480991,0.643781,0.485812,0.603256,0.452491,0.000000,1.0
1,0.002576,0.852156,0.694446,0.916291,0.345643,0.488243,0.544933,0.645526,0.671179,0.552962,...,0.469182,0.556222,0.770653,0.387023,0.661109,0.429380,0.566877,0.462274,0.027975,1.0
2,0.025675,0.872177,0.754024,0.875070,0.347509,0.447878,0.573315,0.649008,0.665752,0.554336,...,0.450065,0.479639,0.746278,0.418622,0.629004,0.360017,0.587571,0.451958,0.012688,1.0
3,0.040287,0.815522,0.749167,0.811509,0.363190,0.442206,0.526518,0.589307,0.668215,0.553936,...,0.467419,0.541748,0.733942,0.423799,0.659091,0.339962,0.526220,0.506743,0.003120,1.0
4,0.043373,0.967860,0.769299,0.762787,0.455287,0.530144,0.536527,0.665942,0.664159,0.510454,...,0.448367,0.492416,0.729473,0.182124,0.750159,0.553725,0.584062,0.468341,0.000053,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20487,0.798935,0.927634,0.730324,0.914873,0.225182,0.472249,0.584673,0.634569,0.666807,0.518004,...,0.459547,0.581803,0.733516,0.455425,0.619810,0.496542,0.574513,0.456000,0.000550,0.0
20488,0.193927,0.898397,0.729857,0.949303,0.171346,0.468805,0.542631,0.636428,0.655356,0.509896,...,0.470662,0.518548,0.739133,0.523872,0.653729,0.364061,0.586522,0.449709,0.000262,0.0
20489,0.701439,0.946815,0.735143,0.847252,0.237145,0.477708,0.557522,0.655362,0.652972,0.550783,...,0.477411,0.558280,0.733881,0.369701,0.684271,0.444997,0.599570,0.477437,0.008022,0.0
20490,0.393273,0.926169,0.746261,0.913372,0.288489,0.476818,0.560492,0.655068,0.669626,0.519729,...,0.459162,0.563336,0.738276,0.449967,0.638986,0.420267,0.595810,0.467869,0.000095,0.0


In [104]:
# credit card data full
# data.insert(0, 'Ones', 1.0)
# data = data.to_numpy()
#
# data_x = data[:, :-1]
# data_y = data[:, -1]
#
# data_y = np.array([1.0 if it > 0.0 else -1.0 for it in data_y])
# data_y = data_y.reshape((data_y.shape[0], 1))
#
# x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=40)

In [105]:
# Hold out 20% of the credit card rows; features are ranked on the training part only.
final_column = data.columns[-1]
train_dataset, test_dataset = train_test_split(data, test_size=0.2, random_state=30)

# Keep the `feature_cutoff` highest-ranked features plus the label column.
feature_cutoff = 10
columns = info_gain(train_dataset)
columns_to_use = columns[:feature_cutoff]
print(columns_to_use)
columns_to_use.append(final_column)

# Prepend a constant 'Ones' column (acts as the intercept in x @ w) and switch to arrays.
reduced_training = train_dataset[columns_to_use]
reduced_training.insert(0, 'Ones', 1.0)
reduced_training = reduced_training.to_numpy()

reduced_test = test_dataset[columns_to_use]
reduced_test.insert(0, 'Ones', 1.0)
reduced_test = reduced_test.to_numpy()

# The last array column is the label: recode > 0 as +1.0, otherwise -1.0, shaped (n, 1).
x_train, y_train = reduced_training[:, :-1], reduced_training[:, -1]
y_train = np.where(y_train > 0, 1.0, -1.0).reshape(-1, 1)

x_test, y_test = reduced_test[:, :-1], reduced_test[:, -1]
y_test = np.where(y_test > 0, 1.0, -1.0).reshape(-1, 1)

['V14', 'V17', 'V10', 'V12', 'V11', 'V16', 'V3', 'V4', 'V7', 'V9']


In [106]:
# Credit card training features: a leading column of ones followed by the selected feature columns.
x_train

array([[1.        , 0.85141115, 0.73865241, ..., 0.25853652, 0.64865905,
        0.62293793],
       [1.        , 0.74494633, 0.74734691, ..., 0.19748887, 0.62857786,
        0.58729085],
       [1.        , 0.74920134, 0.73046339, ..., 0.435941  , 0.63792163,
        0.52134029],
       ...,
       [1.        , 0.74935534, 0.76705669, ..., 0.45950369, 0.62579889,
        0.61674907],
       [1.        , 0.71289129, 0.71578183, ..., 0.20105591, 0.65498373,
        0.6494853 ],
       [1.        , 0.73139879, 0.74917884, ..., 0.26207634, 0.64745804,
        0.57883631]])

In [107]:
# Credit card test features, laid out like x_train (ones column first).
x_test

array([[1.        , 0.75559645, 0.7328373 , ..., 0.20112378, 0.6419563 ,
        0.49745919],
       [1.        , 0.76335439, 0.71089347, ..., 0.22349298, 0.62621975,
        0.56577705],
       [1.        , 0.80908371, 0.75745553, ..., 0.27701799, 0.63447747,
        0.53333682],
       ...,
       [1.        , 0.76880588, 0.72784652, ..., 0.27161562, 0.63939036,
        0.56189713],
       [1.        , 0.79320558, 0.72362235, ..., 0.2461257 , 0.65301425,
        0.57160089],
       [1.        , 0.76607747, 0.73228424, ..., 0.13552665, 0.64198369,
        0.49757968]])

In [108]:
# Credit card training labels as an (n, 1) column of +1.0 / -1.0.
y_train

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [109]:
# Credit card test labels as an (n, 1) column of +1.0 / -1.0.
y_test

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [110]:
# Train on the credit card split and print test/train statistics; 0.5 is the early-termination loss threshold.
# NOTE(review): the recorded output shows sensitivity 0.0 (no positive is predicted correctly).
# Presumably the 0.5 loss threshold stops training almost immediately on this imbalanced data -- verify.
logistic_regression_test(x_train, y_train, x_test, y_test, 0.5)

Test set stats = 
Accuracy is 0.9751158819224202
Sensitivity is 0.0
Specificity is 1.0
Precision is nan
False discovery rate nan
F1 score 0.0
Train set stats = 
Accuracy is 0.9759653510644788
Sensitivity is 0.0
Specificity is 0.9997500468662126
Precision is 0.0
False discovery rate 1.0
F1 score 0.0


  print('Precision is {}'.format(tp/(tp+fp)))
  print('False discovery rate {}'.format(fp/(fp+tp)))


In [111]:
# for i in range(1, 5):
#     adaboost_test(x_train, y_train, x_test, y_test, i*5)