In [44]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import csv
import sys 

%load_ext autoreload
%autoreload 2

sys.path.append("../src/")
from sklearn.neighbors import KNeighborsClassifier
import helpers, preprocessing, exploration, polynomial_exp
from helpers import split_data_rand
from helpers import build_model_data
from json_parser import parse_json_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from preprocessing import clean_data
from preprocessing import balance_data
from preprocessing import undefined_to_median, undefined_to_avg, prune_undefined
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
simplefilter("ignore", category=ConvergenceWarning)

import os
# Class of different styles
class style():
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    UNDERLINE = '\033[4m'
    BOLD = '\033[1m'
    RESET = '\033[0m'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
x_df = pd.read_csv("../data/raw/x_train.csv")
y_df = pd.read_csv("../data/raw/y_train.csv")

In [46]:
def _prepare_data(x, y, ratio, size):
    if size == 0: 
        print("Training with : 90% of [-1] and with 10% of [1] {original dataset}")
        balanced_x = x
        balanced_y = y 
    else:
        balanced_x, balanced_y = balance_data(x, y, seed=seed, size=size)
    
    ratio = 0.2
    y_train, x_train, y_test, x_test = helpers.split_data_rand(balanced_y, balanced_x, ratio)

    features = parse_json_file("features.json")
    x_train = clean_data(features, x_train, do_poly=False)
    x_test = clean_data(features, x_test, do_poly=False)
    return y_train, x_train, y_test, x_test

def _print_result(model, y_test, x_test, weights):
    test_preds = model.predict(x_test, weights)
    test_preds[np.where(test_preds == 0)] = -1 # In case of logistic regression
    testing_accuracy = metrics.compute_accuracy(y_test, test_preds)
    f1_score = metrics.f1_score(y_test, test_preds)
    
    print(style.BOLD + f"Loss: {loss:.4f}" + style.RESET)
    print(style.BOLD + style.GREEN  + f"Test accuracy: {testing_accuracy:.4f}" + style.RESET)
    print(style.BOLD + style.YELLOW + f"F1-Score: {f1_score:.4f}" + style.RESET )

In [49]:
# preprocessing
import fitters
import metrics
seed = 100
degree = 1

x = x_df.values
y = y_df["_MICHD"].values

print(style.BLUE + "========================== Starting training for fitters ==========================" + style.RESET)

# Linear Regression GD
for size in [0,1,2,3]:
    print(style.BOLD + f"\nLinear Regression GD for dataset D_{size + 1}" + style.RESET)
    print("========================================================================".replace("=", "-"))
    ratio = 0.2
    y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
    gm = fitters.GradientFitter(y_train, x_train, y_test, x_test, 10000, 0.001)
    w, loss = gm.fit()
    _print_result(gm, y_test, x_test, w)
    print("========================================================================".replace("=", "-"))
    
# Linear Regression SGD
for size in [0,1,2,3]:
    print(style.BOLD + f"\nLinear Regression SGD for dataset D_{size + 1}" + style.RESET)
    print("========================================================================".replace("=", "-"))
    ratio = 0.2
    y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
    sgm = fitters.StochasticGradientFitter(y_train, x_train, y_test, x_test, 10000, 0.001)
    w, loss = sgm.fit()
    _print_result(sgm, y_test, x_test, w)
    print("========================================================================".replace("=", "-"))

# Least Square (Singular Matrix problem)
# for size in [0,1,2,3]:
#     print(style.BOLD + f"\nLinear Regression GD for dataset D_{size + 1}" + style.RESET)
#     print("========================================================================".replace("=", "-"))
#     ratio = 0.2
#     y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
#     lsq = fitters.LeastSquareFitter(y_train, x_train, y_test, x_test)
#     w, loss = lsq.fit()
#     _print_result(lsq, y_test, x_test, w)
#     print("========================================================================".replace("=", "-"))

# Ridge Regression 
for size in [0,1,2,3]:
    print(style.BOLD + f"\nRidge Regression for dataset D_{size + 1}" + style.RESET)
    print("========================================================================".replace("=", "-"))
    ratio = 0.2
    y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
    rrf = fitters.RidgeRegressionFitter(y_train, x_train, y_test, x_test, 10e-7)
    w, loss = rrf.fit()
    _print_result(rrf, y_test, x_test, w)
    print("========================================================================".replace("=", "-"))

# Logistic Regression
for size in [0,1,2,3]:
    print(style.BOLD + f"\nLogistic Regression GD for dataset D_{size + 1}" + style.RESET)
    print("========================================================================".replace("=", "-"))
    ratio = 0.2
    y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
    y_train[np.where(y_train == -1)] = 0
    lg = fitters.LogisticRegressionFitter(y_train, x_train, y_test, x_test, 10000, 0.005)
    w, loss = lg.fit()
    _print_result(lg, y_test, x_test, w)
    print("========================================================================".replace("=", "-"))

# Ridge Logistic Regression
for size in [0,1,2,3]:
    print(style.BOLD + f"\nRidge Logisitc Regression GD for dataset D_{size + 1}" + style.RESET)
    print("========================================================================".replace("=", "-"))
    ratio = 0.2
    y_train, x_train, y_test, x_test = _prepare_data(x, y, ratio, size)
    y_train[np.where(y_train == -1)] = 0
    rlg = fitters.RegLogisticRegressionFitter(y_train, x_train, y_test, x_test, 10000, 0.005, 10e-4)
    w, loss = rlg.fit()
    _print_result(rlg, y_test, x_test, w)
    print("========================================================================".replace("=", "-"))
    
print(style.BLUE + "===================================================================================" + style.RESET)

[1m
Linear Regression GD for dataset D_1[0m
------------------------------------------------------------------------
Training with : 90% of [-1] and with 10% of [1] {original dataset}
[1mLoss: 0.1503[0m
[1m[32mTest accuracy: 0.9099[0m
[1m[33mF1-Score: 0.1952[0m
------------------------------------------------------------------------
[1m
Linear Regression GD for dataset D_2[0m
------------------------------------------------------------------------
Training with : 50.00% of [-1] and with 50.00% of [1]
[1mLoss: 0.3284[0m
[1m[32mTest accuracy: 0.7628[0m
[1m[33mF1-Score: 0.7669[0m
------------------------------------------------------------------------
[1m
Linear Regression GD for dataset D_3[0m
------------------------------------------------------------------------
Training with : 66.67% of [-1] and with 33.33% of [1]
[1mLoss: 0.3119[0m
[1m[32mTest accuracy: 0.7677[0m
[1m[33mF1-Score: 0.6189[0m
----------------------------------------------------------------

In [48]:
if size==3:
        x_true = pd.read_csv("../data/raw/x_test.csv")
        ids = x_true.Id
        x_true = clean_data(features, x_true.values, do_poly=False)
        test_preds = rrf.predict(x_true, w)
        helpers.create_csv_submission(ids, test_preds, f"testRRF-{size}.csv")

NameError: name 'features' is not defined