In [5]:
%load_ext autoreload
%autoreload 2

import numpy as np
import random
import pandas as pd
import sys 
import warnings
import costs
import gradient_descent
import stochastic_gradient_descent
import implementations
import metrics

sys.path.append("/home/souly/Desktop/ml/ml-project1")
from src import json_parser, helpers, preprocessing
from src.helpers import split_data_rand
from sklearn.metrics import f1_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the raw training inputs and targets; paths are relative to the notebook's working directory.
x_df = pd.read_csv("../data/raw/x_train.csv")
y_df = pd.read_csv("../data/raw/y_train.csv")

# Parsed content of features.json; handed to preprocessing.clean_data in the preprocessing cell.
features = json_parser.parse_json_file("../data/raw/features.json")

def balance(y, x):
    """Down-sample the larger class so both classes have the same size.

    A sample is "positive" when its label equals 1 and "negative" otherwise.
    The returned arrays list the kept positives first, then the kept negatives.

    Args:
        y: 1-D NumPy array of labels.
        x: NumPy array indexed along its first axis in step with ``y``.

    Returns:
        Tuple ``(y_balanced, x_balanced)`` with an equal number of positive and
        negative samples. Both are empty when one of the classes is empty.
    """
    positive_indices = np.where(y == 1)[0]
    negative_indices = np.where(y != 1)[0]

    min_samples = min(len(positive_indices), len(negative_indices))

    # Negatives are always drawn first with the same call as before, so a
    # seeded run in which positives are the minority selects the same rows.
    kept_negative = random.sample(list(negative_indices), min_samples)

    # Positives are only sub-sampled when they are the majority class;
    # previously they were kept in full, leaving the result unbalanced.
    if len(positive_indices) > min_samples:
        kept_positive = random.sample(list(positive_indices), min_samples)
    else:
        kept_positive = positive_indices

    # Force an integer dtype: concatenating with an empty Python list would
    # otherwise yield a float array, which NumPy rejects as an index.
    balanced_indices = np.concatenate([
        np.asarray(kept_positive, dtype=np.intp),
        np.asarray(kept_negative, dtype=np.intp),
    ])

    return y[balanced_indices], x[balanced_indices]

def k_folds(y, x, k):
    """Split ``y`` and ``x`` into ``k`` contiguous folds, in their current order.

    No shuffling is performed. Each fold holds ``len(y) // k`` samples, except
    the last one, which also takes the ``len(y) % k`` leftover samples so that
    no data is dropped.

    Args:
        y: Sliceable sequence of labels.
        x: Sliceable sequence of samples, aligned with ``y``.
        k: Number of folds.

    Returns:
        Tuple ``(y_folds, x_folds)`` of two lists with ``k`` slices each.
    """
    fold_size = len(y) // k
    x_folds = []
    y_folds = []
    for i in range(k):
        start = i * fold_size
        # The last fold runs to the end of the data. The previous condition
        # (`i < k + 1`) was always true, so the remainder was silently dropped.
        end = (i + 1) * fold_size if i < k - 1 else None
        y_folds.append(y[start:end])
        x_folds.append(x[start:end])

    return y_folds, x_folds

In [7]:
from abc import ABC, abstractmethod

class Model(ABC):
    """Abstract base class holding the hyper-parameters shared by the estimators.

    Subclasses must implement ``fit``, ``predict`` and ``score``.
    """

    def __init__(self, penalty, learning_rate, max_iter, initializer="none"):
        """Store the hyper-parameters as instance attributes, unchanged."""
        self.penalty, self.learning_rate = penalty, learning_rate
        self.max_iter, self.initializer = max_iter, initializer

    @abstractmethod
    def fit(self, x, y):
        """Train the model on ``x`` / ``y`` (implemented by subclasses)."""

    @abstractmethod
    def predict(self, x):
        """Return predictions for ``x`` (implemented by subclasses)."""

    @abstractmethod
    def score(self, x, y):
        """Return a quality measure of the predictions on ``x`` against ``y``."""

    def __repr__(self):
        """Render as ``ClassName(attr=value, ...)``, leaving out the ``w`` attribute."""
        shown = "".join(
            f"{name}={value!s}, "
            for name, value in self.__dict__.items()
            if name != "w"
        )
        text = f"{self.__class__.__name__}({shown}"
        # Drop the trailing ", " left by the last attribute before closing.
        return text[:-2] + ")"

class LogisticRegression(Model):
    """Binary classifier trained with ``implementations.reg_logistic_regression``."""

    def fit(self, x, y):
        """Initialise the weights and train them on ``x`` / ``y``.

        The initial weight vector has ``x.shape[1]`` entries and depends on
        ``self.initializer``:

        * ``"he"``: zero-mean normal draws with standard deviation
          ``sqrt(2 / n_features)``.
        * ``"glorot"`` (the historical spelling ``"godot"`` is still accepted):
          zero-mean normal draws with standard deviation
          ``sqrt(2 / (n_features + 2))``.
        * anything else (including the default ``"none"``): all ones.

        Returns:
            The loss value returned by ``reg_logistic_regression``.
        """
        n_features = x.shape[1]
        if self.initializer == "he":
            # np.random.normal expects a standard deviation, so the variance
            # term 2 / n must be square-rooted (it was previously passed raw).
            self.w = np.random.normal(0.0, np.sqrt(2 / n_features), n_features)
        elif self.initializer in ("godot", "glorot"):
            # NOTE(review): the "+ 2" (fan-out term) is kept from the original;
            # a single-output model would normally use fan_out = 1 -- confirm.
            self.w = np.random.normal(0.0, np.sqrt(2 / (n_features + 2)), n_features)
        else:
            self.w = np.ones(n_features)
        self.w, loss = implementations.reg_logistic_regression(y, x, self.penalty, self.w, self.max_iter, self.learning_rate)
        return loss

    def predict(self, x):
        """Return 0/1 predictions: ``costs.sigmoid(x.dot(w))`` thresholded at 0.5."""
        preds = costs.sigmoid(x.dot(self.w))
        preds = (preds >= 0.5).astype(int)
        return preds

    def score(self, x, y):
        """Return ``metrics.compute_accuracy`` of the predictions on ``x`` against ``y``."""
        return metrics.compute_accuracy(y, self.predict(x))

class LinearRegression(Model):
    """Linear model trained with ``implementations.mean_squared_error_sgd`` and
    used as a binary classifier by thresholding its output at 0.5."""

    def fit(self, x, y):
        """Train the weights on ``x`` / ``y``, starting from a vector of ones.

        Returns:
            The loss value returned by the training routine.
        """
        self.w = np.ones(x.shape[1]) # TODO: try different initializer
        # NOTE(review): assumes mean_squared_error_sgd returns (weights, loss)
        # like reg_logistic_regression does -- confirm. The trained weights were
        # previously discarded, so predictions always used the initial ones.
        # NOTE(review): the argument list (including self.penalty) is kept as
        # in the original; verify it against the helper's actual signature.
        self.w, loss = implementations.mean_squared_error_sgd(y, x, self.penalty, self.w, self.max_iter, self.learning_rate)
        return loss

    def predict(self, x):
        """Return 0/1 predictions: ``x @ w`` thresholded at 0.5."""
        # Samples are rows of x, so the product is x @ w (same orientation as
        # LogisticRegression.predict); `w.T @ x` only works for square x.
        preds = x @ self.w
        # ndarray has `astype`; the former `as_type` raised AttributeError.
        preds = (preds >= 0.5).astype(int)
        return preds

    def score(self, x, y):
        """Return ``metrics.compute_accuracy`` of the predictions on ``x`` against ``y``.

        Required by the abstract base class; without it the class could not be
        instantiated.
        """
        return metrics.compute_accuracy(y, self.predict(x))

In [8]:
# preprocessing
seed = 42
degree = 1
split_ratio = 0.8
k = 10
max_iter = 10000
learning_rate = 0.5

# Apply the seed to both generators used in this notebook: `random`
# (down-sampling in balance) and `np.random` (weight initialisation).
# It was previously declared but never used, so runs were not reproducible.
random.seed(seed)
np.random.seed(seed)

x = x_df.values
# Work on a private copy of the labels. Writing into `.values` either mutates
# y_df behind the scenes (making this cell non-idempotent) or fails on a
# read-only array when pandas Copy-on-Write is active.
y = y_df["_MICHD"].to_numpy(copy=True)
y[y == -1] = 0  # recode the -1 label as 0

x = preprocessing.clean_data(features, x, median_estimator=False, do_poly=False, do_one_hot=True)

y_folds, x_folds = k_folds(y, x, k)

# grid search
# For every (initializer, penalty) pair, train one model per fold and report
# the mean / standard deviation of accuracy and F1 over the folds.
for initializer in ["none", "he", "godot"]:
    for penalty in [0.001, 0.0001, 0.00001]:
        print("===========================================================")
        train_losses = []
        accuracies = []
        f1_scores = []
        model = LogisticRegression(penalty, learning_rate, max_iter, initializer)
        print(str(k) + "-folds: " + str(model))
        # k-folds
        for y_f, x_f in zip(y_folds, x_folds):
            # Each fold is split on its own: the first `split_ratio` share is
            # used for training, the rest for evaluation.
            split_index = int(len(x_f) * split_ratio)
            x_train, y_train = x_f[:split_index], y_f[:split_index]
            x_test, y_test = x_f[split_index:], y_f[split_index:]
            # Only the training part is class-balanced; the test part keeps
            # its original class distribution.
            y_train, x_train = balance(y_train, x_train)
            train_loss = model.fit(x_train, y_train)
            acc = model.score(x_test, y_test)
            pred = model.predict(x_test)
            score = f1_score(y_test, pred)
            train_losses.append(train_loss)
            accuracies.append(acc)
            f1_scores.append(score)
        accuracies = np.array(accuracies)
        f1_scores = np.array(f1_scores)
        print("| ACCURACY: mean=" + str(round(np.mean(accuracies), 3)) + ", std=" + str(round(np.std(accuracies), 3)))
        print("| F1 SCORE: mean=" + str(round(np.mean(f1_scores), 3)) + ", std=" + str(round(np.std(f1_scores), 3)))

10-folds: LogisticRegression(penalty=0.001, learning_rate=0.5, max_iter=10000, initializer=none)
| ACCURACY: mean=0.738, std=0.041
| F1 SCORE: mean=0.357, std=0.016
10-folds: LogisticRegression(penalty=0.0001, learning_rate=0.5, max_iter=10000, initializer=none)
| ACCURACY: mean=0.743, std=0.04
| F1 SCORE: mean=0.359, std=0.017
10-folds: LogisticRegression(penalty=1e-05, learning_rate=0.5, max_iter=10000, initializer=none)
| ACCURACY: mean=0.766, std=0.027
| F1 SCORE: mean=0.369, std=0.023
10-folds: LogisticRegression(penalty=0.0, learning_rate=0.5, max_iter=10000, initializer=none)
| ACCURACY: mean=0.765, std=0.028
| F1 SCORE: mean=0.37, std=0.024
10-folds: LogisticRegression(penalty=0.001, learning_rate=0.1, max_iter=10000, initializer=none)
| ACCURACY: mean=0.755, std=0.006
| F1 SCORE: mean=0.365, std=0.013
10-folds: LogisticRegression(penalty=0.0001, learning_rate=0.1, max_iter=10000, initializer=none)
| ACCURACY: mean=0.756, std=0.005
| F1 SCORE: mean=0.366, std=0.015
10-folds: Lo