In [39]:
%load_ext autoreload
%autoreload 2

import numpy as np
import random
import pandas as pd
import sys 
import warnings

sys.path.append("/home/souly/Desktop/ml/ml-project1")
from src import json_parser, helpers, preprocessing
from src.helpers import split_data_rand
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from src.preprocessing import undefined_to_median, undefined_to_avg, prune_undefined

from sklearn.exceptions import ConvergenceWarning

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
x_df = pd.read_csv("../data/raw/x_train.csv")
y_df = pd.read_csv("../data/raw/y_train.csv")

features = json_parser.parse_json_file("../data/raw/features.json")

def balance(y, x):
    positive_indices = np.where(y == 1)[0]
    negative_indices = np.where(y != 1)[0]

    min_samples = min(len(positive_indices), len(negative_indices))
    downsampled_negative_indices = random.sample(list(negative_indices), min_samples)
    balanced_indices = np.concatenate([positive_indices, downsampled_negative_indices])

    y = y[balanced_indices]
    x = x[balanced_indices]

    return y, x

def k_folds(y, x, k):
    fold_size = len(y) // k
    x_folds = []
    y_folds = []
    for i in range(k):
        start = i * fold_size
        end = (i+1) * fold_size if i < k + 1 else None
        y_fold = y[start:end]
        x_fold = x[start:end]
        y_folds.append(y_fold)
        x_folds.append(x_fold)

    return y_folds, x_folds

In [41]:
# preprocessing
seed = 42
degree = 1
split_ratio = 0.8

x = x_df.values
y = y_df["_MICHD"].values

x = preprocessing.clean_data(features, x, median_estimator=True, do_poly=False, do_one_hot=True)

y_folds, x_folds = k_folds(y, x, 10)

accuracies = []
f1_scores = []

print("Running K-folds on LogisticRegression")
print("K=" + str(k))

it = 1

for y_f, x_f in zip(y_folds, x_folds):
    split_index = int(len(x_f)*split_ratio)
    
    x_train = x_f[:split_index]
    y_train = y_f[:split_index]
    x_test = x_f[split_index:]
    y_test = y_f[split_index:]
    
    y_train, x_train = balance(y_train, x_train)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = LogisticRegression(penalty="l2")
        model.fit(x_train, y_train)
        acc = model.score(x_test, y_test)
        accuracies.append(acc)
        pred = model.predict(x_test)
        score = f1_score(y_test, pred)
        f1_scores.append(score)

    print("\t" + str(it) + " iteration")
    print("\t\tAcc: " + str(acc))
    print("\t\tF1: " + str(score))
    it += 1

accuracies = np.array(accuracies)
f1_scores = np.array(f1_scores)
print("ACCURACY: mean=" + str(np.mean(accuracies)) + ", std=" + str(np.std(accuracies)))
print("F1 SCORE: mean=" + str(np.mean(f1_scores)) + ", std=" + str(np.std(f1_scores)))

Running K-folds on LogisticRegression
	1 iteration
		Acc: 0.7530092945299406
		F1: 0.34025234025234025
	2 iteration
		Acc: 0.7661130580527198
		F1: 0.36015006252605253
	3 iteration
		Acc: 0.7530092945299406
		F1: 0.36105636578636185
	4 iteration
		Acc: 0.7645893646198385
		F1: 0.3856858846918489
	5 iteration
		Acc: 0.7527045558433643
		F1: 0.3468812877263582
	6 iteration
		Acc: 0.7583422215450252
		F1: 0.3671189146049481
	7 iteration
		Acc: 0.7520950784702118
		F1: 0.3720571208027789
	8 iteration
		Acc: 0.7539235105896693
		F1: 0.36840046929996095
	9 iteration
		Acc: 0.7595611762913301
		F1: 0.37430610626486915
	10 iteration
		Acc: 0.7731220478439738
		F1: 0.3964329144710175
ACCURACY: mean=0.7586469602316014, std=0.006823697346896833
F1 SCORE: mean=0.3672341466426537, std=0.015805973318882293


In [42]:
from src import costs, gradient_descent
from abc import ABC, abstractmethod

class Cost(ABC):
    @abstractmethod
    def __call__(self, w, x, y):
        pass

    @abstractmethod
    def __call__(self, w, x):
        pass

    @abstractmethod
    def grad(self, w, x, y):
        pass

class MeanSquaredError(Cost):
    def __call__(self, w, x, y):
        return costs.compute_mse(y, x, w)

    def predict(self, w, x):
        return round(w.T@x)

    def grad(self, w, x, y):
        return gradient_descent.mse_gradient(y, x, w)

class MeanAbsoluteError(Cost):
    def __call__(self, w, x, y):
        return costs.compute_mae(y, x, w)

    def predict(self, w, x):
        return round(w.T@x)

    def grad(self, w, x, y):
        return gradient_descent.mae_gradient(y, x, w)

class LogisticLoss(Cost):
    def __call__(self, w, x, y):
        return costs.compute_log_loss(y, x, w)

    def __call__(self, w, x):
        return costs.sigmoid(w.T@x)

    def grad(self, w, x, y):
        return gradient_descent.log_gradient(y, x, w)


class Fitter(ABC):
    def __init__(self, cost, penalty, weight):
        self.cost = cost
        self.penalty = penalty
        self.weight = weight
    
    @abstractmethod
    def fit(self, x, y):
        pass

    @abstractmethod
    def score(self, x, y):
        pass

    def predict(self, x):
        return self.cost(x)

class GradientDescent(Fitter):
    def __init__(self, cost, penalty, max_iter):
        super().__init__(cost, penalty)
        self.max_iter = max_iter

    def fit(self, x, y):
        for n_iter in range(self.max_iter):
            self.weights = self.weights - self.gamma * self.cost.grad() 
        return self.cost(self.weights, x, y)