In [254]:
import numpy as np
import random
import pandas as pd
import sys 

%load_ext autoreload
%autoreload 2

# Make the project's local `src` package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path (a user's home directory);
# the notebook will not find `src` on another machine unless this is adjusted.
sys.path.append("/home/souly/Desktop/ml/ml-project1")
from src import json_parser, helpers, preprocessing
from src.helpers import split_data_rand
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from src.preprocessing import undefined_to_median, undefined_to_avg, prune_undefined

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [255]:
# Load the raw training features and labels from CSV files, addressed relative
# to the notebook's working directory.
x_df = pd.read_csv("../data/raw/x_train.csv")
y_df = pd.read_csv("../data/raw/y_train.csv")

# Parse the feature description file with the project's json_parser helper; the
# result is handed to preprocessing.clean_data in the preprocessing cell.
# NOTE(review): presumably describes how each column is cleaned/encoded — confirm in src/json_parser.
features = json_parser.parse_json_file("features.json")

In [256]:
# preprocessing
# Build a cleaned feature matrix, hold out the tail of the data as a test set,
# and rebalance the training set by downsampling the negative class.
seed = 42
degree = 1  # not used in this cell
split_ratio = 0.8

x = x_df.values
y = y_df["_MICHD"].values

x = preprocessing.clean_data(features, x, median_estimator=True, do_poly=False, do_one_hot=True)

# Ordered (unshuffled) hold-out split: the first `split_ratio` share of the rows
# is used for training, the remaining rows for testing.
split_index = int(len(x) * split_ratio)
x_train = x[:split_index]
y_train = y[:split_index]
x_test = x[split_index:]
# Fix: the test labels must come from `y`; the original sliced the feature
# matrix `x` here, so evaluation compared predictions against features.
y_test = y[split_index:]

# Rebalance the training set only; the test set keeps its original class mix.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train != 1)[0]

# All positives are kept; min(n_pos, n_neg) negatives are drawn without replacement.
min_samples = min(len(positive_indices), len(negative_indices))

# Seed the stdlib RNG so both the sampling and the shuffle below are reproducible.
random.seed(seed)
downsampled_negative_indices = random.sample(list(negative_indices), min_samples)

# Shuffle so positives and negatives are interleaved rather than stacked in two runs.
balanced_indices = np.concatenate([positive_indices, downsampled_negative_indices])
random.shuffle(balanced_indices)

x_train = x_train[balanced_indices]
y_train = y_train[balanced_indices]

In [257]:
def run(model, x_train, y_train, x_test, y_test):
    """Fit `model` on the training data and print two test-set metrics.

    Args:
        model: Estimator exposing `fit(x, y)`, `score(x, y)` and `predict(x)`.
        x_train: Training features passed to `model.fit`.
        y_train: Training targets passed to `model.fit`.
        x_test: Test features passed to `model.score` and `model.predict`.
        y_test: Test targets used by `model.score` and `f1_score`.

    Returns:
        None. The value of `model.score` is printed under the label
        "Accuracy" and the F1 score of the predictions under "F1".
    """
    model.fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    print("Accuracy: " + str(accuracy))

    predictions = model.predict(x_test)
    f1 = f1_score(y_test, predictions)
    print("F1: " + str(f1))

In [258]:
# Baseline: scikit-learn logistic regression with an L2 penalty, all other
# settings left at their defaults.
# NOTE(review): the output recorded for this cell reports that the solver stopped
# at its iteration limit and suggests raising max_iter or scaling the data.
run(LogisticRegression(penalty="l2"), x_train, y_train, x_test, y_test)

Accuracy: 0.7779982743744608
F1: 0.7827041634997044


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [259]:
# Same baseline without regularisation, for comparison with the L2 run.
# NOTE(review): the output recorded for this cell also reports the solver hitting
# its iteration limit (raise max_iter or scale the data).
# NOTE(review): recent scikit-learn releases expect penalty=None rather than the
# string "none" — confirm against the installed version.
run(LogisticRegression(penalty="none"), x_train, y_train, x_test, y_test)



Accuracy: 0.7776531492666091
F1: 0.7820718816067653


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [260]:
from src import costs, gradient_descent
from abc import ABC, abstractmethod

class Cost(ABC):
    """Interface of a cost function together with its prediction rule.

    Implementations are stateless: the weights `w` are passed to every call.
    `x` holds one sample per row (or is a single 1-D sample) and `y` holds the
    matching targets.
    """

    @abstractmethod
    def __call__(self, w, x, y):
        """Return the cost of weights `w` on samples `x` with targets `y`."""

    # The original declared a second `__call__(self, w, x)` here, which silently
    # replaced the three-argument cost above; the prediction rule gets its own name.
    @abstractmethod
    def predict(self, w, x):
        """Return the model output for samples `x` under weights `w`."""

    @abstractmethod
    def grad(self, w, x, y):
        """Return the gradient of the cost with respect to `w`."""


class MeanSquaredError(Cost):
    """Mean squared error of a linear model; delegates to the project's `costs` helpers."""

    def __call__(self, w, x, y):
        return costs.compute_mse(y, x, w)

    def predict(self, w, x):
        # `x @ w` works for a row-per-sample matrix as well as a single 1-D
        # sample; `np.round` rounds element-wise, unlike the builtin `round`.
        return np.round(np.asarray(x) @ np.asarray(w))

    def grad(self, w, x, y):
        return gradient_descent.mse_gradient(y, x, w)


class MeanAbsoluteError(Cost):
    """Mean absolute error of a linear model; delegates to the project's `costs` helpers."""

    def __call__(self, w, x, y):
        return costs.compute_mae(y, x, w)

    def predict(self, w, x):
        # Same prediction rule as MeanSquaredError: rounded linear output.
        return np.round(np.asarray(x) @ np.asarray(w))

    def grad(self, w, x, y):
        return gradient_descent.mae_gradient(y, x, w)


class LogisticLoss(Cost):
    """Logistic loss of a linear model; delegates to the project's `costs` helpers."""

    def __call__(self, w, x, y):
        return costs.compute_log_loss(y, x, w)

    # Previously a second `__call__`, which overwrote the loss above so that
    # calling the object with (w, x, y) raised a TypeError.
    def predict(self, w, x):
        # Returns the sigmoid of the linear output (not a thresholded label).
        return costs.sigmoid(np.asarray(x) @ np.asarray(w))

    def grad(self, w, x, y):
        return gradient_descent.log_gradient(y, x, w)


class Fitter(ABC):
    """Base class of optimisers that fit a weight vector for a given cost.

    Attributes:
        cost: Cost object; must provide `__call__(w, x, y)`, `grad(w, x, y)`
            and `predict(w, x)`.
        penalty: Regularisation setting, stored as given.
        weight: Current weights, or None until they are initialised.
    """

    def __init__(self, cost, penalty, weight=None):
        self.cost = cost
        self.penalty = penalty
        self.weight = weight

    @abstractmethod
    def fit(self, x, y):
        """Fit the weights on samples `x` with targets `y`."""

    @abstractmethod
    def score(self, x, y):
        """Return a quality score of the fitted model on `x`, `y`."""

    def predict(self, x):
        """Return the cost's prediction for `x` under the current weights."""
        # The original called `self.cost(x)`, which matches neither the cost
        # signature (w, x, y) nor a prediction rule.
        return self.cost.predict(self.weight, x)


class GradientDescent(Fitter):
    """Plain full-batch gradient descent with a fixed step size.

    Args:
        cost: Cost object (see Fitter).
        penalty: Stored on the instance; not applied by this optimiser.
        max_iter: Number of gradient steps performed by `fit`.
        gamma: Step size multiplied with the gradient at every step.
        weight: Optional starting weights; zeros are used when omitted.
    """

    def __init__(self, cost, penalty, max_iter, gamma=0.1, weight=None):
        # The base constructor also takes the weights; the original omitted them.
        super().__init__(cost, penalty, weight)
        self.max_iter = max_iter
        self.gamma = gamma

    def fit(self, x, y):
        """Run `max_iter` gradient steps and return the final cost value."""
        if self.weight is None:
            # One weight per feature column (last axis of `x`).
            self.weight = np.zeros(np.shape(x)[-1])
        for _ in range(self.max_iter):
            # assumes cost.grad returns an array shaped like the weights — TODO confirm
            self.weight = self.weight - self.gamma * self.cost.grad(self.weight, x, y)
        return self.cost(self.weight, x, y)

    def score(self, x, y):
        """Return the fraction of predictions that equal the targets."""
        return float(np.mean(np.asarray(self.predict(x)) == np.asarray(y)))