In [None]:
# default_exp data
%load_ext lab_black
# nb_black if running in jupyter
%load_ext autoreload
# automatically reload Python modules when their source files change
%autoreload 2
# NOTE(review): this notebook defines model classes but exports to module `data` — confirm the `default_exp` target

# Hypotheses Space

***input***: toy dataset from data-notebook

***output***: ML model / simulation / analytics python module

***description:***

A hypotheses space defines the possible machine learning models, simulations or analytics tools applied to your problem. 
In this notebook you define the hypotheses space. If you are doing anything more complicated than just fitting existing, well-defined models such as those in sklearn,
it is recommended that you create a base class for the whole hypotheses space where you define the core function handles,
and implement the functions in subclasses that inherit from the base class. If the methods are complicated or you are 
comparing multiple methods that are inherently different by nature, you can separate models or subclasses into different notebooks similar to this one.
Adjust the running number in the beginning of the notebook name to your needs for retaining logical order.
You should also unit test the classes created in this notebook with either toy data or a small sample of your training data.
Remember to add `# export` to the top of every cell containing functions or classes that you have defined and want to use outside this notebook.



In [None]:
# hide
from nbdev.showdoc import *

## Import relevant modules

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    train_test_split,
    StratifiedKFold,
)
from sklearn.pipeline import Pipeline, make_pipeline

## Import toy data for testing

In [None]:
# Load the preprocessed toy dataset; the first CSV column is used as the row index.
toy_df = pd.read_csv(
    filepath_or_buffer="data/preprocessed_data/dataset_toy_switzerland_cleveland.csv",
    index_col=0,
)
toy_df

Unnamed: 0,x1age,x4trestbps,x8thalach,x10oldpeak,y1num
0,50.0,120.0,158.0,1.6,0
1,57.0,110.0,126.0,1.5,0
2,59.0,170.0,140.0,3.4,1
3,41.0,125.0,176.0,1.6,1
4,62.0,120.0,134.0,-0.8,1
5,63.0,150.0,154.0,3.7,1
6,51.0,94.0,154.0,0.0,0
7,39.0,94.0,179.0,0.0,0
8,58.0,115.0,138.0,0.5,1
9,46.0,100.0,133.0,-2.6,1


## Begin with a simple script before constructing the model class

In [None]:
# Exploratory script: fit a scaled logistic regression on the toy data, evaluate it
# on a hold-out split and with cross-validation, then grid-search the parameter C.

# Features are every column except the last one; the last column is the label.
X = toy_df.iloc[:, :-1]
y = toy_df.iloc[:, -1]

# k is the number of cross-validation folds; seed fixes the train/test split.
k = 2
seed = 0

# Hold out one fifth of the rows for testing, preserving the class balance of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 5, random_state=seed, stratify=y
)

# Baseline: standardize the features, then fit logistic regression with defaults.
scaler = StandardScaler()
lgr = LogisticRegression()
pipe = Pipeline([("scaler", scaler), ("estimator", lgr)])
pipe.fit(X_train, y_train)
# Score of the baseline pipeline on the hold-out split.
print(pipe.score(X_test, y_test))

# Per-fold cross-validation scores, computed on the training split only.
cv = StratifiedKFold(n_splits=k)
print(cross_val_score(pipe, X_train, y_train, cv=cv))

## optimize
# Candidate values for C of the "estimator" pipeline step: 10 points on a
# logarithmic scale between 1e-4 and 1e4.
param_grid = {
    "estimator__C": np.logspace(-4, 4, 10),
}

# pipe_lgr is only an alias of `pipe`; the trailing comment sketches an alternative.
pipe_lgr = pipe  # make_pipeline(Imputer(),StandardScaler(),PCA(n_components=2),SVC(random_state=1))

# cv = StratifiedKFold(n_splits=5)
# Grid search reuses the same cv splitter and is fitted on the training split only.
gs = GridSearchCV(
    estimator=pipe_lgr,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    return_train_score=True,
)
gs.fit(X_train, y_train)

# NOTE: the "Test" scores printed below are cross-validation scores from the grid
# search (gs.best_score_ / mean_test_score), not scores on X_test / y_test.
print("Best Estimator: \n{}\n".format(gs.best_estimator_))
print("Best Parameters: \n{}\n".format(gs.best_params_))
print("Best Test Score: \n{}\n".format(gs.best_score_))
print(
    "Best Training Score: \n{}\n".format(
        gs.cv_results_["mean_train_score"][gs.best_index_]
    )
)
print("All Training Scores: \n{}\n".format(gs.cv_results_["mean_train_score"]))
print("All Test Scores: \n{}\n".format(gs.cv_results_["mean_test_score"]))
# # This prints out all results during Cross-Validation in details
# print("All Meta Results During CV Search: \n{}\n".format(gs.cv_results_))

0.5
[0.5  0.25]
Best Estimator: 
Pipeline(steps=[('scaler', StandardScaler()),
                ('estimator', LogisticRegression(C=166.81005372000558))])

Best Parameters: 
{'estimator__C': 166.81005372000558}

Best Test Score: 
0.5

Best Training Score: 
1.0

All Training Scores: 
[0.875 0.875 0.875 0.875 0.875 1.    1.    1.    1.    1.   ]

All Test Scores: 
[0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.5   0.5   0.5  ]



## Define base class for hypotheses space / simulations / analytics

> How to create tidy class functions?
- If the function performs a transformation on data, it should return the transformation. A transformation does not make changes to the model attributes.
- If the function performs a side-effect, it should return reference to self, so that the functions can be piped. A side effect makes changes to the model attributes.
- Transformations and side effects should not be mixed. A function should only perform one.

For example the pipe `model.optimize().predict(X, y)` first performs a side effect `.optimize()` and then a transformation `.predict(X, y)`.
The optimization function changes model parameters permanently, and the predict function only makes a transformation on the input data.
However, a side effect may require input and a transformation does not always require input. 

In [None]:
# export
class MachineLearningModel:
    """
    Overly simplified example for a base class.

    It stores the data passed to the constructor and otherwise only declares
    the method names of the hypotheses space. Every method below is an empty
    placeholder that returns ``None``.
    """

    def __init__(self, X, y):
        """Keep references to the given ``X`` and ``y``."""
        self.X, self.y = X, y

    def _set_data(self, X, y):
        """Placeholder: does nothing."""

    def get_data(self):
        """Placeholder: does nothing."""

    def _create_train_test_data(self):
        """Placeholder: does nothing."""

    def get_train_test_data(self):
        """Placeholder: does nothing."""

    def fit(self, X=None, y=None, **fit_params):
        """Placeholder: does nothing."""

    def predict(self, X, y):
        """Placeholder: does nothing."""

    def score(self):
        """Placeholder: does nothing."""

    def optimize(self):
        """Placeholder: does nothing."""

    def get_optimized_params(self):
        """Placeholder: does nothing."""


## Unit test base class

## Define subclasses & functions

You should also define the loss function used for model fitting.

Don't forget to unit test!

In [None]:
# export
class LogisticRegressionClassifier(MachineLearningModel):
    """
    Logistic regression classifier.

    Wraps a ``StandardScaler`` + ``LogisticRegression`` pipeline together with
    a stratified train/test split and a ``GridSearchCV`` over the ``C``
    parameter of the estimator. The pipeline is fitted on the training split
    when the object is constructed.

    Methods that change the state of the object (``fit``, ``optimize`` and the
    underscore-prefixed helpers) return ``self`` so that calls can be chained.
    """

    def __init__(self, X, y):
        """
        Store the data, build the pipeline and grid search, split and fit.

        Parameters
        ----------
        X : features, one row per sample
        y : class labels aligned with the rows of ``X``
        """
        super().__init__(X, y)

        # k is used both as the number of cross-validation folds and to derive
        # the test fraction (1 / k) of the train/test split.
        self.k = 2
        self.seed = 0

        self.optimized_params = None

        self.train_score = None
        self.test_score = None

        self.scaler = StandardScaler()
        self.model = LogisticRegression()
        self.pipe = Pipeline([("scaler", self.scaler), ("estimator", self.model)])

        self.cv = StratifiedKFold(n_splits=self.k)

        # param grid for optimization
        self.param_grid = {
            "estimator__C": np.logspace(-4, 4, 10),
        }

        # define optimization method for optimizing the model
        self.optimization_pipe = GridSearchCV(
            estimator=self.pipe,
            param_grid=self.param_grid,
            scoring="accuracy",
            cv=self.cv,
            return_train_score=True,
        )

        # Split and fit last, once every attribute they rely on exists.
        self._create_train_test_data()
        self.fit()

    def _set_data(self, X, y):
        """
        Set training and evaluation data (stored as copies of the inputs).

        Returns
        -------
        self
        """
        self.X = X.copy()
        self.y = y.copy()

        return self

    def get_data(self) -> tuple:
        """
        Get training and evaluation data.

        Returns
        -------
        tuple ``(X, y)`` holding copies of the stored data
        """
        return self.X.copy(), self.y.copy()

    def _create_train_test_data(self, k=None, seed=None):
        """
        Recreate training and testing data with a stratified split.

        Parameters
        ----------
        k : the test fraction is ``1 / k``; defaults to ``self.k``
        seed : random state of the split; defaults to ``self.seed``

        Values passed here apply to this call only; ``self.k`` and
        ``self.seed`` are left unchanged.

        Returns
        -------
        self
        """
        if seed is None:
            seed = self.seed
        if k is None:
            k = self.k

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=(1 / k), random_state=seed, stratify=self.y
        )

        return self

    def get_train_test_data(self):
        """
        Return X_train, X_test, y_train, y_test
        """
        return self.X_train, self.X_test, self.y_train, self.y_test

    def fit(self, X=None, y=None, **fit_params):
        """
        Train the pipeline on the training split.

        Parameters
        ----------
        X, y : optional new data. Only when BOTH are given is the stored data
            replaced and the train/test split recreated; otherwise the
            existing split is used.
        **fit_params : forwarded to ``Pipeline.fit`` (e.g.
            ``estimator__sample_weight``). Hyperparameters such as
            ``estimator__C`` are not fit parameters; ``optimize`` applies
            them with ``set_params``.

        Returns
        -------
        self
        """
        if X is not None and y is not None:
            # reset data and recreate training and testing data before fitting
            self._set_data(X, y)
            self._create_train_test_data()

        self.pipe.fit(self.X_train, self.y_train, **fit_params)

        return self

    def predict(self, X, y=None):
        """
        Get predicted value at X.

        ``y`` is accepted only for backward compatibility and is ignored:
        prediction needs the features alone.
        """
        return self.pipe.predict(X)

    def score(self) -> dict:
        """
        Return score (evaluation metric) for train and test data.

        The scores are also stored in ``self.train_score`` and
        ``self.test_score``.
        """
        self.train_score = self.pipe.score(self.X_train, self.y_train)
        self.test_score = self.pipe.score(self.X_test, self.y_test)

        return {
            "train_score": self.train_score,
            "test_score": self.test_score,
        }

    def optimize(self):
        """
        Optimize model hyperparameters and fit the model with optimized parameters.

        This example is with GridSearchCV, but more efficient algorithms can be implemented in practice.

        Returns
        -------
        self
        """
        self.optimization_pipe.fit(self.X_train, self.y_train)

        self.optimized_params = self.optimization_pipe.best_params_

        # Apply the best hyperparameters to the pipeline, then refit it.
        self.pipe.set_params(**self.optimized_params)
        self.fit()

        return self

    def get_optimized_params(self):
        """
        Return the parameters found by ``optimize`` (``None`` before it has run).
        """
        return self.optimized_params


In [None]:
### gs = optimization pipe
#print("Best Estimator: \n{}\n".format(gs.best_estimator_))
#print("Best Parameters: \n{}\n".format(gs.best_params_))
#print("Best Test Score: \n{}\n".format(gs.best_score_))
#print(
#    "Best Training Score: \n{}\n".format(
#        gs.cv_results_["mean_train_score"][gs.best_index_]
#    )
#)
#print("All Training Scores: \n{}\n".format(gs.cv_results_["mean_train_score"]))
#print("All Test Scores: \n{}\n".format(gs.cv_results_["mean_test_score"]))
# # This prints out all results during Cross-Validation in details
# print("All Meta Results During CV Search: \n{}\n".format(gs.cv_results_))


## Unit test subclasses

## Visualize model behaviour with toy data

## Output of this notebook

The result of this notebook is a collection of methods ready for evaluation with the real data.

The methods should be exported with `nbdev_build_lib`, but in the future this will be handled automatically by the pipeline.