In [1]:
import os
import joblib
import pandas as pd
import data
import metrics
import app
import numpy as np

In [2]:
def evaluate_model(*, model, metric, X_train, y_train, X_test, y_test,dataset):
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_error = metric(y_train, train_predictions)
    test_error = metric(y_test, test_predictions)
    return {
        "train_predictions": train_predictions,
        "test_predictions": test_predictions,
        "train_error": train_error,
        "test_error": test_error
    }

def evaluate_model_baseline(*, model, metric, X_train, y_train, X_test, y_test,dataset):
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_error = metric(y_train, train_predictions,X_train)
    test_error = metric(y_test, test_predictions,X_test)
    return {
        "train_predictions": train_predictions,
        "test_predictions": test_predictions,
        "train_error": train_error,
        "test_error": test_error
    }

def print_report(*, model, evaluation):
    print(f"Model used:\n\t{model}")
    print(f"Error:\n\ttrain set {evaluation['train_error']}\n\tTest error: {evaluation['test_error']}")

## Baseline

In [3]:
dataset = app._get_dataset({'use_new_data': 'si',
                            'filepath': "./Datasets/Sales_train.csv",
                            'filepath_new_data': "./NewDataset/New_dataset.csv",
                            'process': False})

In [4]:
model_path = os.path.join("models", "Baseline", "model.joblib")
model = joblib.load(model_path)
evaluation = evaluate_model_baseline(
    model=model,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1],
    dataset = dataset
)
print_report(model=model, evaluation=evaluation)

Model used:
	Pipeline(steps=[('baseline',
                 <model.SalesPerCategory object at 0x0000025873C2E1C0>)])
Error:
	train set {'cnt_error': 854165.3511036371, 'total_money': 741121379.3659881}
	Test error: {'cnt_error': 363387.532712415, 'total_money': 545070801.7491974}


# Logistic Regression

In [5]:
dataset = app._get_dataset({'use_new_data': 'si',
                            'filepath': "./Datasets/Sales_train.csv",
                            'filepath_new_data': "./NewDataset/New_dataset.csv",
                            'process': True})

In [6]:
model_path = os.path.join("models", "Logistic_Regression", "model.joblib")
model = joblib.load(model_path)
evaluation = evaluate_model(
    model=model,
    metric=metrics.mean_absolute_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1],
    dataset = dataset
)
print_report(model=model, evaluation=evaluation)

Model used:
	Pipeline(steps=[('logistic-regressor', LogisticRegression(random_state=0))])
Error:
	train set 0.30032137903974937
	Test error: 0.34464922376235907


## gridsearchcv

In [7]:
x = pd.DataFrame([[1,2,3],[2,3,4],[3,4,5],[4,5,6],[5,6,7],[6,7,8],[7,8,9],[8,9,10],[9,10,11],[10,11,12]])
y = pd.DataFrame([1,2,3,4,5,6,7,8,9,10])

In [8]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]

In [11]:
indices = np.arange(10)
yield indices[0:2]

SyntaxError: 'yield' outside function (<ipython-input-11-812fd1b005e7>, line 2)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
model = LogisticRegression()
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, x,y, cv=btscv, scoring='r2')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

In [None]:
X_train=dataset["train"][0]