In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# This notebook should be run under "scripts/"
#
# Put the directory two levels above the current working directory on
# sys.path (only once). Presumably that directory contains the
# BusinessAnalyticsProject package imported below -- confirm if the layout
# changes.

import sys
import os

# Nested dirname() calls are used instead of `for _ in range(2)`: binding `_`
# in the notebook namespace stops IPython from updating its last-output
# variables (`_`, `__`, `___`) for the rest of the session.
PATH = os.path.dirname(os.path.dirname(os.getcwd()))

if PATH not in sys.path:
    sys.path.append(PATH)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


from BusinessAnalyticsProject.dataset.feature_engineering import train_test_split
from BusinessAnalyticsProject.models.model_selection import (
    create_pipeline,
    hyperparam_tuning,
    metrics,
    benchmark,
    OLS,RR
)
# from BusinessAnalyticsProject.configs.RF import CONFIG as RF_CONFIG

from BusinessAnalyticsProject.models.model_interpretation import (
    create_benchmark_plot,
    create_benchmark_plot_with_p,
    create_feature_importance_plot
)

In [4]:
# Apply seaborn's default theme to the figures created later in the notebook.
sns.set_theme()

## Read in Features

In [5]:
# Load the processed feature tables: the first CSV column becomes the index
# and the "Date" column is parsed as datetimes.
all_train_df = pd.read_csv(
    "../data/processed/all_train.csv",
    index_col=[0],
    parse_dates=["Date"],
)
all_test_df = pd.read_csv(
    "../data/processed/all_test.csv",
    index_col=[0],
    parse_dates=["Date"],
)

# Only the training table is split in this cell; all_test_df is loaded but
# not used here.
(
    X_train, y_train, weights_train,
    X_test, y_test, weights_test,
) = train_test_split(all_train_df)

# Display the shape of every split as a quick sanity check.
tuple(
    split.shape
    for split in (X_train, y_train, weights_train, X_test, y_test, weights_test)
)

((294132, 15), (294132,), (294132,), (127438, 15), (127438,), (127438,))

# Ridge Regression 岭回归

In [6]:
# Build the pipeline with the model-class and params slots left as None and
# the normalization and one-hot flags switched on.
# NOTE(review): if_winsorization is not passed here, unlike the other
# create_pipeline calls in this notebook, so the function's default applies --
# confirm that is intended.
pipeline_data = create_pipeline(None, None, if_data_normalization=True, if_to_one_hot=True)
pipeline_data

In [7]:
# Run the project's RR helper on the train/test splits using the pipeline
# built above. It returns three values; the second is the metrics dict shown
# in the next cell.
# NOTE(review): the third value is bound to `ret` here but to `r2` in the
# Lasso cell; its meaning is not visible in this notebook -- confirm and unify.
results, metrics_val, ret = RR(
    pipeline_data,
    X_train, y_train, weights_train,
    X_test, y_test, weights_test
)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [8]:
# Show the current metrics dict (r2_score, rmse and rwmse in the saved output).
metrics_val

{'r2_score': 0.6528127240466799,
 'rmse': 13033.947995451934,
 'rwmse': 5127111.253333275}

# Hyperparameter Tuning (Ridge Regression)

In [7]:
# Load and display the project's Ridge configuration. The cells below read its
# "model_cls", "params" and "param_space" entries.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# top import cell so a fresh Restart & Run All shows all dependencies up front.
# NOTE(review): the saved output lists alphas as small as 1e-15, which amounts
# to almost no regularization -- confirm these values are intended.
from BusinessAnalyticsProject.configs.RR import CONFIG as RR_CONFIG
RR_CONFIG

{'model_cls': sklearn.linear_model._ridge.Ridge,
 'params': {'alpha': 1},
 'param_space': {'alpha': [1e-15,
   1e-12,
   1e-10,
   1e-05,
   0.001,
   0.01,
   0.05,
   0.08,
   0.1,
   0.2,
   0.3,
   0.4,
   0.5,
   1]}}

In [10]:
# Build the full Ridge pipeline from the configured estimator class and its
# default params, with winsorization off and the normalization and one-hot
# flags on.
pipeline = create_pipeline(
    RR_CONFIG["model_cls"],
    RR_CONFIG["params"],
    if_winsorization=False,
    if_data_normalization=True,
    if_to_one_hot=True,
)
pipeline

In [11]:
# Search the Ridge "param_space" with the project's hyperparam_tuning helper.
# The saved log shows 5-fold fits, one candidate per round.
# save_dir presumably receives the fitted search/model artifacts -- confirm
# against hyperparam_tuning.
# Note: metrics_val is rebound here and replaces the baseline Ridge metrics.
opt, metrics_val = hyperparam_tuning(
    pipeline,
    RR_CONFIG["param_space"],
    X_train, y_train, weights_train,
    X_test, y_test, weights_test,
    save_dir="../outputs/models/RR/",
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits


# Lasso Regression

In [9]:
from BusinessAnalyticsProject.models.model_selection import LSO

In [10]:
# Build the pipeline for the Lasso baseline: the model-class and params slots
# are None, winsorization is off, and the normalization and one-hot flags are
# on. This rebinds the pipeline_data name used in the Ridge section.
pipeline_data = create_pipeline(
    None,
    None,
    if_winsorization=False,
    if_data_normalization=True,
    if_to_one_hot=True,
)
pipeline_data

In [11]:
# Run the project's LSO helper on the train/test splits using the pipeline
# built above. It returns three values; the second is the metrics dict shown
# in the next cell.
# NOTE(review): the third value is bound to `r2` here but to `ret` in the
# Ridge cell; its meaning is not visible in this notebook -- confirm and unify.
results, metrics_val, r2 = LSO(
    pipeline_data,
    X_train, y_train, weights_train,
    X_test, y_test, weights_test
)

  model = cd_fast.enet_coordinate_descent(


In [12]:
# Show the current metrics dict (r2_score, rmse and rwmse in the saved output).
metrics_val

{'r2_score': 0.6528360344674335,
 'rmse': 13033.510433460378,
 'rwmse': 5126962.560981997}

# Hyperparameter Tuning (Lasso)

In [14]:
# Load and display the project's Lasso configuration. The cells below read its
# "model_cls", "params" and "param_space" entries.
# NOTE(review): in the saved output the default alpha (0.001) lies outside the
# search space (largest candidate 0.0001) -- confirm this is intended.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# top import cell.
from BusinessAnalyticsProject.configs.LSO import CONFIG as LSO_CONFIG
LSO_CONFIG

{'model_cls': sklearn.linear_model._coordinate_descent.Lasso,
 'params': {'alpha': 0.001},
 'param_space': {'alpha': [1e-10, 1e-08, 1e-06, 1e-05, 0.0001]}}

In [16]:
# Build the full Lasso pipeline from the configured estimator class and its
# default params, with winsorization off and the normalization and one-hot
# flags on. This rebinds the `pipeline` name used in the Ridge tuning section.
pipeline = create_pipeline(
    LSO_CONFIG["model_cls"],
    LSO_CONFIG["params"],
    if_winsorization=False,
    if_data_normalization=True,
    if_to_one_hot=True
)
pipeline

In [17]:
# Search the Lasso "param_space" with the project's hyperparam_tuning helper.
# The saved log shows 5-fold fits, one candidate per round.
# save_dir presumably receives the fitted search/model artifacts -- confirm
# against hyperparam_tuning.
# Note: opt and metrics_val are rebound here and replace the earlier values.
opt, metrics_val = hyperparam_tuning(
    pipeline,
    LSO_CONFIG["param_space"],
    X_train, y_train, weights_train,
    X_test, y_test, weights_test,
    save_dir="../outputs/models/LSO/",
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits


  model = cd_fast.sparse_enet_coordinate_descent(
