In [34]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
# Bayesian hyperparameter tuning
import optuna
# models to try
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor
from xgboost import XGBRegressor
from myvars import input_features
from myclasses import FeatureGenerator, Windsorizer


# Notebook display settings: show up to 200 rows and 200 columns before
# pandas truncates a frame, and render floats with five decimal places.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda value: '%.5f' % value)

# Simulate future data

In [26]:
# Load the processed dataset and discard any row that has a missing value
# in one of the columns listed in `input_features`.
data = (
    pd.read_csv('../data/processed/data.csv')
    .dropna(subset=input_features)
)

In [27]:
# Split by calendar year: rows after 2018 are written out as the held-out
# "future" test set, rows up to and including 2018 as the training set.
# Neither file includes the DataFrame index.
data.query("calendarYear > 2018").to_csv(
    '../data/processed/test.csv',
    index=False,
)
data.query("calendarYear <= 2018").to_csv(
    '../data/processed/train.csv',
    index=False,
)

# Train on different models

In [29]:
# Reload the training split from disk and remove the `symbol` and
# `calendarYear` columns so they are not passed on to the models.
data = (
    pd.read_csv('../data/processed/train.csv')
    .drop(columns=['symbol', 'calendarYear'])
)

#### Train test split

In [31]:
# Hold out 25% of the training rows for evaluation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='target'), data['target'],
    test_size=0.25, random_state=46,
)
# Show the shape of each piece as a quick sanity check.
[part.shape for part in (X_train, X_test, y_train, y_test)]

[(6023, 112), (2008, 112), (6023,), (2008,)]

#### Linear Regression

In [None]:
# Linear-regression baseline. Steps run in the order listed:
#   1. FeatureGenerator  - project class (presumably derives extra features; confirm in myclasses)
#   2. Windsorizer       - project class (presumably clips extreme values; confirm in myclasses)
#   3. PowerTransformer  - scikit-learn power transform with default settings
#   4. LinearRegression  - the final estimator
linear_pipe = Pipeline([
    ('feature_generator', FeatureGenerator()),
    ('feature_clipper', Windsorizer()),
    ('normalizer', PowerTransformer()),
    ('linear_regression', LinearRegression()),
])