In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
import pickle
from sklearn.preprocessing import MinMaxScaler

In [None]:
df_aapl = pd.read_csv("AAPL.csv")
df_googl = pd.read_csv("GOOGL.csv")
df_meta = pd.read_csv("META.csv")
df_nvda = pd.read_csv("NVDA.csv")
df_tsla = pd.read_csv("TSLA.csv")
df_aapl["Date"] = pd.to_datetime(df_aapl["Date"], format="%Y-%m-%d")
df_googl["Date"] = pd.to_datetime(df_googl["Date"], format="%Y-%m-%d")
df_meta["Date"] = pd.to_datetime(df_meta["Date"], format="%Y-%m-%d")
df_nvda["Date"] = pd.to_datetime(df_nvda["Date"], format="%Y-%m-%d")
df_tsla["Date"] = pd.to_datetime(df_tsla["Date"], format="%Y-%m-%d")
df_aapl["Volume"] = df_aapl["Volume"].astype(float)
df_googl["Volume"] = df_googl["Volume"].astype(float)
df_meta["Volume"] = df_meta["Volume"].astype(float)
df_nvda["Volume"] = df_nvda["Volume"].astype(float)
df_tsla["Volume"] = df_tsla["Volume"].astype(float)
df = pd.merge(df_aapl, df_googl, on="Date", suffixes=("", "_googl"))
df = pd.merge(df, df_meta, on="Date", suffixes=("", "_meta"))
df = pd.merge(df, df_nvda, on="Date", suffixes=("", "_nvda"))
df = pd.merge(df, df_tsla, on="Date", suffixes=("_aapl", "_tsla"))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'
split = df["Date"][(len(df["Date"])*15)//20]
names = df.columns
y_names = ["Date"]
for val in names:
    if val[0] == "O" or val[0] == "C":
        y_names.append(val)
no_date = names[1:]
no_date_y = y_names[1:]
X = df.copy()
y = df.copy()[y_names]
X_train = pd.DataFrame(columns=X.columns)
X_test = pd.DataFrame(columns=X.columns)
X_test["X_date"] = pd.Series(dtype=X["Date"].dtype)
X_train = X[:100]
X_train["X_date"] = X["Date"][99]
prev_val = X["Date"][99]
for val in X["Date"][100:-10]:
    if val < split:
        X_train_new = X_train[X_train["X_date"] == prev_val][1:]
        X_train_new = pd.concat([X_train_new, X[X["Date"] == val]])
        X_train_new["X_date"] = val
        X_train = pd.concat([X_train, X_train_new])
        prev_val = val
    elif val == split:
        X_test = X_train[X_train["X_date"] == prev_val][1:]
        X_test = pd.concat([X_test, X[X["Date"] == val]])
        X_test["X_date"] = val
        prev_val = val
    else:
        X_test_new = X_test[X_test["X_date"] == prev_val][1:]
        X_test_new = pd.concat([X_test_new, X[X["Date"] == val]])
        X_test_new["X_date"] = val
        X_test = pd.concat([X_test, X_test_new])
        prev_val = val
X_train = X_train.groupby("X_date").agg(lambda x: list(x))
X_test = X_test.groupby("X_date").agg(lambda x: list(x))
y_train = pd.DataFrame(columns=y.columns)
y_train["X_date"] = pd.Series(dtype=y["Date"].dtype)
y_test = pd.DataFrame(columns=y.columns)
y_test["X_date"] = pd.Series(dtype=y["Date"].dtype)
y_train = y[100:110]
y_train["X_date"] = y["Date"][99]
prev_val = y["Date"][99]
for val in y["Date"][100:-10]:
    if val < split:
        y_train_new = y_train[y_train["X_date"] == prev_val][1:]
        y_train_new = pd.concat([y_train_new, y[y["Date"] == val]])
        y_train_new["X_date"] = val
        y_train = pd.concat([y_train, y_train_new])
        prev_val = val
    elif val == split:
        y_test = y_train[y_train["X_date"] == prev_val][1:]
        y_test = pd.concat([y_test, y[y["Date"] == val]])
        y_test["X_date"] = val
        prev_val = val
    else:
        y_test_new = y_test[y_test["X_date"] == prev_val][1:]
        y_test_new = pd.concat([y_test_new, y[y["Date"] == val]])
        y_test_new["X_date"] = val
        y_test = pd.concat([y_test, y_test_new])
        prev_val = val
y_train = y_train.groupby("X_date").agg(lambda x: list(x))
y_test = y_test.groupby("X_date").agg(lambda x: list(x))

In [None]:
def create_regressor(n_estimators=1000, max_depth=15, eta=0.1):
    return MultiOutputRegressor(xgb.XGBRegressor(objective="reg:squarederror", n_estimators=n_estimators, max_depth=max_depth, eta=eta))

def fit_regress(X_train, y_train, model):
    y_train_used = np.array(y_train[:][no_date_y].values.tolist())
    y_train_used = y_train_used.reshape((len(y_train_used), 100))
    X_train_used = np.array(X_train[:][no_date].values.tolist())
    X_train_used = X_train_used.reshape((len(X_train_used), 2500))
    return model.fit(X_train_used, y_train_used)

def predict_regress(X_test, model):
    X_test_used = np.array(X_test[:][no_date].values.tolist())
    X_test_used = X_test_used.reshape((len(X_test_used), 2500))
    return model.predict(X_test_used)

def evaluate_regress(X_test, y_test, model):
    y_test_used = np.array(y_test[:][no_date_y].values.tolist())
    y_test_used = y_test_used.reshape((len(y_test_used), 100))
    return mean_squared_error(predict_regress(X_test, model), y_test_used)

def regressor_grid(n_estimators_lst=[1000], max_depth_lst=[15], eta_lst=[0.1]):
    models = []
    histories = []
    losses = []
    min_loss = float("inf")
    min_loss_index = -1
    min_loss_params = [None, None, None]
    i = 0
    for num in n_estimators_lst:
        for depth in max_depth_lst:
            for eta in eta_lst:
                models.append(create_regressor(num, depth, eta))
                histories.append(fit_regress(X_train, y_train, models[-1]))
                loss = evaluate_regress(X_test, y_test, models[-1])
                losses.append(loss)
                if loss < min_loss:
                    min_loss = loss
                    min_loss_index = i
                    min_loss_params = [num, depth, eta]
                i += 1
    return (models, histories, losses, min_loss_index, min_loss_params)

In [None]:
test_model_3 = create_regressor(10, 10, 0.3)
#test_model_2 = create_regressor(10, 15, 0.3)
#test_model_1 = create_regressor(10, 25, 0.3)
#fit_regress(X_train, y_train, test_model_1)
#print(evaluate_regress(X_test, y_test, test_model_1))
#fit_regress(X_train, y_train, test_model_2)
#print(evaluate_regress(X_test, y_test, test_model_2))
fit_regress(X_train, y_train, test_model_3)
print(evaluate_regress(X_test, y_test, test_model_3))

2878.05342516996


In [None]:
values = regressor_grid([3, 5, 10], [10, 15, 25], [0.1, 0.3, 0.05])

In [None]:
values[2]

[15947.209929872071,
 6898.812230559389,
 20158.77015285705,
 15947.209931262936,
 6898.812231871514,
 20158.770152855133,
 15947.209931262936,
 6898.812231871514,
 20158.770152855133,
 12120.9194109406,
 4356.256219727695,
 17349.516995156162,
 12120.919411743054,
 4356.256220956113,
 17349.51699552873,
 12120.919411743054,
 4356.256220956113,
 17349.51699552873,
 6971.415566102075,
 2878.05342516996,
 12305.137767710143,
 6971.415567624663,
 2878.040173232087,
 12305.13776783725,
 6971.415567624663,
 2878.03884951701,
 12305.13776783725]

In [None]:
#with open('model_2.pkl', 'wb') as f:
  #pickle.dump(test_model_1, f)

#with open('model_3.pkl', 'wb') as f:
  #pickle.dump(test_model_2, f)

with open('model_4.pkl', 'wb') as f:
  pickle.dump(test_model_3, f)

In [None]:
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

y_train_used = np.array(y_train[:][no_date_y].values.tolist())
y_train_used = y_train_used.reshape((len(y_train_used), 100))
X_train_used = np.array(X_train[:][no_date].values.tolist())
X_train_used = X_train_used.reshape((len(X_train_used), 2500))
y_test_used = np.array(y_test[:][no_date_y].values.tolist())
y_test_used = y_test_used.reshape((len(y_test_used), 100))
X_test_used = np.array(X_test[:][no_date].values.tolist())
X_test_used = X_test_used.reshape((len(X_test_used), 2500))

X_scaler.fit_transform(X_train_used)
y_scaler.fit_transform(y_train_used)

X_scaled = X_scaler.transform(X_train_used)
y_scaled = y_scaler.transform(y_train_used)

X_test_scaled = X_scaler.transform(X_test_used)
y_test_scaled = y_scaler.transform(y_test_used)

def fit_regress_scaled(X_train_scaled, y_train_scaled, model):
    return model.fit(X_train_scaled, y_train_scaled)

def predict_regress_scaled(X_test_scaled, model):
    return model.predict(X_test_scaled)

def evaluate_regress_scaled(X_test_scaled, y_test_scaled, model):
    return mean_squared_error(predict_regress_scaled(X_test_scaled, model), y_test_scaled)


def regressor_grid_scaled(n_estimators_lst=[1000], max_depth_lst=[15], eta_lst=[0.1]):
    models = []
    histories = []
    losses = []
    min_loss = float("inf")
    min_loss_index = -1
    min_loss_params = [None, None, None]
    i = 0
    for num in n_estimators_lst:
        for depth in max_depth_lst:
            for eta in eta_lst:
                models.append(create_regressor(num, depth, eta))
                histories.append(fit_regress_scaled(X_scaled, y_scaled, models[-1]))
                loss = evaluate_regress_scaled(X_test_scaled, y_test_scaled, models[-1])
                losses.append(loss)
                if loss < min_loss:
                    min_loss = loss
                    min_loss_index = i
                    min_loss_params = [num, depth, eta]
                i += 1
                print(i)
    return (models, histories, losses, min_loss_index, min_loss_params)

def inverse_scaler_evaluate(model, X_test_scaled, y_test):
    return mean_squared_error(y_scaler.inverse_transform(predict_regress_scaled(X_test_scaled, model)), y_test_used)

In [None]:
values_2 = regressor_grid_scaled([15], [25, 50], [0.3])

1
2


In [None]:
print(inverse_scaler_evaluate(values_2[0][0], X_test_scaled, y_test_used))

2679.6141846507603


In [None]:
with open('model_5.pkl', 'wb') as f:
  pickle.dump(values_2[0][0], f)