In [119]:
import pandas as pd
import numpy as np
import scipy as sc

from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression, GammaRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

from statistics import linear_regression

In [120]:
df = pd.DataFrame()

for i in range(1, 4):
    tmp = pd.read_csv(f"./csvs/observations_round_4_day_{i}.csv")
    df = pd.concat([df, tmp])

df.head()

Unnamed: 0,timestamp,bidPrice,askPrice,transportFees,exportTariff,importTariff,sugarPrice,sunlightIndex
0,0,627.0,628.5,1.0,9.0,-3.0,200.0,60.0
1,100,630.0,631.5,1.0,9.0,-3.0,200.098445,60.0
2,200,630.0,631.5,1.0,9.0,-3.0,200.103915,60.0
3,300,632.0,633.5,1.0,9.0,-3.0,200.352462,60.01
4,400,630.5,632.0,1.0,9.0,-3.0,200.201545,60.01


In [121]:
df['midPrice'] = (df.bidPrice + df.askPrice) / 2
n_rows = len(df)

In [122]:
cols_x = ['transportFees', 'exportTariff', 'importTariff', 'sugarPrice', 'sunlightIndex']
cols_y = ['midPrice']

In [173]:
split = 0.9
x_train = df[cols_x].head(int(n_rows * split)).to_numpy()
y_train = df[cols_y].head(int(n_rows * split)).to_numpy()
x_test = df[cols_x].tail(int(n_rows * (1 - split))).to_numpy()
y_test = df[cols_y].tail(int(n_rows * (1 - split))).to_numpy()

In [174]:
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

In [175]:
model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train.ravel())
y_hat = model.predict(x_test)

print("coefs:", model.coef_)
print("intercept:", model.intercept_)

r2 = r2_score(y_test, y_hat)
print("R2 score:", r2)

mape = mean_absolute_percentage_error(y_test, y_hat)
print("MAPE:", mape)

coefs: [ 18.91411857 -38.41883619 -40.09542125  26.18971415 -32.53090046]
intercept: 648.6094444444443
R2 score: -0.269661432457025
MAPE: 0.0688427600253312


In [82]:
ind = 0

beta, intercept = linear_regression(x_train[:, ind], y_train)
y_hat = beta * x_test[:, ind] + intercept

print("coefs:", beta, intercept)

r2 = r2_score(y_test, y_hat)
print("R2 score:", r2)

mape = mean_absolute_percentage_error(y_test, y_hat)
print("MAPE:", mape)

coefs: -14.144575377750616 648.6094444444444
R2 score: -5.201337038002343
MAPE: 0.18245184701624667


In [107]:
cols_x = ['transportFees', 'exportTariff', 'importTariff', 'sugarPrice', 'sunlightIndex']
cols_y = ['bidPrice']

split = 0.9
x_train = df[cols_x].head(int(n_rows * split)).to_numpy()
y_train = df[cols_y].head(int(n_rows * split)).to_numpy()
x_test = df[cols_x].tail(int(n_rows * (1 - split))).to_numpy()
y_test = df[cols_y].tail(int(n_rows * (1 - split))).to_numpy()

scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train.ravel())
y_hat = model.predict(x_test)

print("coefs:", model.coef_)

r2 = r2_score(y_test, y_hat)
print("R2 score:", r2)

mape = mean_absolute_percentage_error(y_test, y_hat)
print("MAPE:", mape)

coefs: [ 18.91418741 -38.41878686 -40.09578986  26.19004229 -32.53037611]
R2 score: -0.26959398823513214
MAPE: 0.0689038486036644


In [115]:
cols_x = ['transportFees', 'exportTariff', 'importTariff', 'sugarPrice', 'sunlightIndex']
cols_y = ['bidPrice']

split = 0.9
x_train = df[cols_x].head(int(n_rows * split)).to_numpy()
y_train = df[cols_y].head(int(n_rows * split)).to_numpy()
x_test = df[cols_x].tail(int(n_rows * (1 - split))).to_numpy()
y_test = df[cols_y].tail(int(n_rows * (1 - split))).to_numpy()

scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

model = MLPRegressor(hidden_layer_sizes=(128, 32, 8), max_iter=1000)
model.fit(x_train, y_train.ravel())
y_hat = model.predict(x_test)

r2 = r2_score(y_test, y_hat)
print("R2 score:", r2)

mape = mean_absolute_percentage_error(y_test, y_hat)
print("MAPE:", mape)

R2 score: -0.4529993086643578
MAPE: 0.07692443086696756


In [257]:
split = 0.99
shift = 100

df_shifted = df.copy()
df_shifted[['bidPrice_shifted', 'askPrice_shifted', 'midPrice_shifted']] = df_shifted[['bidPrice', 'askPrice', 'midPrice']].shift(-shift)
df_shifted.dropna(inplace=True)
n_rows = len(df_shifted)
print("Number of points:", n_rows)

cols_x = ['transportFees', 'exportTariff', 'importTariff', 'sugarPrice', 'sunlightIndex']
cols_y_shifted = ['bidPrice_shifted']

x_train = df_shifted[cols_x].head(int(n_rows * split)).to_numpy()
y_train = df_shifted[cols_y_shifted].head(int(n_rows * split)).to_numpy()
x_test = df_shifted[cols_x].tail(int(n_rows * (1 - split))).to_numpy()
y_test = df_shifted[cols_y_shifted].tail(int(n_rows * (1 - split))).to_numpy()

model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train.ravel())
y_hat = model.predict(x_test)

print("coefs:", model.coef_)
print("intercept:", model.intercept_)

r2 = r2_score(y_test, y_hat)
print("R2 score:", r2)

mape = mean_absolute_percentage_error(y_test, y_hat)
print("MAPE:", mape)

mape_med = np.median(abs(y_test - y_hat) / y_test)
print("MAPE median:", mape_med)

Number of points: 29900
coefs: [ 58.54413036 -64.73096913 -53.46953419   4.38106356  -3.46720391]
intercept: 338.0459383436893
R2 score: -0.1718523196087265
MAPE: 0.017190967783840293
MAPE median: 0.019357208146313306


In [246]:
pd.DataFrame(abs(y_test[:-100] / y_test[100:] - 1)).describe()

Unnamed: 0,0
count,5879.0
mean,0.027058
std,0.02088
min,0.0
25%,0.010457
50%,0.02145
75%,0.039813
max,0.101325


coefs: [ 18.91411857 -38.41883619 -40.09542125  26.18971415 -32.53090046]
intercept: 648.6094444444443
R2 score: -0.269661432457025
MAPE: 0.0688427600253312