In [2]:
# importing dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from feature_engineering import calculate_bollinger_bands, calculate_daily_return, calculate_macd, calculate_sma, average_true_range

In [3]:
# setting up mlflow tracking
import mlflow

# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# mlflow.set_experiment("stock-pred-exp")

%env MLFLOW_TRACKING_URI = sqlite:///mlruns.db

env: MLFLOW_TRACKING_URI=sqlite:///mlruns.db


In [4]:
mlflow.create_experiment(
   name='stock_pred_exp',
   artifact_location='testing_mlflow_artifacts',
   tags={'env': "dev", "version": "1.0.0"}
)

2024/02/07 14:45:46 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/02/07 14:45:46 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

'1'

In [5]:
mlflow.set_experiment(experiment_name='stock_pred_exp')
# OR

"""when running the mlflow and logging the artifacts, 
we can use this code to log to the specific experiment:

with mlflow.start_run(run_name="blah_blah", experiment_id = experiment.experiment_id) as run:


"""

'when running the mlflow and logging the artifacts, \nwe can use this code to log to the specific experiment:\n\nwith mlflow.start_run(run_name="blah_blah", experiment_id = experiment.experiment_id) as run:\n\n\n'

In [6]:
def read_dataframe(filename):
    df = pd.read_csv(filename, index_col='Datetime')
    return df

In [7]:
train_data = read_dataframe('../Data/EURUSD=X_5m.csv')
test_data  = read_dataframe('../Data/AAPL_5m.csv')

# Defines Dependent and Independent Variables
y_train = train_data[['Adj Close']]
y_test = test_data[['Adj Close']]

### Feature Engineering

In [8]:
# FEATURE ENGINEERING ON TRAIN AND TEST DATA
#Feature Engineering on train_set
atr_data = average_true_range(train_data)
macd_data = calculate_macd(train_data)
dr_data = calculate_daily_return(train_data)
bb_data = calculate_bollinger_bands(train_data)
sma_data = calculate_sma(train_data)
train_sets = sma_data


#Feature Engineering on train_set
atr_data = average_true_range(test_data)
macd_data = calculate_macd(test_data)
dr_data = calculate_daily_return(test_data)
bb_data = calculate_bollinger_bands(test_data)
sma_data = calculate_sma(test_data)
test_sets = sma_data

train_sets

Unnamed: 0_level_0,Open,Volume,ATR,MACD_Line,Signal_Line,MACD_Histogram,Daily Return,Upper Band,Lower Band,Simple Moving Avg
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-03-14 00:00:00+00:00,1.072731,0,,0.000000,0.000000,0.000000,,,,1.072041
2023-03-14 00:05:00+00:00,1.071926,0,,-0.000046,-0.000009,-0.000037,-0.053575,1.072566,1.070942,1.071754
2023-03-14 00:10:00+00:00,1.071582,0,,-0.000054,-0.000018,-0.000036,0.032154,1.072351,1.071195,1.071773
2023-03-14 00:15:00+00:00,1.071697,0,,-0.000050,-0.000024,-0.000026,0.010722,1.072308,1.071315,1.071811
2023-03-14 00:20:00+00:00,1.071926,0,,-0.000028,-0.000025,-0.000003,0.021441,1.072409,1.071351,1.071880
...,...,...,...,...,...,...,...,...,...,...
2023-06-05 16:05:00+01:00,1.071467,0,0.000590,0.000662,0.000701,-0.000039,-0.010703,1.073630,1.067999,1.070815
2023-06-05 16:10:00+01:00,1.071237,0,0.000394,0.000625,0.000686,-0.000061,0.010704,1.073585,1.068330,1.070958
2023-06-05 16:15:00+01:00,1.071467,0,0.000361,0.000579,0.000665,-0.000085,-0.010703,1.073516,1.068651,1.071084
2023-06-05 16:20:00+01:00,1.071467,0,0.000361,0.000556,0.000643,-0.000087,0.021419,1.073396,1.069057,1.071227


In [9]:
# DATA PREPROCESSING
train_dataset = pd.merge(train_sets, y_train, on='Datetime')
train_dataset = train_dataset.dropna()

test_dataset = pd.merge(test_sets, y_test, on='Datetime')
test_dataset = test_dataset.dropna()

In [10]:
x_train = train_dataset.drop(['Open', 'Adj Close', 'Volume'], axis=1)
y_train = train_dataset[['Adj Close']]

x_test = test_dataset.drop(['Open', 'Adj Close', 'Volume'], axis=1)
y_test = test_dataset[['Adj Close']]

# Dropping The Last Row of the x_train and the first row of the y_train  
x_train = x_train.drop(x_train.index[-1])
y_train = y_train.drop(y_train.index[0])

# Dropping The Last Row of the x_test and the first row of the y_test
x_test = x_test.drop(x_test.index[-1])
y_test = y_test.drop(y_test.index[0])

### MODEL ARCH

In [24]:
with mlflow.start_run():
    
    mlflow.sklearn.autolog()
    
    # logging parameters
    mlflow.log_param("train-data-path", "../Data/EURUSD=X_5m.csv")
    mlflow.log_param("test-data-path", "../Data/AAPL_5m.csv")
    mlflow.set_tag("developer", "Enoch")
    
    model = LinearRegression()
    degree = 1
    linear_model = make_pipeline( StandardScaler(), model)
    linear_model.fit(x_train, y_train)
    
    mlflow.set_tag('model', 'LinearRegressor')
    # MAKE PREDICTIONS AND CHECK THE VARIANCE AND BIAS VALUES
    train_pred = linear_model.predict(x_train)
    test_pred = linear_model.predict(x_test)
    
    train_mse = mean_squared_error(y_train, train_pred, squared=False)
    test_mse = mean_squared_error(y_test, test_pred, squared=False)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    mlflow.log_metric("rmse", test_mse)
    mlflow.log_metric("r2", test_r2)
    
    #mlflow.log_artifact(local_path="../models/Linear_Reg.pkl", artifact_path="models_pickle")

In [25]:
print('train mse:', train_mse)
print('train r2:', train_r2)
print('test mse:', test_mse)
print('test r2:', test_r2)

train mse: 0.00035554292608154454
train r2: 0.9991575178139349
test mse: 0.30991165941859333
test r2: 0.9982911037294748


#### Ridge Architecture

In [26]:
with mlflow.start_run():
    
    mlflow.sklearn.autolog()
    
    # logging parameters
    mlflow.log_param("polynomial degree", 1)
    mlflow.log_param("alpha", 0.1)
    
    degree = 1
    alpha = 0.1  # Regularization strength
    ridge_model = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), Ridge(alpha=alpha))
    ridge_model.fit(x_train, y_train)
    
    # MAKE PREDICTIONS AND CHECK THE VARIANCE AND BIAS VALUES
    train_pred = ridge_model.predict(x_train)
    test_pred = ridge_model.predict(x_test)
    mlflow.set_tag('model', 'RidgeRegressor')
    
    train_mse = mean_squared_error(y_train, train_pred, squared=False)
    test_mse = mean_squared_error(y_test, test_pred, squared=False)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    mlflow.log_metric("rmse", test_mse)
    mlflow.log_metric("r2", test_r2)

In [27]:


print('train mse:', train_mse)
print('train r2:', train_r2)
print('test mse:', test_mse)
print('test r2:', test_r2)

train mse: 0.0003555427720491671
train r2: 0.9991575185439143
test mse: 0.30955395242571454
test r2: 0.9982950463456918


### Lasso Architecture

In [28]:
with mlflow.start_run():
    
    mlflow.sklearn.autolog()
    
    # logging parameters
    #mlflow.log_param("train-data-path", "../Data/EURUSD=X_5m.csv")
    #mlflow.log_param("test-data-path", "../Data/AAPL_5m.csv")
    
    
    model = LinearRegression()
    degree = 1
    alpha = 0.00001  # Regularization strength
    lasso_model = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), Lasso(alpha=alpha))
    lasso_model.fit(x_train, y_train)
    
    mlflow.set_tag('model', 'LassoRegressor')
    # MAKE PREDICTIONS AND CHECK THE VARIANCE AND BIAS VALUES
    train_pred = lasso_model.predict(x_train)
    test_pred = lasso_model.predict(x_test)
    
    train_mse = mean_squared_error(y_train, train_pred, squared=False)
    test_mse = mean_squared_error(y_test, test_pred, squared=False)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    mlflow.log_metric("rmse", test_mse)
    mlflow.log_metric("r2", test_r2)

In [29]:


print('train mse:', train_mse)
print('train r2:', train_r2)
print('test mse:', test_mse)
print('test r2:', test_r2)

train mse: 0.0003560888486147838
train r2: 0.999154928629463
test mse: 0.35106833861197767
test r2: 0.9978070779264776


### Stochastic Gradient Descent Regressor with Ridge Regularization

In [30]:
from sklearn.linear_model import SGDRegressor
with mlflow.start_run():
    
    mlflow.sklearn.autolog()
     # logging parameters
    #mlflow.log_param("train-data-path", "../Data/EURUSD=X_5m.csv")
    #mlflow.log_param("test-data-path", "../Data/AAPL_5m.csv")

    degree = 1      # 1
    alpha = 0.002  # Regularization strength 0.0001
    max_iter = 115  # 120

    SGD_model = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), SGDRegressor(alpha=alpha, max_iter=max_iter))
    SGD_model.fit(x_train, y_train)
    mlflow.set_tag('model', 'SGDRegressor')
    
    
    # MAKE PREDICTIONS AND CHECK THE VARIANCE AND BIAS VALUES
    train_pred = SGD_model.predict(x_train)
    test_pred = SGD_model.predict(x_test)
    
    train_mse = mean_squared_error(y_train, train_pred, squared=False)
    test_mse = mean_squared_error(y_test, test_pred, squared=False)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    mlflow.log_metric("rmse", test_mse)
    mlflow.log_metric("r2", test_r2)


  y = column_or_1d(y, warn=True)


In [31]:
print('train mse:', train_mse)
print('train r2:', train_r2)
print('test mse:', test_mse)
print('test r2:', test_r2)

train mse: 0.00035738209390664787
train r2: 0.9991487792139596
test mse: 0.31108878493922765
test r2: 0.9982780974058042


In [11]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope


train = xgb.DMatrix(x_train, label=y_train)
valid = xgb.DMatrix(x_test, label=y_test)

In [14]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2   = r2_score(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2-score", r2)

    return {'loss': rmse, 'r2-score': r2, 'status': STATUS_OK}

In [15]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:165.84187                         
[1]	validation-rmse:165.78875                         
[2]	validation-rmse:165.74063                         
[3]	validation-rmse:165.69743                         
[4]	validation-rmse:165.65837                         
[5]	validation-rmse:165.62338                         
[6]	validation-rmse:165.59167                         
[7]	validation-rmse:165.56283                         
[8]	validation-rmse:165.53713                         
[9]	validation-rmse:165.51361                         
[10]	validation-rmse:165.49273                        
[11]	validation-rmse:165.47354                        
[12]	validation-rmse:165.45649                        
[13]	validation-rmse:165.44096                        
[14]	validation-rmse:165.42685                        
[15]	validation-rmse:165.41432                        
[16]	validation-rmse:165.40294                        
[17]	validation-rmse:165.39276                        
[18]	valid