In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import joblib

In [2]:
data = pd.read_csv('preprocessed_data.csv')
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,7-day MA,30-day MA,7-day Volatility,30-day Volatility,RSI
0,2014-10-16,0.003230,0.002728,0.003044,0.003034,382.556000,0.000060,0.002711,0.002535,0.003157,0.003236,0.501092
1,2014-10-17,0.003056,0.002534,0.003079,0.003052,383.757996,0.000022,0.002759,0.002496,0.002537,0.002884,0.563976
2,2014-10-18,0.003074,0.002675,0.003133,0.003166,391.441986,0.000016,0.002823,0.002478,0.001476,0.002763,0.744431
3,2014-10-19,0.003182,0.002657,0.003246,0.003138,389.545990,0.000000,0.002847,0.002475,0.001148,0.002754,0.788496
4,2014-10-20,0.003152,0.002601,0.003122,0.003038,382.845001,0.000030,0.002831,0.002461,0.001289,0.002696,0.720995
...,...,...,...,...,...,...,...,...,...,...,...,...
2679,2022-02-15,0.629476,0.648248,0.639166,0.658821,44575.203125,0.064724,0.657307,0.637212,0.225850,0.332232,0.698551
2680,2022-02-16,0.659040,0.646951,0.653750,0.649719,43961.859375,0.056378,0.656482,0.638124,0.211000,0.338845,0.768211
2681,2022-02-17,0.649522,0.640458,0.605309,0.598912,40538.011719,0.074768,0.649861,0.637145,0.291232,0.335681,0.588718
2682,2022-02-18,0.599281,0.593740,0.596070,0.591388,40030.976563,0.066401,0.644661,0.636232,0.365938,0.333842,0.412201


In [3]:
# Prepare data for training
X = data[['Open', 'High', 'Low', 'Volume', 'Adj Close', '7-day MA', '30-day MA', '7-day Volatility', '30-day Volatility', 'RSI']]
y = data['Close']
y

Unnamed: 0,Close
0,0.003034
1,0.003052
2,0.003166
3,0.003138
4,0.003038
...,...
2679,0.658821
2680,0.649719
2681,0.598912
2682,0.591388


In [4]:
# Shift target column to predict next day's closing price
y = y.shift(-1)
data.dropna(inplace=True)
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,7-day MA,30-day MA,7-day Volatility,30-day Volatility,RSI
0,2014-10-16,0.003230,0.002728,0.003044,0.003034,382.556000,0.000060,0.002711,0.002535,0.003157,0.003236,0.501092
1,2014-10-17,0.003056,0.002534,0.003079,0.003052,383.757996,0.000022,0.002759,0.002496,0.002537,0.002884,0.563976
2,2014-10-18,0.003074,0.002675,0.003133,0.003166,391.441986,0.000016,0.002823,0.002478,0.001476,0.002763,0.744431
3,2014-10-19,0.003182,0.002657,0.003246,0.003138,389.545990,0.000000,0.002847,0.002475,0.001148,0.002754,0.788496
4,2014-10-20,0.003152,0.002601,0.003122,0.003038,382.845001,0.000030,0.002831,0.002461,0.001289,0.002696,0.720995
...,...,...,...,...,...,...,...,...,...,...,...,...
2679,2022-02-15,0.629476,0.648248,0.639166,0.658821,44575.203125,0.064724,0.657307,0.637212,0.225850,0.332232,0.698551
2680,2022-02-16,0.659040,0.646951,0.653750,0.649719,43961.859375,0.056378,0.656482,0.638124,0.211000,0.338845,0.768211
2681,2022-02-17,0.649522,0.640458,0.605309,0.598912,40538.011719,0.074768,0.649861,0.637145,0.291232,0.335681,0.588718
2682,2022-02-18,0.599281,0.593740,0.596070,0.591388,40030.976563,0.066401,0.644661,0.636232,0.365938,0.333842,0.412201


In [5]:
# Update X and y after dropping NaN rows
X = data[['Open', 'High', 'Low', 'Volume', 'Adj Close', '7-day MA', '30-day MA', '7-day Volatility', '30-day Volatility', 'RSI']]
y = data['Close']
y

Unnamed: 0,Close
0,0.003034
1,0.003052
2,0.003166
3,0.003138
4,0.003038
...,...
2679,0.658821
2680,0.649719
2681,0.598912
2682,0.591388


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)

In [8]:
# Random Forest
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

In [9]:
# XGBoost
xgb_reg = XGBRegressor(random_state=42, n_estimators=100)
xgb_reg.fit(X_train, y_train)
y_pred_xgb = xgb_reg.predict(X_test)

In [10]:
# LSTM
X_lstm = np.array(X).reshape(X.shape[0], 1, X.shape[1])
X_train_lstm, X_test_lstm = X_lstm[:len(X_train)], X_lstm[len(X_train):]

def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model

lstm_model = build_lstm_model((X_train_lstm.shape[1], X_train_lstm.shape[2]))
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=1)
y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()

  super().__init__(**kwargs)


Epoch 1/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.0645
Epoch 2/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0630
Epoch 3/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0624
Epoch 4/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0591
Epoch 5/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0614
Epoch 6/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0608
Epoch 7/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0588
Epoch 8/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0574
Epoch 9/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0592
Epoch 10/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0610
[1m17/17

In [11]:
# Evaluation Metrics
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"{model_name} - MSE: {mse}, MAE: {mae}, RMSE: {rmse}")

evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_xgb, "XGBoost")
evaluate_model(y_test, y_pred_lstm, "LSTM")

Linear Regression - MSE: 1.9079759624663725e-32, MAE: 9.084746036091156e-17, RMSE: 1.3812950309279956e-16
Random Forest - MSE: 3.860246762780477e-06, MAE: 0.00039608918858559107, RMSE: 0.001964751068909361
XGBoost - MSE: 1.1650555165882558e-05, MAE: 0.001291405273320746, RMSE: 0.0034132909582809605
LSTM - MSE: 0.051530624161792805, MAE: 0.15915740646951257, RMSE: 0.22700357742069355


In [12]:
# Save models
joblib.dump(lin_reg, 'linear_regression_model.pkl')
joblib.dump(rf_reg, 'random_forest_model.pkl')
joblib.dump(xgb_reg, 'xgboost_model.pkl')
lstm_model.save('lstm_model.h5')

print("Models trained and saved successfully.")



Models trained and saved successfully.
