<a href="https://colab.research.google.com/github/DenuraTHEbest/InvestHere/blob/Amna_S/aspiFinal2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from google.colab import drive
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
import joblib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Mount Google Drive
drive.mount('/content/drive')

# Load datasets
aspi_data = pd.read_csv('/content/drive/My Drive/CSE_All_Share_Historical_Data_Processed.csv')
sentiment_data = pd.read_csv('/content/drive/My Drive/daily_weighted_scores_9-12.csv')

# Convert the Date columns to pandas datetime for both datasets
aspi_data['Date'] = pd.to_datetime(aspi_data['Date'])  # Already in YYYY-MM-DD format
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'])  # Already in YYYY-MM-DD format

# Filter for the common date range
common_start_date = max(aspi_data['Date'].min(), sentiment_data['Date'].min())
common_end_date = min(aspi_data['Date'].max(), sentiment_data['Date'].max())

aspi_data = aspi_data[(aspi_data['Date'] >= common_start_date) & (aspi_data['Date'] <= common_end_date)]
sentiment_data = sentiment_data[(sentiment_data['Date'] >= common_start_date) & (sentiment_data['Date'] <= common_end_date)]

# Sort both datasets by date in ascending order
aspi_data = aspi_data.sort_values(by='Date')
sentiment_data = sentiment_data.sort_values(by='Date')

# Merge datasets with lagged sentiment features
for lag in range(1, 6):  # 5 days lag
    sentiment_data[f'Lagged_Sentiment_{lag}'] = sentiment_data['Weighted_Sentiment'].shift(lag)

# Drop rows with NaN values introduced by shifting
sentiment_data = sentiment_data.dropna(subset=[f'Lagged_Sentiment_{lag}' for lag in range(1, 6)])

# Merge the sentiment data with the ASPI data
merged_data = pd.merge(aspi_data, sentiment_data[['Date'] + [f'Lagged_Sentiment_{lag}' for lag in range(1, 6)]],
                       on='Date', how='left')

# Create lagged price and volume features
merged_data['Price_Lag10'] = merged_data['Price'].shift(10)
merged_data['Vol_Lag10'] = merged_data['Vol.'].shift(10)
merged_data['Price_Lag20'] = merged_data['Price'].shift(20)
merged_data['Vol_Lag20'] = merged_data['Vol.'].shift(20)
merged_data['Price_Lag30'] = merged_data['Price'].shift(30)
merged_data['Vol_Lag30'] = merged_data['Vol.'].shift(30)

# Create target variables for the next 23 days
for i in range(1, 24):
    merged_data[f'Target_{i}'] = merged_data['Price'].shift(-i)

# Remove rows with NaN values (from shifting)
merged_data = merged_data.dropna()

# Features (X) and Target (y)
X = merged_data.drop(columns=['Date', 'Price'] + [f'Target_{i}' for i in range(1, 24)])
y = merged_data[[f'Target_{i}' for i in range(1, 24)]]

print("Input feature columns:", X.columns.tolist())

# Split the data into train (80%) and test (20%) while preserving time order
split_ratio = 0.8
split_index = int(len(X) * split_ratio)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

# Initialize and train Multi-Output RandomForestRegressor
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)

print("Model training completed.")

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model performance using Mean Absolute Error (MAE) for each day ahead
mae_scores = [mean_absolute_error(y_test.iloc[:, i], y_pred[:, i]) for i in range(y_test.shape[1])]

# Evaluate model performance using r2
from sklearn.metrics import r2_score
r2_scores = [r2_score(y_test.iloc[:, i], y_pred[:, i]) for i in range(y_test.shape[1])]

# Print MAE results
for i, mae in enumerate(mae_scores, 1):
    print(f"MAE for day {i}: {mae}")

for i, r2 in enumerate(r2_scores, 1):
    print(f"R2 for day {i}: {r2}")

# Save the trained model
joblib.dump(model, "aspi_forecast_model.pkl")

# Download the model to local machine
from google.colab import files
files.download("aspi_forecast_model.pkl")

print("Model saved and downloaded successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Input feature columns: ['Open', 'High', 'Low', 'Vol.', 'Change %', 'Price_Lag1', 'Vol_Lag1', 'Price_MA_5', 'Price_MA_10', 'Price_MA_20', 'Price_Volatility_5', 'Price_Volatility_20', 'Lagged_Sentiment_1', 'Lagged_Sentiment_2', 'Lagged_Sentiment_3', 'Lagged_Sentiment_4', 'Lagged_Sentiment_5', 'Price_Lag10', 'Vol_Lag10', 'Price_Lag20', 'Vol_Lag20', 'Price_Lag30', 'Vol_Lag30']
Training Data: (601, 23), Testing Data: (151, 23)
Model training completed.
MAE for day 1: 15.147444370860917
MAE for day 2: 92.14205960264667
MAE for day 3: 90.28729801324303
MAE for day 4: 94.11947218543091
MAE for day 5: 111.4937662251671
MAE for day 6: 145.63355231788046
MAE for day 7: 122.31557880794668
MAE for day 8: 119.09969735099291
MAE for day 9: 130.63780993377426
MAE for day 10: 121.62661721854315
MAE for day 11: 111.0879503311265
MAE for day 12: 124.74777350993394
MAE for day 1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model saved and downloaded successfully!
