<a href="https://colab.research.google.com/github/Deebest-maker/ML-stock-forecasting-model/blob/main/stock_forecasting_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📈 Stock Market Forecasting Using LSTM & Random Forest
This project builds and compares two machine learning models to forecast stock market closing prices:
- **LSTM Deep Learning Model**
- **Random Forest Machine Learning Model**

Both models will be trained on historical stock price data downloaded from Yahoo Finance.


In [None]:
# Install required libraries (run this cell in Colab; Colab already preinstalls most)
!pip install yfinance scikit-learn tensorflow matplotlib -q


In [None]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")


In [None]:
# Download stock data
STOCK_TICKER = "AAPL"  # change to any ticker like "TSLA", "GOOGL", "BTC-USD"
START_DATE = "2015-01-01"
END_DATE = "2024-12-31"

print(f"üìä Downloading {STOCK_TICKER} data...")
data = yf.download(STOCK_TICKER, start=START_DATE, end=END_DATE)

# Stop here with a clear message rather than failing later with an obscure
# shape or KeyError when the download returned nothing (bad ticker, no network).
if data is None or data.empty:
    raise ValueError(
        f"No price data returned for {STOCK_TICKER!r} between {START_DATE} and {END_DATE}"
    )

# Recent yfinance releases can return two-level columns (field, ticker) even for
# a single ticker. The rest of the notebook expects flat column names such as
# 'Close', so keep only the field level.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

print(f"‚úÖ Downloaded {len(data)} rows of price data")
data.head()


In [None]:
# Display uploaded screenshot (for reference)
# The image lives at a machine-specific absolute path, so it is shown only when
# the file is actually present; otherwise the cell is skipped instead of
# raising and breaking "Run All".
from pathlib import Path
from IPython.display import Image, display

SCREENSHOT_PATH = Path("/mnt/data/75ad14bb-a9fb-46e5-8256-835a32e862fe.png")

if SCREENSHOT_PATH.exists():
    display(Image(filename=str(SCREENSHOT_PATH)))
else:
    print(f"Reference screenshot not found at {SCREENSHOT_PATH}; skipping display.")


In [None]:
# Visualize closing prices: one line chart of the downloaded 'Close' column
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(data['Close'], label='Close Price')
ax.set_title(f'{STOCK_TICKER} Stock Price History')
ax.set_xlabel('Date')
ax.set_ylabel('Price (USD)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Prepare training and testing data (80% train / 20% test)
# The split is positional (no shuffling): the first 80% of rows are used for
# training and the remaining rows for testing.
df = data[['Close']].dropna()
split_index = int(len(df) * 0.8)
train_data, test_data = df.iloc[:split_index], df.iloc[split_index:]
print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")


In [None]:
# LSTM model: prepare sequences, build, train, predict
# Fit the scaler on the training rows only, so the min/max of the held-out
# period never leaks into preprocessing, then apply that same scaling to the
# whole series. Test-period values can therefore fall outside [0, 1].
scaler_lstm = MinMaxScaler(feature_range=(0,1))
scaler_lstm.fit(df.iloc[:split_index])
scaled_data = scaler_lstm.transform(df)

scaled_train = scaled_data[:split_index]
scaled_test = scaled_data[split_index:]

def create_sequences(data, window=60):
    """Turn a 2-D series into (input window, next value) training pairs.

    Args:
        data: 2-D array; only column 0 is read.
        window: number of consecutive rows that make up one input sequence.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X[k]`` holds rows ``k .. k+window-1``
        of column 0 and ``y[k]`` is the value at row ``k+window``. Both arrays
        are empty when ``len(data) <= window``.
    """
    target_positions = range(window, len(data))
    inputs = [data[pos - window:pos, 0] for pos in target_positions]
    targets = [data[pos, 0] for pos in target_positions]
    return np.array(inputs), np.array(targets)

WINDOW = 60

# Seed Python, NumPy and TensorFlow so weight initialisation and dropout are
# repeatable between runs (as far as the compute backend allows).
tf.keras.utils.set_random_seed(42)

X_train_lstm, y_train_lstm = create_sequences(scaled_train, WINDOW)

# Build the test windows from a slice that starts WINDOW rows before the split.
# Those leading rows are used only as model inputs (already-known history), so
# every row of the test period gets a prediction instead of the first WINDOW
# test rows being consumed as warm-up.
test_context_start = max(split_index - WINDOW, 0)
X_test_lstm, y_test_lstm = create_sequences(scaled_data[test_context_start:], WINDOW)

# Keras LSTM layers expect input shaped (samples, timesteps, features)
X_train_lstm = X_train_lstm.reshape(-1, WINDOW, 1)
X_test_lstm = X_test_lstm.reshape(-1, WINDOW, 1)

# Three stacked LSTM layers with dropout, followed by a single-value regression head
model_lstm = Sequential([
    LSTM(50, return_sequences=True, input_shape=(WINDOW,1)),
    Dropout(0.2),
    LSTM(50, return_sequences=True),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(1)
])

model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Train the model (reduce epochs if you're running on limited runtime)
history = model_lstm.fit(
    X_train_lstm, y_train_lstm,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

# Predict on the test windows and map predictions and targets back to price units
pred_lstm = model_lstm.predict(X_test_lstm)
pred_lstm = scaler_lstm.inverse_transform(pred_lstm)
y_actual_lstm = scaler_lstm.inverse_transform(y_test_lstm.reshape(-1,1))


In [None]:
# LSTM evaluation metrics: MAE, RMSE and R2 of the LSTM predictions
# against the actual test values.
mse_lstm = mean_squared_error(y_actual_lstm, pred_lstm)
mae_lstm = mean_absolute_error(y_actual_lstm, pred_lstm)
rmse_lstm = np.sqrt(mse_lstm)
r2_lstm = r2_score(y_actual_lstm, pred_lstm)

print("üìä LSTM Metrics:")
for metric_name, metric_value in (("MAE", mae_lstm), ("RMSE", rmse_lstm), ("R2", r2_lstm)):
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
# Random Forest: feature engineering, train, predict
def create_features(df, window=60):
    """Build lag and rolling-window features for predicting ``Close``.

    Every feature of a row is computed from *earlier* rows only, so no feature
    contains the value being predicted. The rolling statistics are therefore
    taken over the series shifted by one row; computing them on the unshifted
    series would fold the current row's ``Close`` (the target) into its own
    features.

    Args:
        df: DataFrame with a ``Close`` column. It is not modified.
        window: number of lag columns to create (``lag_1`` .. ``lag_<window>``).

    Returns:
        A new DataFrame with the original columns followed by the lag columns
        and ``roll_mean_7``, ``roll_std_7``, ``roll_mean_30``, ``roll_std_30``.
        Rows containing any NaN (the warm-up period) are dropped.
    """
    close = df['Close']
    prior_close = close.shift(1)  # the series as known before the current row

    feature_columns = {f'lag_{i}': close.shift(i) for i in range(1, window + 1)}
    feature_columns['roll_mean_7'] = prior_close.rolling(7).mean()
    feature_columns['roll_std_7'] = prior_close.rolling(7).std()
    feature_columns['roll_mean_30'] = prior_close.rolling(30).mean()
    feature_columns['roll_std_30'] = prior_close.rolling(30).std()

    # Attach all feature columns in one step rather than inserting them one by one
    features = pd.DataFrame(feature_columns, index=df.index)
    return pd.concat([df, features], axis=1).dropna()

# Build the feature table, then split it 80/20 by position
df_rf = create_features(df)
split_rf = int(len(df_rf) * 0.8)

# Column 0 is the target; every remaining column is a model input
features_rf = df_rf.iloc[:, 1:]
target_rf = df_rf.iloc[:, 0]

X_train_rf, X_test_rf = features_rf.iloc[:split_rf], features_rf.iloc[split_rf:]
y_train_rf, y_test_rf = target_rf.iloc[:split_rf], target_rf.iloc[split_rf:]

rf_params = dict(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
model_rf = RandomForestRegressor(**rf_params)

model_rf.fit(X_train_rf, y_train_rf)
pred_rf = model_rf.predict(X_test_rf)


In [None]:
# Random Forest evaluation metrics: MAE, RMSE and R2 of the Random Forest
# predictions against the test targets.
mse_rf = mean_squared_error(y_test_rf, pred_rf)
mae_rf = mean_absolute_error(y_test_rf, pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test_rf, pred_rf)

print("üìä Random Forest Metrics:")
for metric_name, metric_value in (("MAE", mae_rf), ("RMSE", rmse_rf), ("R2", r2_rf)):
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
# Compare models: one row per model, one column per metric
comparison_rows = [
    {"Model": "LSTM", "MAE": mae_lstm, "RMSE": rmse_lstm, "R2": r2_lstm},
    {"Model": "Random Forest", "MAE": mae_rf, "RMSE": rmse_rf, "R2": r2_rf},
]
results = pd.DataFrame(comparison_rows, columns=["Model", "MAE", "RMSE", "R2"])
results


In [None]:
# Visualize predictions: one figure per model, actual vs. predicted on the test rows
forecast_panels = [
    ("LSTM Forecast", y_actual_lstm, pred_lstm, "LSTM Predicted"),
    ("Random Forest Forecast", y_test_rf.values, pred_rf, "Random Forest Predicted"),
]

for panel_title, actual_values, predicted_values, predicted_label in forecast_panels:
    plt.figure(figsize=(14,5))
    plt.plot(actual_values, label="Actual")
    plt.plot(predicted_values, label=predicted_label)
    plt.title(panel_title)
    plt.legend()
    plt.show()


In [None]:
# Error analysis (histograms of actual minus predicted, one panel per model)
lstm_errors = y_actual_lstm.flatten() - pred_lstm.flatten()
rf_errors = y_test_rf.values - pred_rf

fig, (ax_lstm, ax_rf) = plt.subplots(1, 2, figsize=(14,4))
ax_lstm.hist(lstm_errors, bins=50, color='red', alpha=0.6)
ax_lstm.set_title('LSTM Errors')
ax_rf.hist(rf_errors, bins=50, color='green', alpha=0.6)
ax_rf.set_title('Random Forest Errors')
plt.show()


In [None]:
# Future prediction for next N days
N_DAYS = 30  # change this to forecast more/less days

# LSTM iterative forecasting using last WINDOW days.
# Each predicted (scaled) value is appended to the input window and the oldest
# value is dropped, so later steps are conditioned on earlier predictions.
rolling_window = scaled_data[-WINDOW:, 0].copy()
lstm_forecast_scaled = []
for _ in range(N_DAYS):
    # verbose=0: otherwise every one of the N_DAYS calls prints its own progress bar
    pred_scaled = model_lstm.predict(rolling_window.reshape(1, WINDOW, 1), verbose=0)[0, 0]
    lstm_forecast_scaled.append(pred_scaled)
    # slide the window forward by one step
    rolling_window = np.append(rolling_window[1:], pred_scaled)

# Map the scaled predictions back to price units
lstm_forecast = scaler_lstm.inverse_transform(np.array(lstm_forecast_scaled).reshape(-1,1)).flatten()

# Random Forest future forecast (recursive: each prediction becomes history for
# the next step).
#
# The feature row for each future day is rebuilt with create_features, so the
# forecast uses exactly the feature definitions the model was fitted on instead
# of a hand-maintained copy of the lag/rolling logic. This also avoids growing
# a DataFrame row by row (DataFrame.append no longer exists in pandas >= 2.0).
feature_columns = df_rf.columns[1:]  # same columns, same order as the training inputs
n_lags = sum(str(col).startswith('lag_') for col in feature_columns)
# Rows of history needed so the newest row has every feature defined:
# the longest lag or the 30-row rolling window, whichever is larger, plus one.
history_rows = max(n_lags, 30) + 1

close_history = df_rf.iloc[:, 0].tolist()
if len(close_history) < history_rows:
    raise ValueError(
        f"Need at least {history_rows} rows of history to forecast, got {len(close_history)}"
    )

future_rf_preds = []
for _ in range(N_DAYS):
    recent_closes = close_history[-history_rows:]
    # The next day's Close is unknown, so its row carries a placeholder equal to
    # the latest known (or predicted) close. Lag features never read the current
    # row; any feature that does read it sees this carry-forward estimate.
    next_day_frame = pd.DataFrame({'Close': recent_closes + [recent_closes[-1]]})
    next_features = create_features(next_day_frame, window=n_lags).iloc[[-1]][feature_columns]
    # Passing a DataFrame keeps the feature names the model was fitted with
    pred = float(model_rf.predict(next_features)[0])
    future_rf_preds.append(pred)
    close_history.append(pred)

# Business-day dates for the forecast horizon, starting the day after the last
# date in df, then both forecasts collected in one frame indexed by date
last_date = df.index[-1]
first_forecast_day = last_date + pd.Timedelta(days=1)
forecast_dates = pd.bdate_range(start=first_forecast_day, periods=N_DAYS)

forecast_columns = {
    'date': forecast_dates,
    'lstm_forecast': lstm_forecast,
    'rf_forecast': future_rf_preds,
}
future_df = pd.DataFrame(forecast_columns).set_index('date')
future_df.head()


In [None]:
# Plot future forecasts along with last 200 days of actuals
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df['Close'].iloc[-200:], label='Actual (last 200 days)')
for forecast_column, forecast_label in (
    ('lstm_forecast', 'LSTM Forecast (next days)'),
    ('rf_forecast', 'RF Forecast (next days)'),
):
    ax.plot(future_df[forecast_column], label=forecast_label, linestyle='--')
ax.set_title(f'Future Forecasts for {STOCK_TICKER}')
ax.legend()
plt.show()


In [None]:
# Save trained models to files (in Colab these will be in /content)
import pickle

LSTM_MODEL_FILE = 'lstm_model.h5'
RF_MODEL_FILE = 'rf_model.pkl'

# Keras model is written via its own save(); the Random Forest is pickled
model_lstm.save(LSTM_MODEL_FILE)
with open(RF_MODEL_FILE, 'wb') as rf_file:
    pickle.dump(model_rf, rf_file)

print(f'Saved {LSTM_MODEL_FILE} and {RF_MODEL_FILE}')


# 🏁 Project Complete

You have:
- Trained an LSTM model and a Random Forest model.
- Compared their performance.
- Generated a 30-day future forecast from both models (iterative LSTM and iterative RF approximation).
- Saved the trained models.

**To export this notebook to PDF**: use `File → Print` and choose "Save as PDF" (best for Colab).

**To run locally**: download the `.ipynb` and run in Jupyter or Colab.

