# **1. Import Libraries**

In [63]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
from optuna.samplers import TPESampler
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

# **2. Data Preparation**

In [64]:
def load_data():
    # Load and merge datasets
    nifty = pd.read_csv("Dataset/combined_nifty50_data.csv", 
                       parse_dates=['Date'], index_col='Date').sort_index()
    
    # Handle missing values
    nifty['Shares Traded'].ffill(inplace=True)
    nifty['Turnover (₹ Cr)'].ffill(inplace=True)
    
    # Technical indicators
    nifty['Daily_Return'] = nifty['Close'].pct_change() * 100
    nifty['Volatility'] = nifty['High'] - nifty['Low']
    nifty['MA_50'] = nifty['Close'].rolling(50).mean()
    nifty['MA_200'] = nifty['Close'].rolling(200).mean()

    # Economic data
    gdp = pd.read_excel("Dataset/GDP and Interest rate data.xlsx",
                       sheet_name="Data", parse_dates=['Year'], index_col='Year')
    gdp_daily = gdp.resample('D').ffill().add_prefix('Eco_')

    # Climate data
    climate = pd.read_excel("Dataset/Book1.xlsx", sheet_name='Sheet1',
                           parse_dates=['DATE']).rename(columns={'DATE':'Date'}).set_index('Date')
    climate['Heatwave'] = np.where(climate['tempmax'] > 40, 1, 0)
    climate['Temp_Range'] = climate['tempmax'] - climate['tempmin']

    # Merge datasets
    merged_df = nifty.merge(gdp_daily, left_index=True, right_index=True, how='left')
    merged_df = merged_df.merge(climate[['temp', 'humidity', 'Heatwave']], 
                               left_index=True, right_index=True, how='left')
    
    # Feature engineering
    merged_df['GDP_Lag7'] = merged_df['Eco_GDP growth (annual %)'].shift(7)
    merged_df['GDP_Lag30'] = merged_df['Eco_GDP growth (annual %)'].shift(30)
    
    final_features = [
        'Close',  # Target variable
        'Eco_GDP growth (annual %)', 'Eco_Lending interest rate (%)',  # Economic
        'temp', 'humidity', 'Heatwave',  # Climate
        'GDP_Lag7', 'GDP_Lag30'  # Engineered
    ]
    
    return merged_df[final_features].dropna()

In [65]:
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data)-n_steps):
        X.append(data[i:i+n_steps, 1:])  # All features except Close
        y.append(data[i+n_steps, 0])     # Close price
    return np.array(X), np.array(y)

# Load and prepare data
merged_df = load_data()

# Separate scalers
target_scaler = MinMaxScaler()  # For Close prices
feature_scaler = MinMaxScaler() # For other features

# Fit on training data only
train_size = int(0.8 * len(merged_df))
train_data = merged_df.iloc[:train_size]

# Scale target (Close prices)
target_scaler.fit(train_data[['Close']])

# Scale features
feature_cols = merged_df.columns.drop('Close')
feature_scaler.fit(train_data[feature_cols])

# Transform entire dataset
scaled_target = target_scaler.transform(merged_df[['Close']])
scaled_features = feature_scaler.transform(merged_df[feature_cols])

# Combine for sequence creation
full_scaled = np.hstack([scaled_target, scaled_features])

# Create sequences
n_steps = 30
X, y = create_sequences(full_scaled, n_steps)

In [66]:
print(f"\nShape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


Shape of X: (1591, 30, 7)
Shape of y: (1591,)


# **3. Model Training**

In [68]:
def build_model(params):
    model = Sequential([
        LSTM(params['lstm_units1'], activation='tanh',
             kernel_regularizer=l1_l2(0, params['l2_reg']),
             input_shape=(n_steps, X.shape[2]),
             return_sequences=True),
        Dropout(params['dropout_rate']),
        BatchNormalization(),
        LSTM(params['lstm_units2'], activation='tanh',
             kernel_regularizer=l1_l2(0, params['l2_reg'])),
        Dropout(params['dropout_rate']/2),
        Dense(1)
    ])
    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss='mse',
        metrics=['mae']
    )
    return model

# Hyperparameter optimization with Optuna
def objective(trial):
    params = {
        'lstm_units1': trial.suggest_int('lstm_units1', 64, 128),
        'lstm_units2': trial.suggest_int('lstm_units2', 32, 64),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.3, 0.6),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-3),
        'l2_reg': trial.suggest_float('l2_reg', 1e-6, 1e-3)
    }
    
    model = build_model(params)
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=64,
        verbose=0,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)]
    )
    return history.history['val_loss'][-1]

# Train/val split
split = int(0.8 * len(X))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

# Run optimization
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

# Build final model
best_params = study.best_params
final_model = build_model(best_params)

# Train with early stopping
history = final_model.fit(
    X_train, y_train,
    epochs=200,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)]
)

[I 2025-04-29 16:53:21,819] A new study created in memory with name: no-name-d12caf57-839e-47cd-9b6f-f02f6d4cac19
[I 2025-04-29 16:53:50,569] Trial 0 finished with value: 0.03168093040585518 and parameters: {'lstm_units1': 88, 'lstm_units2': 63, 'dropout_rate': 0.5195981825434215, 'learning_rate': 0.0006387926357773329, 'l2_reg': 0.0001568626218019941}. Best is trial 0 with value: 0.03168093040585518.
[I 2025-04-29 16:54:30,279] Trial 1 finished with value: 0.025857338681817055 and parameters: {'lstm_units1': 74, 'lstm_units2': 33, 'dropout_rate': 0.5598528437324806, 'learning_rate': 0.000641003510568888, 'l2_reg': 0.0007083645052182496}. Best is trial 1 with value: 0.025857338681817055.
[I 2025-04-29 16:55:07,481] Trial 2 finished with value: 0.025911888107657433 and parameters: {'lstm_units1': 65, 'lstm_units2': 64, 'dropout_rate': 0.5497327922401265, 'learning_rate': 0.00029110519961044856, 'l2_reg': 0.00018264314223989355}. Best is trial 1 with value: 0.025857338681817055.
[I 2025-

Epoch 1/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 0.2448 - mae: 0.3039 - val_loss: 0.6770 - val_mae: 0.7584
Epoch 2/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.1456 - mae: 0.1842 - val_loss: 0.6364 - val_mae: 0.7344
Epoch 3/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.1313 - mae: 0.1683 - val_loss: 0.7064 - val_mae: 0.7834
Epoch 4/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.1091 - mae: 0.1325 - val_loss: 0.5687 - val_mae: 0.6936
Epoch 5/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.1027 - mae: 0.1297 - val_loss: 0.5963 - val_mae: 0.7165
Epoch 6/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 0.0932 - mae: 0.1149 - val_loss: 0.4915 - val_mae: 0.6428
Epoch 7/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - 

# **4. Evaluation and Visualization**

In [69]:
# Monte Carlo Dropout predictions
def mc_dropout_predict(model, X, n_samples=50):
    return np.array([model.predict(X, verbose=0) for _ in range(n_samples)])

mc_preds = mc_dropout_predict(final_model, X_val)
mean_preds = np.mean(mc_preds, axis=0).squeeze()
std_preds = np.std(mc_preds, axis=0).squeeze()

# Inverse scaling
def inverse_transform(scaled_values, scaler):
    return scaler.inverse_transform(scaled_values.reshape(-1, 1)).flatten()

# Convert predictions
y_true = inverse_transform(y_val, target_scaler)
y_pred = inverse_transform(mean_preds, target_scaler)
pred_std = inverse_transform(std_preds, target_scaler)  # For uncertainty bands

# Calculate metrics
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"Real-World RMSE: {rmse:.2f}")
print(f"Real-World MAE: {mae:.2f}")

Real-World RMSE: 1264.87
Real-World MAE: 1018.85


In [81]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Create a sample DataFrame (replace with your actual data)
dates = pd.to_datetime(pd.date_range(start="2017-01-01", end="2024-12-31", freq='D'))
y_true = np.random.rand(len(dates)) * 100 + 100 + np.cumsum(np.random.randn(len(dates)) * 5) # Simulate prices
y_pred = y_true + np.random.randn(len(dates)) * 10 # Simulate predictions
pred_std = np.random.rand(len(dates)) * 5 # Simulate standard deviation of predictions
merged_df = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred, 'pred_std': pred_std}, index=dates)
merged_df.index.name = 'Date'


# Create interactive plot
fig = go.Figure()

# Actual values
fig.add_trace(go.Scatter(
    x=merged_df.index[-len(y_true):],
    y=y_true[-len(y_true):], # Use only the last part of y_true
    name='Actual Prices',
    line=dict(color='blue'))
)

# Predicted values
fig.add_trace(go.Scatter(
    x=merged_df.index[-len(y_pred):],
    y=y_pred[-len(y_pred):], # Use only the last part of y_pred
    name='Predicted Prices',
    line=dict(color='red', dash='dot'))
)

# Uncertainty band
fig.add_trace(go.Scatter(
    x=merged_df.index[-len(y_pred):],
    y=y_pred[-len(y_pred):] + 1.96 * pred_std[-len(y_pred):], # Use only the last part of pred_std
    line=dict(width=0),
    showlegend=False)
)

fig.add_trace(go.Scatter(
    x=merged_df.index[-len(y_pred):],
    y=y_pred[-len(y_pred):] - 1.96 * pred_std[-len(y_pred):], # Use only the last part of pred_std
    fill='tonexty',
    line=dict(width=0),
    name='95% Confidence Interval')
)

# Add event markers
events = {
    'COVID-19 Crash': ('2020-03-01', '2020-06-01'),
    'GDP Recovery': ('2021-07-01', '2021-09-01')
}

for name, (start, end) in events.items():
    fig.add_vrect(
        x0=start, x1=end,
        fillcolor="red" if "Crash" in name else "green",
        opacity=0.2,
        annotation_text=name,
        annotation_position="top left"
    )

fig.update_layout(
    title='Nifty50: Actual vs Predicted Prices with Uncertainty',
    xaxis_title='Date',
    yaxis_title='Price (₹)',
    hovermode="x unified",
    template="plotly_white"
)

fig.show()


In [79]:
import plotly.graph_objects as go

# Get test set indices
n_steps = 30  # Same as used in sequence creation
split = int(0.8 * len(X))  # Training split index
test_start_idx = split + n_steps  # First test set index in original dataframe

# Get test dates from original dataframe
test_dates = merged_df.index[test_start_idx:test_start_idx + len(y_val)]

# Get test predictions and uncertainty (ensure these are for TEST SET ONLY)
mc_preds = mc_dropout_predict(final_model, X_val)  # X_val should be test set
mean_preds = np.mean(mc_preds, axis=0).squeeze()
std_preds = np.std(mc_preds, axis=0).squeeze()

# Inverse transform using target scaler
y_true_raw = target_scaler.inverse_transform(y_val.reshape(-1, 1)).flatten()
y_pred_raw = target_scaler.inverse_transform(mean_preds.reshape(-1, 1)).flatten()
pred_std = target_scaler.inverse_transform(std_preds.reshape(-1, 1)).flatten()

# Create interactive plot
fig = go.Figure()

# Actual values (test set only)
fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_true_raw,
    name='Actual Prices',
    line=dict(color='blue'),
    hovertemplate="Date: %{x|%b %d, %Y}<br>Actual: ₹%{y:.2f}<extra></extra>"
))

# Predicted values (test set only)
fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_pred_raw,
    name='Predicted Prices',
    line=dict(color='#FF4B4B', dash='dot'),
    hovertemplate="Date: %{x|%b %d, %Y}<br>Predicted: ₹%{y:.2f}<extra></extra>"
))

# Uncertainty band (95% CI)
fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_pred_raw + 1.96*pred_std,
    line=dict(width=0),
    showlegend=False,
    hoverinfo='skip'
))

fig.add_trace(go.Scatter(
    x=test_dates,
    y=y_pred_raw - 1.96*pred_std,
    fill='tonexty',
    line=dict(width=0),
    fillcolor='rgba(255, 75, 75, 0.2)',
    name='95% Confidence Interval',
    hoverinfo='skip'
))

# Update layout
fig.update_layout(
    title='Nifty50 Test Set Predictions: Actual vs Forecast',
    xaxis_title='Trading Date',
    yaxis_title='Closing Price (₹)',
    hovermode="x unified",
    template="plotly_white",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(t=100),
    height=600
)

# Add range selector
fig.update_xaxes(
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(step="all")
        ])
    )
)

fig.show()