# **Environment Setup**

In [55]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from sklearn.preprocessing import MinMaxScaler
import optuna
from optuna.samplers import TPESampler


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



# **Step 1: Data Integration & Preprocessing**

In [57]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load all datasets
nifty = pd.read_csv("Dataset/combined_nifty50_data.csv", parse_dates=['Date'], index_col='Date')
gdp_interest = pd.read_excel("Dataset/GDP and Interest rate data.xlsx", sheet_name="Data", parse_dates=['Year'], index_col='Year')
climate = pd.read_excel('Dataset/Book1.xlsx', sheet_name='Sheet1', parse_dates=['DATE']).rename(columns={'DATE':'Date'}).set_index('Date')

# Resample GDP data to daily frequency (forward fill)
gdp_daily = gdp_interest.resample('D').ffill()

# Merge all datasets
merged_df = nifty.join([gdp_daily, climate], how='left').ffill()

# Feature Engineering
merged_df['Heatwave'] = np.where(merged_df['tempmax'] > 40, 1, 0)  # Extreme weather flag
merged_df['GDP_Lag7'] = merged_df['GDP growth (annual %)'].shift(7)  # 7-day lag

# Normalization
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_df[['Close', 'GDP growth (annual %)', 'temp', 'humidity']])
merged_df[['Close_Scaled', 'GDP_Scaled', 'Temp_Scaled', 'Humidity_Scaled']] = scaled_data

In [58]:
merged_df.head()

Unnamed: 0,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),Country Name,Country Code,Lending interest rate (%),GDP per capita growth (annual %),...,feelslikemin_humidity,feelslike_humidity,temp_range,heat_index,Heatwave,GDP_Lag7,Close_Scaled,GDP_Scaled,Temp_Scaled,Humidity_Scaled
2017-01-02,8210.1,8212.0,8133.8,8179.5,122016111.0,5255.49,India,IND,9.508333,5.561185,...,,,,,0,,0.030595,0.812882,,
2017-01-03,8196.05,8219.1,8148.6,8192.25,131186021.0,6053.67,India,IND,9.508333,5.561185,...,,,,,0,,0.031281,0.812882,,
2017-01-04,8202.65,8218.5,8180.9,8190.5,136476345.0,6543.13,India,IND,9.508333,5.561185,...,,,,,0,,0.031187,0.812882,,
2017-01-05,8226.65,8282.65,8223.7,8273.8,163957452.0,8180.68,India,IND,9.508333,5.561185,...,,,,,0,,0.035664,0.812882,,
2017-01-06,8281.85,8306.85,8233.25,8243.8,143689850.0,7298.74,India,IND,9.508333,5.561185,...,,,,,0,,0.034051,0.812882,,


# **Step 2: Handle Missing Data & Align Features**

In [46]:
import pandas as pd
import numpy as np

# Forward-fill economic data (same country values)
merged_df[['Country Name', 'Country Code']] = merged_df[['Country Name', 'Country Code']].ffill()

# Interpolate numerical features
merged_df['GDP_Lag7'] = merged_df['GDP growth (annual %)'].shift(7).interpolate(method='linear')
merged_df['temp'] = merged_df['temp'].interpolate(method='time')  # Time-aware interpolation

# Drop remaining NaNs
merged_df = merged_df.dropna(subset=['Close', 'GDP growth (annual %)', 'temp'])

In [47]:
merged_df.head()

Unnamed: 0,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),Country Name,Country Code,Lending interest rate (%),GDP per capita growth (annual %),...,feelslikemin_humidity,feelslike_humidity,temp_range,heat_index,Heatwave,GDP_Lag7,Close_Scaled,GDP_Scaled,Temp_Scaled,Humidity_Scaled
2017-01-13,8457.65,8461.05,8373.15,8400.35,190949616.0,9156.16,India,IND,9.508333,5.561185,...,378.04,828.78,13.8,302.32711,0,6.795383,0.042465,0.812882,0.151515,0.675214
2017-01-16,8390.95,8426.7,8374.4,8412.8,127938836.0,6043.67,India,IND,9.508333,5.561185,...,579.04,1022.72,8.8,286.145724,0,6.795383,0.043134,0.812882,0.221212,0.705739
2017-01-17,8415.05,8440.9,8378.3,8398.0,125781216.0,6389.21,India,IND,9.508333,5.561185,...,184.38,798.98,10.0,323.096619,0,6.795383,0.042339,0.812882,0.106061,0.859585
2017-01-18,8403.85,8460.3,8397.4,8417.0,168867039.0,7411.23,India,IND,9.508333,5.561185,...,406.5,918.69,14.0,309.645193,0,6.795383,0.04336,0.812882,0.148485,0.78022
2017-01-19,8418.4,8445.15,8404.05,8435.1,170956149.0,7324.14,India,IND,9.508333,5.561185,...,417.04,970.42,13.9,299.721759,0,6.795383,0.044333,0.812882,0.181818,0.766789


In [48]:
from sklearn.preprocessing import MinMaxScaler

# Select features for scaling
features_to_scale = ['Close', 'GDP growth (annual %)', 'temp', 'humidity']
scaler = MinMaxScaler()

# Fit scaler only on training data
train_size = int(len(merged_df) * 0.8)
scaler.fit(merged_df[features_to_scale].iloc[:train_size])

# Transform all data
merged_df[['Close_Scaled', 'GDP_Scaled', 'Temp_Scaled', 'Humidity_Scaled']] = \
    scaler.transform(merged_df[features_to_scale])

In [49]:
# Create meaningful lag features
merged_df['GDP_Impact_Lag'] = merged_df['GDP growth (annual %)'].shift(30)  # 1-month lag

# Volatility indicator
merged_df['Price_Volatility'] = merged_df['High'] - merged_df['Low']

# Economic-health composite score
merged_df['Eco_Score'] = (
    0.6 * merged_df['GDP growth (annual %)'] +
    0.4 * (1 / merged_df['Lending interest rate (%)'])
)

In [50]:
def create_sequences(data, target_col, n_steps):
    X, y = [], []
    for i in range(len(data)-n_steps):
        X.append(data[i:i+n_steps])
        y.append(data.iloc[i+n_steps][target_col])
    return np.array(X), np.array(y)

# Use these features
features = ['Close_Scaled', 'GDP_Scaled', 'Temp_Scaled', 'Humidity_Scaled', 'Heatwave']
n_steps = 30  # 1-month lookback

X, y = create_sequences(merged_df[features], 'Close_Scaled', n_steps)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, 
         input_shape=(n_steps, len(features))),
    Dropout(0.3),
    LSTM(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# Train-test split
split = int(0.8 * len(X))
history = model.fit(
    X[:split], y[:split],
    epochs=100,
    batch_size=64,
    validation_data=(X[split:], y[split:]),
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/100



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - loss: 0.1523 - mae: 0.3043 - val_loss: 0.0564 - val_mae: 0.1943
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0134 - mae: 0.0959 - val_loss: 0.1243 - val_mae: 0.2533
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0037 - mae: 0.0462 - val_loss: 0.1025 - val_mae: 0.2460
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0023 - mae: 0.0364 - val_loss: 0.0240 - val_mae: 0.1074
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 0.0019 - mae: 0.0313 - val_loss: 0.0085 - val_mae: 0.0692
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 0.0019 - mae: 0.0320 - val_loss: 0.0154 - val_mae: 0.0874
Epoch 7/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 0.0016

In [52]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions
train_pred = model.predict(X[:split])
test_pred = model.predict(X[split:])

# Inverse scaling
def inverse_scale(col_scaled, scaler, feature_list):
    dummy = pd.DataFrame(np.zeros((len(col_scaled), len(feature_list))),
                        columns=feature_list)
    dummy['Close'] = col_scaled  # Only replace target column
    return scaler.inverse_transform(dummy)[:, 0]

# Convert predictions
y_train_true = inverse_scale(y[:split], scaler, features_to_scale)
y_train_pred = inverse_scale(train_pred.squeeze(), scaler, features_to_scale)
y_test_true = inverse_scale(y[split:], scaler, features_to_scale) 
y_test_pred = inverse_scale(test_pred.squeeze(), scaler, features_to_scale)

# Calculate metrics
metrics = {
    'Train RMSE': np.sqrt(mean_squared_error(y_train_true, y_train_pred)),
    'Test RMSE': np.sqrt(mean_squared_error(y_test_true, y_test_pred)),
    'Train MAE': mean_absolute_error(y_train_true, y_train_pred),
    'Test MAE': mean_absolute_error(y_test_true, y_test_pred),
    'Train R²': r2_score(y_train_true, y_train_pred),
    'Test R²': r2_score(y_test_true, y_test_pred)
}

print(pd.DataFrame([metrics]))

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
   Train RMSE   Test RMSE   Train MAE    Test MAE  Train R²   Test R²
0  265.904563  504.030124  203.333911  409.888049  0.992807  0.943594


In [53]:
import plotly.graph_objects as go

# Create interactive plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Close'],
                    name='Actual Prices', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=merged_df.index[n_steps:][split:],
                    y=y_test_pred, name='Predicted Prices',
                    line=dict(color='red', dash='dot')))

# Add economic events
fig.add_vrect(x0="2020-02-19", x1="2020-06-01", 
             annotation_text="COVID Crash", fillcolor="red", opacity=0.1)
fig.add_vrect(x0="2021-07-01", x1="2021-10-30",
             annotation_text="GDP Recovery", fillcolor="green", opacity=0.1)

fig.update_layout(
    title='Nifty50 Actual vs Predicted Prices with Economic Events',
    xaxis_title='Date',
    yaxis_title='Price',
    hovermode="x unified"
)
fig.show()