In [1]:
from update_data_util import fetch_and_update_stock_data, update_predictions_file

# Refresh the raw price file for the index, then bring the predictions file
# in line with it. Both helpers report what they did on stdout.
ticker_symbol = '^NSEI'
raw_data_csv = "NIFTY_50_data.csv"
predictions_csv = "NIFTY_50_predictions.csv"

fetch_and_update_stock_data(ticker_symbol)
print("-" * 30)  # visual separator between the two helpers' log output
update_predictions_file(raw_data_csv, predictions_csv)

📈 Updating existing data for NIFTY_50...
Last record in file is from: 2025-08-13 13:15:00+05:30
✅ Data updated from 2025-08-13 14:15:00+05:30 to 2025-08-13 15:15:00+05:30
📊 Added 2 new records
------------------------------
Appended 2 new entries to NIFTY_50_predictions.csv


In [2]:
import pandas as pd
from plot_data_util import interactive_data_plot

# Path is kept in a module-level name because later cells reuse it.
predictions_file_path = "NIFTY_50_predictions.csv"

# Load the predictions file with the 'Datetime' column parsed and used as the index.
data = pd.read_csv(
    predictions_file_path,
    index_col='Datetime',
    parse_dates=['Datetime'],
)

interactive_data_plot(data=data, ticker='NIFTY 50')


In [3]:
# Same plot as the previous cell, but with n=50.
# NOTE(review): n presumably limits the view to the most recent 50 rows — confirm in plot_data_util.
interactive_data_plot(
    data=data,
    ticker='NIFTY 50',
    n=50,
)

In [4]:
from find_missing_predictions_util import find_missing_predictions

# find_missing_predictions returns a (count, rows) pair: the number of rows
# whose 'PredictedPrice' is missing and a DataFrame holding those rows.
# Unpack it so each name describes what it holds, instead of printing the
# raw tuple repr.
num_missing, missing_rows = find_missing_predictions('NIFTY_50_predictions.csv')

# Last expression of the cell: rendered with the notebook's rich DataFrame display.
missing_rows

Found 2 rows with missing 'PredictedPrice' values.
(2,                       ActualPrice  PredictedPrice
Datetime                                         
2025-08-13 14:15:00  24614.750000             NaN
2025-08-13 15:15:00  24630.400391             NaN)


In [5]:
from create_lags_util import create_all_advanced_lags
import pandas as pd

predictions_file_path = "NIFTY_50_predictions.csv"


def _drop_index_timezone(frame):
    """Make ``frame``'s DatetimeIndex timezone-naive in place and return the frame.

    A tz-aware index is replaced by ``index.tz_localize(None)``; an index that
    is already naive is left untouched.
    """
    if frame.index.tz is not None:
        frame.index = frame.index.tz_localize(None)
    return frame


# Load the predictions file indexed by its parsed 'Datetime' column.
data = _drop_index_timezone(
    pd.read_csv(predictions_file_path, index_col='Datetime', parse_dates=['Datetime'])
)

# Only the actual price series is passed to the lag builder.
data_df = data[['ActualPrice']].copy()

# Build the lag features, then normalise the index again in case the helper
# returned a tz-aware one.
full_lagged = _drop_index_timezone(create_all_advanced_lags(data_df))

full_lagged.to_csv("NIFTY_50_lagged.csv")
full_lagged.head()

Unnamed: 0_level_0,ActualPrice,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,week_lag_1,...,week_lag_6,week_lag_7,month_lag_1,month_lag_2,month_lag_3,month_lag_4,month_lag_5,month_lag_6,month_lag_7,year_lag
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-08-08 10:15:00,19560.849609,19590.5,19597.949219,19598.099609,19585.699219,19584.550781,19535.25,19538.199219,19565.349609,19560.849609,...,19570.800781,19630.949219,19560.849609,19575.5,19579.550781,19576.599609,19566.099609,19570.800781,19630.949219,24186.349609
2023-08-08 11:15:00,19575.5,19560.849609,19590.5,19597.949219,19598.099609,19585.699219,19584.550781,19535.25,19538.199219,19560.849609,...,19570.800781,19630.949219,19560.849609,19575.5,19579.550781,19576.599609,19566.099609,19570.800781,19630.949219,24306.599609
2023-08-08 12:15:00,19579.550781,19575.5,19560.849609,19590.5,19597.949219,19598.099609,19585.699219,19584.550781,19535.25,19560.849609,...,19570.800781,19630.949219,19560.849609,19575.5,19579.550781,19576.599609,19566.099609,19570.800781,19630.949219,24160.349609
2023-08-08 13:15:00,19576.599609,19579.550781,19575.5,19560.849609,19590.5,19597.949219,19598.099609,19585.699219,19584.550781,19560.849609,...,19570.800781,19630.949219,19560.849609,19575.5,19579.550781,19576.599609,19566.099609,19570.800781,19630.949219,24154.599609
2023-08-08 14:15:00,19566.099609,19576.599609,19579.550781,19575.5,19560.849609,19590.5,19597.949219,19598.099609,19585.699219,19560.849609,...,19570.800781,19630.949219,19560.849609,19575.5,19579.550781,19576.599609,19566.099609,19570.800781,19630.949219,24115.199219


In [6]:

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
# Imported here as well so this cell does not silently depend on an earlier
# cell having brought the name into scope.
from find_missing_predictions_util import find_missing_predictions

# Single source of truth for the model configuration used by every fit below.
XGB_PARAMS = dict(
    n_estimators=650,
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.25,
    max_depth=6,
)

# Number of predictions between intermediate saves of the predictions file.
SAVE_EVERY = 20

# Offset added to positions in `full_lagged` when they are shown in progress
# messages.
# NOTE(review): presumably the number of leading rows of the predictions file
# that are absent from the lagged frame — confirm, and derive it from the data
# if that is the case.
INDEX_DISPLAY_OFFSET = 10


def _fit_and_predict(lagged, n):
    """Fit a fresh model on rows ``[:n]`` of ``lagged`` and predict row ``n``.

    Parameters
    ----------
    lagged : pandas.DataFrame
        Frame with an 'ActualPrice' target column; every other column is
        used as a feature.
    n : int
        Positional index of the row to predict. Only rows before it are used
        for training.

    Returns
    -------
    tuple
        ``(model, prediction)`` — the fitted XGBRegressor and its scalar
        prediction for row ``n``.
    """
    train_lagged = lagged.iloc[:n]
    X_train = train_lagged.drop(columns=['ActualPrice'])
    y_train = train_lagged['ActualPrice']

    fitted = XGBRegressor(**XGB_PARAMS)
    fitted.fit(X_train, y_train)

    # A single row comes back as a Series; turn it into a one-row frame.
    X_test_point = lagged.iloc[n].drop('ActualPrice').to_frame().T
    return fitted, fitted.predict(X_test_point)[0]


# Check for missing predictions to determine the starting point
num_missing, missing_rows_df = find_missing_predictions(predictions_file_path)

last_index = len(full_lagged) - 1  # position of the last row in the lagged frame
last_pred = None                   # filled by the loop when it reaches the last row

if num_missing == 0:
    print("All predictions are already up-to-date. No new predictions needed. ✅")
else:
    print(f"Found {num_missing} rows with missing predictions. Starting prediction process... 🤖")

    # Start at the first row with a missing prediction, but never before the
    # first row of the lagged data.
    first_missing_datetime = missing_rows_df.index[0]
    first_lagged_datetime = full_lagged.index[0]
    start_datetime = max(first_missing_datetime, first_lagged_datetime)

    # The timestamp must be present in the lagged index; max() alone does not
    # guarantee that, so fail with an explicit message if it is absent.
    try:
        start_n = full_lagged.index.get_loc(start_datetime)
    except KeyError as exc:
        raise KeyError(
            f"{start_datetime} is not present in the lagged data index; "
            "cannot determine where to start predicting."
        ) from exc

    print(f"Starting prediction from index: {start_n + INDEX_DISPLAY_OFFSET} for datetime: {start_datetime}")

    # Walk forward: for each row from start_n on, train on everything before
    # it and predict it.
    for i, n in enumerate(range(start_n, len(full_lagged)), 1):
        model, y_pred = _fit_and_predict(full_lagged, n)

        # Store the prediction only if the timestamp exists in the predictions frame.
        pred_index = full_lagged.index[n]
        if pred_index in data.index:
            data.at[pred_index, 'PredictedPrice'] = y_pred

        # The final iteration trains on full_lagged.iloc[:-1] and predicts the
        # last row, which is exactly the "last point" forecast reported below.
        if n == last_index:
            last_pred = y_pred

        # Periodic save so a long run can be interrupted without losing work.
        if i % SAVE_EVERY == 0:
            batch_start_n = n - (SAVE_EVERY - 1)
            start_batch_datetime = full_lagged.index[batch_start_n]
            end_batch_datetime = full_lagged.index[n]

            data.to_csv(predictions_file_path, index=True)
            print(f"Saved predictions. {i} new predictions have been made so far, from {start_batch_datetime} to {end_batch_datetime}-(from {batch_start_n + INDEX_DISPLAY_OFFSET} to {n + INDEX_DISPLAY_OFFSET}).")

    # Final save
    data.to_csv(predictions_file_path, index=True)
    print(f"✅ All {num_missing} missing predictions have been generated and saved.")

# ---- Predict the last point (without saving) ----
# Only fit here when the loop above did not already produce this forecast
# (i.e. nothing was missing); refitting would rebuild an identical model.
if last_pred is None:
    model, last_pred = _fit_and_predict(full_lagged, last_index)

print(f"🔮 Prediction for the last point ({full_lagged.index[-1]}): {last_pred}")

Found 2 rows with missing 'PredictedPrice' values.
Found 2 rows with missing predictions. Starting prediction process... 🤖
Starting prediction from index: 3486 for datetime: 2025-08-13 14:15:00
✅ All 2 missing predictions have been generated and saved.
🔮 Prediction for the last point (2025-08-13 15:15:00): 24626.078125


In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    mean_squared_error
)

# Step 1: Read the predictions CSV, indexed by its parsed 'Datetime' column
data = pd.read_csv("NIFTY_50_predictions.csv", parse_dates=['Datetime']).set_index("Datetime")

# Step 2: Keep only rows with a usable prediction.
# -inf marks rows to exclude; NaN and +inf are dropped as well because the
# sklearn metrics below raise on any non-finite input.
eval_df = data[np.isfinite(data['PredictedPrice'])]

# Step 3: Extract actual and predicted values
y_true = eval_df['ActualPrice'].values
y_pred = eval_df['PredictedPrice'].values

# Step 4: Calculate evaluation metrics
rmse = root_mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

# Step 5: Mean Error (ME) calculation
# The Mean Error is the average signed residual (prediction minus actual),
# so a negative value means the predictions are low on average.
mean_error = np.mean(y_pred - y_true)

# Step 6: Adjusted R² calculation
# p must be the number of predictors the model was trained on, i.e. every
# column of the lagged feature frame except the 'ActualPrice' target. Counting
# the columns of eval_df would always give 1 regardless of the feature set.
n = len(y_true)
p = full_lagged.shape[1] - 1
# The correction is undefined when there are not more samples than predictors + 1.
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1) if n - p - 1 > 0 else float('nan')

# Step 7: Print all metrics
print(f"➡️ MSE: {mse:.4f}")
print(f"✅ RMSE: {rmse:.4f}")
print(f"➡️ MAE: {mae:.4f}")
print(f"➡️ MAPE: {mape:.4%}")
print(f"✅ R² Score: {r2:.4f}")
print(f"➡️ Adjusted R²: {adjusted_r2:.4f}")
print(f"➡️ Mean Error: {mean_error:.4f}")

➡️ MSE: 4256.2077
✅ RMSE: 65.2396
➡️ MAE: 44.4067
➡️ MAPE: 0.1914%
✅ R² Score: 0.9984
➡️ Adjusted R²: 0.9984
➡️ Mean Error: -6.3948


In [8]:
from plot_data_util import plot_train_and_prediction_and_last_pred_point

# Plot the predictions frame together with the last-point forecast
# (`last_pred`) computed by the prediction cell.
# NOTE(review): n=10 presumably restricts the plot to the most recent rows — confirm in plot_data_util.
plot_train_and_prediction_and_last_pred_point(
    data,
    "NIFTY 50",
    n=10,
    last_pred=last_pred,
)


In [9]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) so the plot
# reflects what is currently on disk, then plot with n=35.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=35, ticker="NIFTY 50")

In [10]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) so the plot
# reflects what is currently on disk, then plot with n=70.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=70, ticker="NIFTY 50")

In [11]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) so the plot
# reflects what is currently on disk, then plot with n=100.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=100, ticker="NIFTY 50")

In [12]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) so the plot
# reflects what is currently on disk, then plot without passing n so the
# helper's own default applies.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, ticker="NIFTY 50")