In [1]:
from update_data_util import (
    fetch_and_update_stock_data,
    update_predictions_file,
)

# Refresh the stored data for the '^NSEI' ticker first, then let the second helper
# bring the predictions CSV in line with the refreshed data CSV.
fetch_and_update_stock_data('^NSEI')
print(30 * "-")  # visual separator between the two helpers' log output
update_predictions_file("NIFTY_50_data.csv", "NIFTY_50_predictions.csv")

📈 Updating existing data for NIFTY_50...
Last record in file is from: 2025-08-13 15:15:00+05:30
✅ Data updated from 2025-08-14 09:15:00+05:30 to 2025-08-14 15:15:00+05:30
📊 Added 7 new records
------------------------------
Appended 7 new entries to NIFTY_50_predictions.csv


In [2]:
import pandas as pd
from plot_data_util import interactive_data_plot

# Load the predictions CSV with its 'Datetime' column parsed and used as the index,
# then pass the frame to the interactive plotting helper.
predictions_file_path = "NIFTY_50_predictions.csv"
data = pd.read_csv(
    predictions_file_path,
    index_col='Datetime',
    parse_dates=['Datetime'],
)
interactive_data_plot(data=data, ticker='NIFTY 50')


In [3]:
# Same interactive plot as the previous cell, this time passing n=50.
# NOTE(review): presumably n limits the plot to the most recent points — confirm in plot_data_util.
interactive_data_plot(data=data, ticker='NIFTY 50', n=50)

In [4]:
from find_missing_predictions_util import find_missing_predictions

# find_missing_predictions returns a (count, rows) pair and prints its own summary line.
# `missing_rows` stays bound to the whole pair, as before; the pair is additionally
# unpacked so the rows can be shown as a table instead of as a tuple repr, which
# pushes the column header out of alignment.
missing_rows = find_missing_predictions('NIFTY_50_predictions.csv')
num_missing, missing_rows_df = missing_rows
missing_rows_df

Found 7 rows with missing 'PredictedPrice' values.
(7,                       ActualPrice  PredictedPrice
Datetime                                         
2025-08-14 09:15:00  24629.400391             NaN
2025-08-14 10:15:00  24646.500000             NaN
2025-08-14 11:15:00  24629.699219             NaN
2025-08-14 12:15:00  24652.300781             NaN
2025-08-14 13:15:00  24637.699219             NaN
2025-08-14 14:15:00  24630.300781             NaN
2025-08-14 15:15:00  24616.050781             NaN)


In [5]:
from lags_util import create_all_advanced_lags
import pandas as pd

predictions_file_path = "NIFTY_50_predictions.csv"

# Re-read the predictions file, indexed by its parsed 'Datetime' column.
data = pd.read_csv(
    predictions_file_path,
    index_col='Datetime',
    parse_dates=['Datetime'],
)

# Work with a timezone-naive index: drop the tz information if the parsed index has any.
if data.index.tz is not None:
    data.index = data.index.tz_localize(None)

# Independent single-column frame holding only the actual prices.
data_df = data.loc[:, ['ActualPrice']].copy()

# Full rebuild of the lagged CSV, kept for reference (disabled):
# full_lagged = create_all_advanced_lags(data_df)
# if full_lagged.index.tz is not None:
#     full_lagged.index = full_lagged.index.tz_localize(None)

# full_lagged.to_csv("NIFTY_50_lagged.csv")
# full_lagged.head()

In [6]:
from lags_util import update_all_advanced_lags

# Build `full_lagged` from the actual-price frame and the existing lagged CSV.
# NOTE(review): presumably the helper also rewrites that CSV — confirm in lags_util.
lagged_csv_path = "NIFTY_50_lagged.csv"
full_lagged = update_all_advanced_lags(data_df, lagged_csv_path)

In [7]:

import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor

# Walk-forward back-fill of missing predictions: for every lagged row from the first
# missing prediction onwards, fit a fresh RandomForest on all earlier lagged rows and
# predict that row. Uses names bound by earlier cells: `find_missing_predictions`,
# `predictions_file_path`, `data` (predictions frame) and `full_lagged` (lag features).

# Check for missing predictions to determine the starting point
num_missing, missing_rows_df = find_missing_predictions(predictions_file_path)

if num_missing == 0:
    print("All predictions are already up-to-date. No new predictions needed. ✅")
else:
    print(f"Found {num_missing} rows with missing predictions. Starting prediction process... 🤖")

    # Get the datetime of the first row with a missing prediction
    first_missing_datetime = missing_rows_df.index[0]

    # Position of the first lagged row at or after that datetime. With 'bfill' this is
    # the exact match when there is one, position 0 when the lagged data starts later,
    # the next available row when that exact timestamp is absent from the lagged data
    # (where get_loc would raise KeyError), and -1 when no such row exists.
    # Requires a unique, sorted index on full_lagged.
    start_n = int(full_lagged.index.get_indexer([first_missing_datetime], method='bfill')[0])
    if start_n == -1:
        start_n = len(full_lagged)  # nothing at or after the gap: the loop below is empty
    # The model needs at least one earlier row to train on.
    start_n = max(start_n, 1)

    if start_n < len(full_lagged):
        start_datetime = full_lagged.index[start_n]
        # NOTE(review): the +10 here and the -10/+9 offsets in the batch message below
        # presumably translate lagged-frame positions to positions in the predictions
        # file — confirm and consider naming the constant.
        print(f"Starting prediction from index: {start_n+10} for datetime: {start_datetime}")
    else:
        print("⚠️ No lagged rows at or after the first missing prediction; nothing to predict.")

    # The prediction loop now starts from the calculated index
    for i, n in enumerate(range(start_n, len(full_lagged)), 1):
        # Step 1: Prepare training data (every lagged row strictly before row n)
        train_lagged = full_lagged.iloc[:n]
        X_train = train_lagged.drop(columns=['ActualPrice'])
        y_train = train_lagged['ActualPrice']

        # Step 2: Train the model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Step 3: Predict the next point. iloc[[n]] keeps row n as a one-row frame,
        # so the feature columns keep their names and dtypes.
        X_test_point = full_lagged.iloc[[n]].drop(columns=['ActualPrice'])
        y_pred = model.predict(X_test_point)[0]

        # Step 4: Store prediction (only for timestamps present in the predictions frame)
        pred_index = full_lagged.index[n]
        if pred_index in data.index:
            data.at[pred_index, 'PredictedPrice'] = y_pred

        # Step 5: Save every 20 steps
        if i % 20 == 0:
            # Determine the datetime range of the saved batch
            # `n` is the current index, so `n-19` to `n` is the batch of 20
            start_batch_datetime = full_lagged.index[n - 19]
            end_batch_datetime = full_lagged.index[n]

            data.to_csv(predictions_file_path, index=True)
            print(f"Saved predictions. {i} new predictions have been made so far, from {start_batch_datetime} to {end_batch_datetime}-(from {start_n-10+i} to {start_n+9+i}).")

    # Final save
    data.to_csv(predictions_file_path, index=True)

    # Only claim success when no NaN prediction is left in the frame.
    remaining_missing = int(data['PredictedPrice'].isna().sum())
    if remaining_missing == 0:
        print(f"✅ All {num_missing} missing predictions have been generated and saved.")
    else:
        print(f"⚠️ {remaining_missing} rows still have no prediction after this run.")
# ---- Predict the last point (without saving) ----
# NOTE(review): when the loop above ran to the end, its final iteration already trained
# on the same rows with the same settings and predicted this same row.
last_index = len(full_lagged) - 1  # index of the last row
train_lagged = full_lagged.iloc[:-1]  # all except the last row
X_train = train_lagged.drop(columns=['ActualPrice'])
y_train = train_lagged['ActualPrice']

# Train model on all available actual data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict last point (one-row frame, names and dtypes preserved)
last_point = full_lagged.iloc[[-1]].drop(columns=['ActualPrice'])
last_pred = model.predict(last_point)[0]

print(f"🔮 Prediction for the last point ({full_lagged.index[-1]}): {last_pred}")

Found 7 rows with missing 'PredictedPrice' values.
Found 7 rows with missing predictions. Starting prediction process... 🤖
Starting prediction from index: 3488 for datetime: 2025-08-14 09:15:00
✅ All 7 missing predictions have been generated and saved.
🔮 Prediction for the last point (2025-08-14 15:15:00): 24633.52296875


In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    mean_squared_error
)

# Evaluate the stored predictions against the actual prices.
# numpy is imported here so the cell does not depend on an earlier cell's import.

# Step 1: Read the predictions CSV, indexed by its parsed 'Datetime' column
data = pd.read_csv("NIFTY_50_predictions.csv", parse_dates=['Datetime'], index_col='Datetime')

# Step 2: Keep only rows usable for scoring.
# Rows whose predicted price is -inf are excluded as before. Rows with NaN or +inf in
# either column are excluded as well, because the sklearn metrics below raise on any
# non-finite input.
finite_mask = np.isfinite(data['PredictedPrice']) & np.isfinite(data['ActualPrice'])
eval_df = data[finite_mask]

# Step 3: Extract actual and predicted values
y_true = eval_df['ActualPrice'].values
y_pred = eval_df['PredictedPrice'].values

# Step 4: Calculate evaluation metrics
rmse = root_mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

# Step 5: Mean Error (ME) calculation
# The Mean Error is the average of the signed differences (predicted - actual).
mean_error = np.mean(y_pred - y_true)

# Step 6: Adjusted R² calculation
n = len(y_true)
# NOTE(review): p counts the columns of the predictions file other than ActualPrice,
# not the number of lag features the model was trained on — confirm this is intended.
p = len(eval_df.columns) - 1 # Exclude ActualPrice
# Adjusted R² is undefined when there are not more samples than p + 1.
if n - p - 1 > 0:
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
else:
    adjusted_r2 = float('nan')

# Step 7: Print all metrics
print(f"➡️ MSE: {mse:.4f}")
print(f"✅ RMSE: {rmse:.4f}")
print(f"➡️ MAE: {mae:.4f}")
print(f"➡️ MAPE: {mape:.4%}")
print(f"✅ R² Score: {r2:.4f}")
print(f"➡️ Adjusted R²: {adjusted_r2:.4f}")
print(f"➡️ Mean Error: {mean_error:.4f}")

➡️ MSE: 3707.3376
✅ RMSE: 60.8879
➡️ MAE: 40.7526
➡️ MAPE: 0.1756%
✅ R² Score: 0.9986
➡️ Adjusted R²: 0.9986
➡️ Mean Error: -3.9620


In [9]:
from plot_data_util import plot_train_and_prediction_and_last_pred_point

# Plot the predictions frame with n=10, passing the standalone last-point prediction.
plot_train_and_prediction_and_last_pred_point(
    data,
    "NIFTY 50",
    last_pred=last_pred,
    n=10,
)


In [16]:
from plot_data_util import plot_train_and_prediction_and_last_pred_point

# Same plot as the previous cell with n=70.
# NOTE(review): this cell's execution count (16) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
plot_train_and_prediction_and_last_pred_point(
    data,
    "NIFTY 50",
    last_pred=last_pred,
    n=70,
)


In [10]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) and plot with n=35.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=35, ticker="NIFTY 50")

In [11]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) and plot with n=70.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=70, ticker="NIFTY 50")

In [12]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) and plot with n=100.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data, n=100, ticker="NIFTY 50")

In [13]:
from plot_data_util import plot_train_and_prediction

# Reload the predictions file (first column as a parsed index) and plot with the
# helper's default n.
data = pd.read_csv(
    "NIFTY_50_predictions.csv",
    index_col=0,
    parse_dates=True,
)
plot_train_and_prediction(data=data, ticker="NIFTY 50")