In [None]:
import numpy as np
from xgboost import XGBRegressor
import pandas as pd
import os


ticker_to_basename = {
    # Indices
    "^NSEI": "NIFTY_50",
    "^NSEBANK": "NIFTY_BANK",
    "^CNXIT": "NIFTY_IT",
    "^CNXPHARMA": "NIFTY_PHARMA",
    "^CNXFMCG": "NIFTY_FMCG",
    "^CNXAUTO": "NIFTY_AUTO",
    "^CNXMETAL": "NIFTY_METAL",
    "^CNXREALTY": "NIFTY_REALTY",
    "^CNXENERGY": "NIFTY_ENERGY",
    "NIFTY_FIN_SERVICE.NS": "NIFTY_FIN_SERVICE",
    
    # Stocks
    "RELIANCE.NS": "RELIANCE_INDUSTRIES_LTD",
    "TCS.NS": "TATA_CONSULTANCY_SERV_LT", # Matched from your image
    "SUNPHARMA.NS": "SUN_PHARMACEUTICAL_IND_L",
    "ICICIBANK.NS": "ICICI_BANK_LTD.",
    "INFY.NS": "INFOSYS_LIMITED",
    "SBIN.NS": "STATE_BANK_OF_INDIA",
    "BHARTIARTL.NS": "BHARTI_AIRTEL_LIMITED",
    "ITC.NS": "ITC_LTD",
    "LT.NS": "LARSEN_&_TOUBRO_LTD.",
    "HINDUNILVR.NS": "HINDUSTAN_UNILEVER_LTD."
}

all_tickers = list(ticker_to_basename.keys())


raw_data_folder = r"X:\AGDrive\ADA-Timeseries project\Raw_Data"
predicted_data_folder = r"X:\AGDrive\ADA-Timeseries project\Pred_Data"
lagging_data_folder = r"X:\AGDrive\ADA-Timeseries project\Lag_Data"

In [3]:
equity_ticker_symbol = "^NSEI"
equity = ticker_to_basename[equity_ticker_symbol]
equity

'NIFTY_50'

In [4]:
equity = ticker_to_basename[equity_ticker_symbol]

raw_data_file = os.path.join(raw_data_folder, f"{equity}_data.csv")
predicted_data_file = os.path.join(predicted_data_folder, f"{equity}_predictions_xgboost.csv")
lagging_data_file = os.path.join(lagging_data_folder, f"{equity}_lagged.csv")

raw_data = pd.read_csv(raw_data_file,parse_dates=['Datetime'], index_col='Datetime')
predicted_data = pd.read_csv(predicted_data_file, parse_dates=['Datetime'], index_col='Datetime')
lagging_data = pd.read_csv(lagging_data_file, parse_dates=['Datetime'], index_col='Datetime')

# --- Print to verify ---
print(f"Raw Data File:     {raw_data_file}")
print(f"Predicted File:    {predicted_data_file}")
print(f"Lagging Data File: {lagging_data_file}")

Raw Data File:     X:\AGDrive\ADA-Timeseries project\Raw_Data\NIFTY_50_data.csv
Predicted File:    X:\AGDrive\ADA-Timeseries project\Pred_Data\NIFTY_50_predictions_xgboost.csv
Lagging Data File: X:\AGDrive\ADA-Timeseries project\Lag_Data\NIFTY_50_lagged.csv


In [5]:
from find_missing_predictions_util import find_missing_predictions
missing_rows = find_missing_predictions(predicted_data_file)
print(missing_rows)

No missing 'PredictedPrice' values found in the file.
(0, Empty DataFrame
Columns: []
Index: [])


In [6]:
# Check for missing predictions to determine the starting point
num_missing, missing_rows_df = find_missing_predictions(predicted_data_file)

if num_missing == 0:
    print("All predictions are already up-to-date. No new predictions needed. âœ…")
else:
    print(f"Found {num_missing} rows with missing predictions. Starting prediction process... ðŸ¤–")
    
    # Get the datetime of the first row with a missing prediction
    first_missing_datetime = missing_rows_df.index[0]
    
    # Get the datetime of the first row in the lagged data (after dropping NaNs)
    first_lagged_datetime = lagging_data.index[0]

    # Ensure both indices are datetime
    if not isinstance(first_missing_datetime, pd.Timestamp):
        first_missing_datetime = pd.to_datetime(first_missing_datetime)
    if not isinstance(first_lagged_datetime, pd.Timestamp):
        first_lagged_datetime = pd.to_datetime(first_lagged_datetime)
    start_datetime = max(first_missing_datetime, first_lagged_datetime)

    # find the integer index for this starting datetime in the lagged data.
    start_n = lagging_data.index.get_loc(start_datetime)

    print(f"Starting prediction from index: {start_n+10} for datetime: {start_datetime}")

    # The prediction loop now starts from the calculated index
    for i, n in enumerate(range(start_n, len(lagging_data)), 1):
        # Step 1: Prepare training data (use only latest 2000 points if available)
        if n > 2000:
            train_lagged = lagging_data.iloc[n-2000:n]
        else:
            train_lagged = lagging_data.iloc[:n]
            
        X_train = train_lagged.drop(columns=['ActualPrice'])
        y_train = train_lagged['ActualPrice']

        # Step 2: Train the model
        model = XGBRegressor(
            n_estimators=650, 
            random_state=42,
            colsample_bytree=0.8,
            learning_rate=0.06,
            max_depth=6,
            n_jobs = -1
        )
        model.fit(X_train, y_train)

        # Step 3: Predict the next point
        test_point = lagging_data.iloc[n]
        X_test_point = test_point.drop('ActualPrice').to_frame().T
        y_pred = model.predict(X_test_point)[0]

        # Step 4: Store prediction
        pred_index = lagging_data.index[n]
        if pred_index in predicted_data.index:
            predicted_data.at[pred_index, 'PredictedPrice'] = y_pred

        # Step 5: Save every 20 steps
        if i % 20 == 0:
            # Determine the datetime range of the saved batch
            start_batch_datetime = lagging_data.index[n - 19]
            end_batch_datetime = lagging_data.index[n]
            
            predicted_data.to_csv(predicted_data_file, index=True)
            print(f"Saved predictions. {i} new predictions have been made so far, "
                  f"from {start_batch_datetime} to {end_batch_datetime}-(from {start_n-10+i} to {start_n+9+i}).")
            
    # Final save
    predicted_data.to_csv(predicted_data_file, index=True)
    print(f"âœ… All {num_missing} missing predictions have been generated and saved.")

# ---- Predict the last point (without saving) ----
last_index = len(lagging_data) - 1  # index of the last row
# Use only last 2000 rows if available
train_lagged = lagging_data.iloc[-2000:-1] if len(lagging_data) > 2000 else lagging_data.iloc[:-1]
X_train = train_lagged.drop(columns=['ActualPrice'])
y_train = train_lagged['ActualPrice']

model = XGBRegressor(
    n_estimators=650, 
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.06,
    max_depth=6,
    n_jobs = -1
)
model.fit(X_train, y_train)


# Predict last point
last_point = lagging_data.iloc[-1].drop('ActualPrice').to_frame().T
last_pred = model.predict(last_point)[0]

print(f"ðŸ”® Prediction for the last point ({lagging_data.index[-1]}): {last_pred}")

No missing 'PredictedPrice' values found in the file.
All predictions are already up-to-date. No new predictions needed. âœ…
ðŸ”® Prediction for the last point (2025-11-12 10:15:00): 25819.91796875


In [8]:
from plot_data_util import plot_train_and_prediction_and_last_pred_point
plot_train_and_prediction_and_last_pred_point(predicted_data, equity, n=35, last_pred=last_pred)