### Importing the necessary libraries

In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from datetime import datetime
import yfinance as yf

: 

#### Date range over which the data is collected

In [2]:
# Date window (ISO "YYYY-MM-DD" strings) that main() hands to download_data
# as its start/end arguments for both tickers.
START_DATE = "2015-01-01"
END_DATE = "2026-12-31"

In [3]:
# Yahoo Finance symbols passed to yf.download. Index symbols carry a leading
# caret; without it the India VIX lookup does not resolve to the index.
NIFTY_TICKER = "^NSEI"
VIX_TICKER = "^INDIAVIX"

In [4]:
# Directory for the raw dataset output.
# NOTE(review): hard-coded, machine-specific absolute Windows path — confirm
# it exists on the machine running this notebook.
SAVE_DIR = "E:\\fourth_sem\\nifty_ml_hybrid\\datasets\\raw"

In [None]:
from ctypes.wintypes import MAX_PATH


def download_data(ticker, start, end):
    """Fetch daily-interval price history for one ticker via yfinance.

    Args:
        ticker: Symbol forwarded unchanged to ``yf.download``.
        start: Start of the requested window, forwarded as ``start``.
        end: End of the requested window, forwarded as ``end``.

    Returns:
        Whatever ``yf.download`` returns for the request (presumably a
        pandas DataFrame indexed by date — confirm against yfinance docs).
    """
    # Fixed request options: daily bars, unadjusted prices, no progress bar.
    request_options = {
        "start": start,
        "end": end,
        "interval": "1d",
        "auto_adjust": False,
        "progress": False,
    }
    return yf.download(ticker, **request_options)


# -----------------------------
# Clean Data
# -----------------------------
def clean_data(df):
    """Normalise a downloaded price table into a tidy, date-sorted frame.

    The input is copied first, so the caller's frame is never modified.
    The index is moved into a regular column, column labels are lower-cased
    with spaces replaced by underscores, the ``date`` column is parsed to
    datetimes, rows are sorted by date and duplicate dates are dropped
    (the first occurrence in the original row order is kept).

    Args:
        df: DataFrame whose index (or one of its columns) is named "Date"
            in any letter case. Columns may be a flat Index or a
            MultiIndex; for a MultiIndex only the first level is kept.

    Returns:
        A new DataFrame with a ``date`` column, normalised column names,
        ascending dates, unique dates and a fresh 0..n-1 index.

    Raises:
        KeyError: If no ``date`` column exists after normalisation.
    """
    df = df.copy()

    # Some yfinance releases return two-level columns even for a single
    # ticker; ``.str`` accessors do not work on a MultiIndex, so collapse to
    # the first level (assumed to hold the field names — single-ticker
    # downloads only).
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    df = df.reset_index()
    df.columns = [str(col).lower().replace(" ", "_") for col in df.columns]

    if "date" not in df.columns:
        raise KeyError(
            "clean_data expected a 'date' column after resetting the index; "
            f"found columns: {list(df.columns)}"
        )

    # Convert date column
    df["date"] = pd.to_datetime(df["date"])

    # Stable sort so that, among duplicate dates, the row that appeared
    # first in the input is the one drop_duplicates keeps.
    df = df.sort_values("date", kind="mergesort")
    df = df.drop_duplicates(subset="date")

    # Sorting/de-duplicating leaves a shuffled, gappy index; renumber it.
    return df.reset_index(drop=True)


# -----------------------------
# Main Pipeline
# -----------------------------
def main():
    """Download NIFTY-50 and India VIX history, merge them and save a CSV.

    Steps:
        1. Download and clean both series for START_DATE..END_DATE.
        2. Reduce the VIX frame to its date and close columns, renaming the
           close column to ``india_vix``.
        3. Left-join VIX onto the NIFTY rows by date and forward-fill any
           missing VIX values.
        4. Print a summary and write the result to a CSV inside SAVE_DIR
           (the directory is created if it does not exist).
    """
    # Local import so this function is self-contained regardless of the
    # notebook's import cell.
    import os

    print("Downloading NIFTY-50 data...")
    nifty = clean_data(download_data(NIFTY_TICKER, START_DATE, END_DATE))

    print("Downloading India VIX data...")
    vix = clean_data(download_data(VIX_TICKER, START_DATE, END_DATE))

    # Keep only close price of VIX. rename() returns a new frame, which
    # avoids mutating a slice in place (SettingWithCopyWarning).
    vix = vix[['date', 'close']].rename(columns={'close': 'india_vix'})

    # Merge datasets: every NIFTY row is kept, VIX is attached where present.
    merged = pd.merge(nifty, vix, on="date", how="left")

    # Forward fill VIX if missing (fillna(method=...) is deprecated).
    merged['india_vix'] = merged['india_vix'].ffill()

    # Final check. info() prints by itself and returns None, so it is not
    # wrapped in print().
    print("Final Dataset Info:")
    merged.info()

    # Save dataset under SAVE_DIR, creating the directory when needed.
    os.makedirs(SAVE_DIR, exist_ok=True)
    output_path = os.path.join(SAVE_DIR, "nifty_india_vix.csv")
    merged.to_csv(output_path, index=False)
    print(f"\nData saved at: {output_path}")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()