## Importing Packages

In [None]:
!pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=50833ff5abc50b995594214211153881dec54c4bd97f832d1a2a1f965a9387ae
  Stored in directory: /root/.cache/pip/wheels/5c/a1/5f/c6b85a7d9452057be4ce68a8e45d77ba34234a6d46581777c6
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import ta  # technical analysis library
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from copy import deepcopy


from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

## Data Engineering

In [None]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

files_saving_path = '/content/drive/MyDrive/Companies Interview Projects/Kotak Mahindra Bank/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# data downloading

# Download last 10 years of data
df = yf.download("^NSEI", start="2007-01-01", end="2025-10-04", interval="1d", auto_adjust=True)

# If columns are multiindex, flatten them
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [col[0] if col[1]=='' else f"{col[0]}" for col in df.columns.values]

[*********************100%***********************]  1 of 1 completed


In [None]:
# Adding derived features

# ---- 1. Lag features ----
df['Return_1d'] = df['Close'].pct_change()

# ---- 2. Moving Averages ----
df['SMA_5'] = df['Close'].rolling(5).mean()
df['SMA_20'] = df['Close'].rolling(20).mean()

# ---- 3. Volatility ----
df['Rolling_STD_10'] = df['Return_1d'].rolling(10).std()

# ---- 4. Momentum Indicators (RSI, MACD, Stochastic) ----
df['RSI_14'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()

# SMA differences to reduce VIF
df['Close_SMA5_diff'] = df['Close'] - df['SMA_5']
df['Close_SMA20_diff'] = df['Close'] - df['SMA_20']

In [None]:
# dropping volume column
df.drop(["Volume","Open","Low","High"],axis=1,inplace=True)

In [None]:
df.isna().sum()

Unnamed: 0,0
Close,0
Return_1d,1
SMA_5,4
SMA_20,19
Rolling_STD_10,10
RSI_14,13
Close_SMA5_diff,4
Close_SMA20_diff,19


In [None]:
df = df.dropna()

In [None]:
df.isna().sum()

Unnamed: 0,0
Close,0
Return_1d,0
SMA_5,0
SMA_20,0
Rolling_STD_10,0
RSI_14,0
Close_SMA5_diff,0
Close_SMA20_diff,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4408 entries, 2007-10-15 to 2025-10-03
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Close             4408 non-null   float64
 1   Return_1d         4408 non-null   float64
 2   SMA_5             4408 non-null   float64
 3   SMA_20            4408 non-null   float64
 4   Rolling_STD_10    4408 non-null   float64
 5   RSI_14            4408 non-null   float64
 6   Close_SMA5_diff   4408 non-null   float64
 7   Close_SMA20_diff  4408 non-null   float64
dtypes: float64(8)
memory usage: 309.9 KB


In [None]:
# separating x and y
x = df[["Return_1d","Rolling_STD_10","RSI_14","Close_SMA5_diff","Close_SMA20_diff"]]
y = df["Close"]

In [None]:
print(len(df))

4408


In [None]:
df.to_csv(files_saving_path +'out.csv')

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]

print(vif_data)

            Feature       VIF
0         Return_1d  1.619371
1    Rolling_STD_10  3.110062
2            RSI_14  3.469992
3   Close_SMA5_diff  2.656841
4  Close_SMA20_diff  2.469909


In [None]:
# train-test split

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# TimeSeries split
tscv = TimeSeriesSplit(n_splits=5)

# Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Hyperparameter grid
param_grid = {'ridge__alpha':[0.01, 0.1, 1, 10, 100]}

# Grid search
grid = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='neg_mean_squared_error')
grid.fit(x_train, y_train)

In [None]:
print("Best alpha:", grid.best_params_)
print("Best score:", -grid.best_score_)

Best alpha: {'ridge__alpha': 100}
Best score: 33263324.04594469


In [None]:
# Evaluate on test set
from sklearn.metrics import mean_squared_error, r2_score

y_pred = grid.predict(x_test)
print("Test MSE:", mean_squared_error(y_test, y_pred))
print("Test R2:", r2_score(y_test, y_pred))

Test MSE: 32620205.16257202
Test R2: 0.13684457378943427


In [None]:
# save the model
import joblib

joblib.dump(grid.best_estimator_, files_saving_path + 'nifty_ridge_model.pkl')
print("Model saved as nifty_ridge_model.pkl")

Model saved as nifty_ridge_model.pkl


In [None]:
import pandas as pd
import numpy as np
import joblib
import ta  # technical analysis library

def predict_next_close(model_path):
    """
    Predict next day's NIFTY Close.

    Parameters:
        last_ohlc_df: pd.DataFrame with columns ['Open', 'High', 'Low', 'Close', 'Volume']
                      Should contain at least 20 rows (most recent last row = today)
        model_path: path to saved Ridge regression pipeline

    Returns:
        predicted next day Close (float)
    """
    # Fetch last 20 trading days
    df = yf.download("^NSEI", period="25d")  # fetch a few extra days in case of weekends/holidays
    # If columns are multiindex, flatten them
    if isinstance(df.columns, pd.MultiIndex):
      df.columns = [col[0] if col[1]=='' else f"{col[0]}" for col in df.columns.values]

    # Keep only the last 20 rows
    df = df.tail(20).copy()

    # ---- 1. Lag & Returns ----
    df['Return_1d'] = df['Close'].pct_change()
    df['Lag_1'] = df['Close'].shift(1)

    # ---- 2. Moving Averages ----
    df['SMA_5'] = df['Close'].rolling(5).mean()
    df['SMA_20'] = df['Close'].rolling(20).mean()

    # ---- 3. Volatility ----
    df['Rolling_STD_10'] = df['Return_1d'].rolling(10).std()

    # ---- 4. Momentum Indicators ----
    df['RSI_14'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()

    # ---- 5. SMA Differences ----
    df['Close_SMA5_diff'] = df['Close'] - df['SMA_5']
    df['Close_SMA20_diff'] = df['Close'] - df['SMA_20']

    print(df.tail(1))

    # ---- 6. Select last row (most recent day) ----
    features = df[['Return_1d','Rolling_STD_10','RSI_14','Close_SMA5_diff','Close_SMA20_diff']].iloc[-1:]

    print(features.head())

    # ---- 7. Load saved pipeline ----
    model = joblib.load(model_path)

    # ---- 8. Predict next day Close ----
    next_close_pred = model.predict(features)

    return next_close_pred[0]

In [None]:
model_path = files_saving_path + 'nifty_ridge_model.pkl'
print(predict_next_close(model_path))

[*********************100%***********************]  1 of 1 completed

               Close          High           Low          Open  Volume  \
Date                                                                     
2025-10-03  24894.25  24904.800781  24747.550781  24759.550781  365900   

            Return_1d         Lag_1     SMA_5        SMA_20  Rolling_STD_10  \
Date                                                                          
2025-10-03   0.002333  24836.300781  24726.25  24990.769922        0.005179   

               RSI_14  Close_SMA5_diff  Close_SMA20_diff  
Date                                                      
2025-10-03  49.285955            168.0        -96.519922  
            Return_1d  Rolling_STD_10     RSI_14  Close_SMA5_diff  \
Date                                                                
2025-10-03   0.002333        0.005179  49.285955            168.0   

            Close_SMA20_diff  
Date                          
2025-10-03        -96.519922  
12549.351071366078



