In [1]:
import sys
print(sys.executable)

/opt/homebrew/Cellar/jupyterlab/4.3.6/libexec/bin/python


In [2]:
%pip install yfinance pandas numpy scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.6/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import yfinance as yf
import pandas as pd
import os

In [4]:
if os.path.exists("nifty50.csv"):
    nifty50 = pd.read_csv("nifty50.csv", index_col=0)
else:
    nifty50 = yf.Ticker("^NSEI")
    nifty50 = nifty50.history(period="max")
    nifty50.to_csv("nifty50.csv")
print(f"Initial rows: {nifty50.shape[0]}")

Initial rows: 4294


In [5]:
nifty50.index = pd.to_datetime(nifty50.index)
del nifty50["Dividends"]
del nifty50["Stock Splits"]

In [6]:
nifty50["Tomorrow"] = nifty50["Close"].shift(-1)
nifty50["Target"] = (nifty50["Tomorrow"] > nifty50["Close"]).astype(int)

In [7]:
nifty50


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-09-17 00:00:00+05:30,4518.450195,4549.049805,4482.850098,4494.649902,0,4546.200195,1
2007-09-18 00:00:00+05:30,4494.100098,4551.799805,4481.549805,4546.200195,0,4732.350098,1
2007-09-19 00:00:00+05:30,4550.250000,4739.000000,4550.250000,4732.350098,0,4747.549805,1
2007-09-20 00:00:00+05:30,4734.850098,4760.850098,4721.149902,4747.549805,0,4837.549805,1
2007-09-21 00:00:00+05:30,4752.950195,4855.700195,4733.700195,4837.549805,0,4932.200195,1
...,...,...,...,...,...,...,...
2025-03-17 00:00:00+05:30,22353.150391,22577.000000,22353.150391,22508.750000,251100,22834.300781,1
2025-03-18 00:00:00+05:30,22662.250000,22857.800781,22599.199219,22834.300781,272600,22907.599609,1
2025-03-19 00:00:00+05:30,22874.949219,22940.699219,22807.949219,22907.599609,324000,23190.650391,1
2025-03-20 00:00:00+05:30,23036.599609,23216.699219,22973.949219,23190.650391,313700,23350.400391,1


In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
train = nifty50.iloc[:-100]
test = nifty50.iloc[-100:]
predictors = ["Close", "Volume", "Open", "High", "Low"]
model.fit(train[predictors], train["Target"])

In [9]:
from sklearn.metrics import precision_score
preds = model.predict(test[predictors])
preds = pd.Series(preds, index=test.index, name="Predictions")
combined = pd.concat([test["Target"], preds], axis=1)

In [10]:
precision_score(test["Target"], preds)

0.3924050632911392

In [11]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined
    

In [12]:
def backtest(data, model, predictors, start=2500, step=250):
    all_predictions = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
    return pd.concat(all_predictions)

In [13]:
predictions = backtest(nifty50, model, predictors)

In [14]:
predictions["Predictions"].value_counts()

Predictions
1    1030
0     764
Name: count, dtype: int64

In [15]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5339805825242718

In [16]:
predictions["Target"].value_counts() / predictions.shape[0]

Target
1    0.544036
0    0.455964
Name: count, dtype: float64

In [17]:
horizons = [2, 5, 60, 250, 500]
new_predictors = []
for horizon in horizons:
    rolling_averages = nifty50.rolling(horizon).mean()
    ratio_column = f"Close_Ratio_{horizon}"
    nifty50.loc[:, ratio_column] = nifty50["Close"] / rolling_averages["Close"]
    trend_column = f"Trend_{horizon}"
    nifty50.loc[:, trend_column] = nifty50.shift(1).rolling(horizon).sum()["Target"]
    new_predictors += [ratio_column, trend_column]
print(f"Rows after adding predictors: {nifty50.shape[0]}")

Rows after adding predictors: 4294


In [18]:
essential_columns = ["Close", "Open", "High", "Low", "Volume", "Target"]
nifty50 = nifty50.dropna(subset=essential_columns)
print(f"Rows after dropna: {nifty50.shape[0]}")

Rows after dropna: 4294


In [19]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)

In [20]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds[preds >= .6] = 1
    preds[preds < .6] = 0
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [21]:
predictions["Predictions"].value_counts()

Predictions
1    1030
0     764
Name: count, dtype: int64

In [22]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5339805825242718

In [23]:
predictions["Target"].value_counts() / predictions.shape[0]

Target
1    0.544036
0    0.455964
Name: count, dtype: float64

In [24]:
import joblib


joblib.dump(model, "nifty50_model.pkl")


nifty50.to_csv("nifty50_updated.csv")



In [25]:
predictions


Unnamed: 0_level_0,Target,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-08 00:00:00+05:30,1,0
2017-12-11 00:00:00+05:30,0,1
2017-12-12 00:00:00+05:30,0,1
2017-12-13 00:00:00+05:30,1,1
2017-12-14 00:00:00+05:30,1,1
...,...,...
2025-03-17 00:00:00+05:30,1,1
2025-03-18 00:00:00+05:30,1,0
2025-03-19 00:00:00+05:30,1,0
2025-03-20 00:00:00+05:30,1,1
