In [2]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

from scipy import stats

In [3]:
df = pd.read_csv("NSEI.csv")

##### Preparing data

Drop column 'Adj Close'

In [4]:
df.drop(columns="Adj Close")

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2025-02-14,23096.45,23133.70,22774.85,22929.25,254500.0
1,2025-02-13,23055.75,23235.50,22992.20,23031.40,265700.0
2,2025-02-12,23050.80,23144.70,22798.35,23045.25,279700.0
3,2025-02-11,23383.55,23390.05,22986.65,23071.80,268000.0
4,2025-02-10,23543.80,23568.60,23316.30,23381.60,234200.0
...,...,...,...,...,...,...
4266,2007-09-21,4752.95,4855.70,4733.70,4837.55,0.0
4267,2007-09-20,4734.85,4760.85,4721.15,4747.55,0.0
4268,2007-09-19,4550.25,4739.00,4550.25,4732.35,0.0
4269,2007-09-18,4494.10,4551.80,4481.55,4546.20,0.0


Changing data type

In [5]:
df['Date'] = pd.to_datetime(df['Date'])

##### Create Rules

Rule 1

In [6]:
# buy => 0
# sell => 1
# Buy if Open > Close otherwise sell
df['Rule 1'] = (df['Open'] > df['Close']).astype(int)

Rule 2

In [7]:
# Buy if Open = Low
# Sell if Open = High
# Buy if Open > Mean of High and Low

def ruleTwo(row):
    if row['Open'] == row['Low']:
        return 0
    elif row['Open'] == row['High']:
        return 1
    elif row['Open'] > np.mean([row['High'], row['Low']]):
        return 0
    return 1

df['Rule 2'] = df.apply(ruleTwo, axis=1)

TP

In [8]:
df['TP'] = df[['High', 'Low', 'Close']].mean(axis=1)

Rule 3

In [9]:
df['Rule 3'] = (df['TP'] < df['TP'].shift(1)).astype(int)

##### Create 'classifier'

In [10]:
df['classifier'] = stats.mode(df[['Rule 1', 'Rule 2', 'Rule 3']], axis=1).mode

In [18]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Rule 1,Rule 2,TP,Rule 3,classifier
0,2025-02-14,23096.45,23133.7,22774.85,22929.25,22929.25,254500.0,1,0,22945.933333,0,0
1,2025-02-13,23055.75,23235.5,22992.2,23031.4,23031.4,265700.0,1,1,23086.366667,0,1
2,2025-02-12,23050.8,23144.7,22798.35,23045.25,23045.25,279700.0,1,0,22996.1,1,1
3,2025-02-11,23383.55,23390.05,22986.65,23071.8,23071.8,268000.0,1,0,23149.5,0,0
4,2025-02-10,23543.8,23568.6,23316.3,23381.6,23381.6,234200.0,1,0,23422.166667,0,0


In [17]:
df.to_csv("NSEI (Rules).csv", index=False)

##### Split in Old and New NSEI.

In [11]:
dateSplit = pd.to_datetime('01-01-2025', format="%d-%m-%Y")
newNSEI = df[df['Date'] >= dateSplit]
oldNSEI = df[df['Date'] < dateSplit]

In [16]:
oldNSEI[['Rule 1', 'Rule 2', 'TP', 'Rule 3']].head()

Unnamed: 0,Rule 1,Rule 2,TP,Rule 3
33,0,1,23598.366667,1
34,1,0,23719.85,0
35,0,1,23850.95,0
36,1,0,23752.766667,1
37,1,1,23760.15,0


#### Hyper Parameter Tuning

In [None]:
params = [
	{
		"model": DecisionTreeClassifier(),
		"params": {
			"criterion": ["gini", "entropy"],
			"splitter": ["best", "random"],
			"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
			"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
			"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
		},
	},
	{
		"model": LogisticRegression(max_iter=1000),
		"params": {
			"penalty": ["l1", "l2"],
			"C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
			"solver": ["liblinear", "saga"],
			"max_iter": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
		},
	},
	{
		"model": KNeighborsClassifier(),
		"params": {
			"n_neighbors": [5, 7, 9, 11, 13, 15],
			"weights": ["uniform", "distance"],
			"metric": ["minkowski", "euclidean", "manhattan"]
		},
	},
	{
		"model": SVC(),
		"params": {
			"C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
			"kernel": ["linear", "poly", "rbf", "sigmoid"],
			"gamma": ["scale", "auto"],
			"decision_function_shape": ["ovo", "ovr"]
		}
	}
]

In [None]:
bestParams = []
tune = False
if tune:
	for param in params:
		gs = GridSearchCV(param['model'], param['params'], cv=3)
		gs.fit(newNSEI[['Open', 'High', 'Low', 'Close']], newNSEI['classifier'])
		bestParams.append({'param': gs.best_params_, 'score': gs.best_score_, 'model': param['model'] })

#### Model fitting

In [None]:
models = []
for i, param in enumerate(params):
    models.append(param['model'])
    models[i].fit(oldNSEI[['Open', 'High', 'Low', 'Close']], oldNSEI['classifier'])

In [None]:
for model in models:
    newNSEI[str(model)] = model.predict(newNSEI[['Open', 'High', 'Low', 'Close']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newNSEI[str(model)] = model.predict(newNSEI[['Open', 'High', 'Low', 'Close']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newNSEI[str(model)] = model.predict(newNSEI[['Open', 'High', 'Low', 'Close']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newNSEI[str(model)] = model.predict(newNSEI[['O

In [None]:
newNSEI[['classifier', str(models[0]), str(models[1]), str(models[2]), str(models[3])]]

Unnamed: 0,classifier,DecisionTreeClassifier(),LogisticRegression(max_iter=1000),KNeighborsClassifier(),SVC()
0,0,0,0,0,1
1,1,0,1,0,1
2,1,1,1,0,1
3,0,0,0,1,1
4,0,1,0,1,1
5,0,1,0,1,1
6,0,1,0,1,1
7,0,0,0,1,1
8,1,1,1,1,1
9,0,1,1,1,1


In [None]:
for model in models:
    print(model, (newNSEI[str(model)] == newNSEI['classifier']).sum() / newNSEI['classifier'].count(), sep=": ")
print()
for model in models:
    print(model, accuracy_score(newNSEI[str(model)], newNSEI['classifier']), sep=": ")

DecisionTreeClassifier(): 0.36363636363636365
LogisticRegression(max_iter=1000): 0.7272727272727273
KNeighborsClassifier(): 0.36363636363636365
SVC(): 0.42424242424242425

DecisionTreeClassifier(): 0.36363636363636365
LogisticRegression(max_iter=1000): 0.7272727272727273
KNeighborsClassifier(): 0.36363636363636365
SVC(): 0.42424242424242425
