In [2]:
import pandas as pd

candle_stick_data = pd.read_csv("dataset/nifty50_candlestick_data.csv")
candle_stick_data["datetime"] = pd.to_datetime(candle_stick_data["Date"] + " " + candle_stick_data["Time"], format="%d-%m-%Y %H:%M:%S")
candle_stick_data.set_index("datetime", inplace=True)
candle_stick_data.drop(columns=["Date", "Time", "High", "Low", "Close", "Instrument"], inplace=True, errors="ignore")

n50_minute_level_opens = candle_stick_data
n50_minute_level_opens.head()

Unnamed: 0_level_0,Open
datetime,Unnamed: 1_level_1
2015-01-09 09:15:00,8285.45
2015-01-09 09:16:00,8292.6
2015-01-09 09:17:00,8287.4
2015-01-09 09:18:00,8294.25
2015-01-09 09:19:00,8300.6


In [3]:
market_hours_filter = (n50_minute_level_opens.index.time >= pd.Timestamp('09:15:00').time()) & \
                      (n50_minute_level_opens.index.time <= pd.Timestamp('15:30:00').time())

n50_min_opens = n50_minute_level_opens[market_hours_filter].copy()

n50_min_opens['date'] = n50_min_opens.index.date
n50_min_opens['time'] = n50_min_opens.index.strftime('%H:%M')

n50_daily_opens = n50_min_opens.pivot_table(
    index='date',
    columns='time',
    values='Open',
    aggfunc='first'  # In case there are duplicates, take the first value
)

n50_daily_opens.head()

time,09:15,09:16,09:17,09:18,09:19,09:20,09:21,09:22,09:23,09:24,...,15:20,15:21,15:22,15:23,15:24,15:25,15:26,15:27,15:28,15:29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,8285.45,8292.6,8287.4,8294.25,8300.6,8300.5,8300.65,8302.45,8294.85,8295.2,...,8280.8,8282.35,8283.4,8284.35,8286.9,8286.65,8283.45,8282.35,8283.25,8280.5
2015-01-12,8291.35,8254.2,8255.25,8258.15,8263.2,8267.45,8266.05,8268.8,8273.85,8266.75,...,8329.5,8326.55,8328.05,8328.05,8327.2,8330.2,8330.9,8329.95,8329.95,8328.85
2015-01-13,8346.15,8355.15,8348.7,8344.5,8342.5,8340.35,8339.75,8340.45,8333.3,8326.05,...,8304.9,8305.75,8306.5,8307.15,8308.0,8308.2,8308.25,8307.25,8305.85,8308.2
2015-01-14,8307.25,8300.85,8307.0,8309.05,8305.4,8304.7,8302.2,8293.1,8296.7,8306.85,...,8280.1,8278.9,8280.9,8283.6,8284.3,8285.35,8285.5,8286.95,8288.3,8288.9
2015-01-15,8425.2,8440.45,8394.35,8386.05,8401.1,8428.0,8408.25,8398.0,8416.7,8421.95,...,8497.6,8491.8,8482.05,8477.25,8468.0,8463.8,8469.05,8464.8,8467.25,8467.45


In [None]:
# Calculate percentage price movements within each day
# For each day, calculate percentage change from previous minute
n50_daily_price_movements = n50_daily_opens.pct_change(axis=1, fill_method=None) * 100

# Set the first column (first minute of each day) to 0 as there's no reference price
n50_daily_price_movements.iloc[:, 0] = 0

Price movements DataFrame shape: (2273, 375)
NaN values remaining: 0


time,09:15,09:16,09:17,09:18,09:19,09:20,09:21,09:22,09:23,09:24,...,15:20,15:21,15:22,15:23,15:24,15:25,15:26,15:27,15:28,15:29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,0.0,0.09,-0.06,0.08,0.08,-0.0,0.0,0.02,-0.09,0.0,...,-0.01,0.02,0.01,0.01,0.03,-0.0,-0.04,-0.01,0.01,-0.03
2015-01-12,0.0,-0.45,0.01,0.04,0.06,0.05,-0.02,0.03,0.06,-0.09,...,0.04,-0.04,0.02,0.0,-0.01,0.04,0.01,-0.01,0.0,-0.01
2015-01-13,0.0,0.11,-0.08,-0.05,-0.02,-0.03,-0.01,0.01,-0.09,-0.09,...,0.08,0.01,0.01,0.01,0.01,0.0,0.0,-0.01,-0.02,0.03
2015-01-14,0.0,-0.08,0.07,0.02,-0.04,-0.01,-0.03,-0.11,0.04,0.12,...,0.08,-0.01,0.02,0.03,0.01,0.01,0.0,0.02,0.02,0.01
2015-01-15,0.0,0.18,-0.55,-0.1,0.18,0.32,-0.23,-0.12,0.22,0.06,...,-0.04,-0.07,-0.11,-0.06,-0.11,-0.05,0.06,-0.05,0.03,0.0


In [13]:
# Fill NaN values with previous value (forward fill along rows)
n50_daily_price_movements = n50_daily_price_movements.ffill(axis=1)

# Round to 2 decimal places
n50_daily_price_movements = n50_daily_price_movements.round(2)

print(f"Price movements DataFrame shape: {n50_daily_price_movements.shape}")
print(f"NaN values remaining: {n50_daily_price_movements.isna().sum().sum()}")

n50_daily_price_movements.head()

Price movements DataFrame shape: (2273, 375)
NaN values remaining: 0


time,09:15,09:16,09:17,09:18,09:19,09:20,09:21,09:22,09:23,09:24,...,15:20,15:21,15:22,15:23,15:24,15:25,15:26,15:27,15:28,15:29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,0.0,0.09,-0.06,0.08,0.08,-0.0,0.0,0.02,-0.09,0.0,...,-0.01,0.02,0.01,0.01,0.03,-0.0,-0.04,-0.01,0.01,-0.03
2015-01-12,0.0,-0.45,0.01,0.04,0.06,0.05,-0.02,0.03,0.06,-0.09,...,0.04,-0.04,0.02,0.0,-0.01,0.04,0.01,-0.01,0.0,-0.01
2015-01-13,0.0,0.11,-0.08,-0.05,-0.02,-0.03,-0.01,0.01,-0.09,-0.09,...,0.08,0.01,0.01,0.01,0.01,0.0,0.0,-0.01,-0.02,0.03
2015-01-14,0.0,-0.08,0.07,0.02,-0.04,-0.01,-0.03,-0.11,0.04,0.12,...,0.08,-0.01,0.02,0.03,0.01,0.01,0.0,0.02,0.02,0.01
2015-01-15,0.0,0.18,-0.55,-0.1,0.18,0.32,-0.23,-0.12,0.22,0.06,...,-0.04,-0.07,-0.11,-0.06,-0.11,-0.05,0.06,-0.05,0.03,0.0


In [14]:
# Calculate daily standard deviation for each trading day
daily_std = n50_daily_price_movements.std(axis=1)  # std across columns (time) for each day

print(f"Daily std statistics:")
print(f"Mean daily std: {daily_std.mean():.4f}%")
print(f"Std of daily std: {daily_std.std():.4f}%")
print(f"Min daily std: {daily_std.min():.4f}%")
print(f"Max daily std: {daily_std.max():.4f}%")

# Calculate the mean and std of daily standard deviations
mean_daily_std = daily_std.mean()
std_daily_std = daily_std.std()

# Define the acceptable range (±2σ)
lower_bound = mean_daily_std - 2 * std_daily_std
upper_bound = mean_daily_std + 2 * std_daily_std

print(f"\nAcceptable daily std range: {lower_bound:.4f}% to {upper_bound:.4f}%")

# Filter days that fall within ±2σ of mean daily std
days_within_2sigma = (daily_std >= lower_bound) & (daily_std <= upper_bound)

print(f"\nDays analysis:")
print(f"Total days before filtering: {len(n50_daily_price_movements)}")
print(f"Days within ±2σ: {days_within_2sigma.sum()}")
print(f"Days to remove: {len(n50_daily_price_movements) - days_within_2sigma.sum()}")
print(f"Percentage kept: {days_within_2sigma.sum() / len(n50_daily_price_movements) * 100:.2f}%")

# Apply the filter
n50_daily_price_movements_filtered = n50_daily_price_movements[days_within_2sigma]
n50_daily_opens_filtered = n50_daily_opens[days_within_2sigma]

print(f"\nFiltered dataset shape:")
print(f"Price movements: {n50_daily_price_movements_filtered.shape}")
print(f"Daily opens: {n50_daily_opens_filtered.shape}")

# Show some examples of removed days (outliers)
outlier_days = n50_daily_price_movements[~days_within_2sigma]
if len(outlier_days) > 0:
    print(f"\nExamples of removed days (high/low volatility):")
    print(f"Highest volatility day: {daily_std.idxmax()} (std: {daily_std.max():.4f}%)")
    print(f"Lowest volatility day: {daily_std.idxmin()} (std: {daily_std.min():.4f}%)")

n50_daily_price_movements_filtered.head()

Daily std statistics:
Mean daily std: 0.0351%
Std of daily std: 0.0211%
Min daily std: 0.0104%
Max daily std: 0.4735%

Acceptable daily std range: -0.0070% to 0.0772%

Days analysis:
Total days before filtering: 2273
Days within ±2σ: 2221
Days to remove: 52
Percentage kept: 97.71%

Filtered dataset shape:
Price movements: (2221, 375)
Daily opens: (2221, 375)

Examples of removed days (high/low volatility):
Highest volatility day: 2020-03-13 (std: 0.4735%)
Lowest volatility day: 2024-03-02 (std: 0.0104%)


time,09:15,09:16,09:17,09:18,09:19,09:20,09:21,09:22,09:23,09:24,...,15:20,15:21,15:22,15:23,15:24,15:25,15:26,15:27,15:28,15:29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,0.0,0.09,-0.06,0.08,0.08,-0.0,0.0,0.02,-0.09,0.0,...,-0.01,0.02,0.01,0.01,0.03,-0.0,-0.04,-0.01,0.01,-0.03
2015-01-12,0.0,-0.45,0.01,0.04,0.06,0.05,-0.02,0.03,0.06,-0.09,...,0.04,-0.04,0.02,0.0,-0.01,0.04,0.01,-0.01,0.0,-0.01
2015-01-13,0.0,0.11,-0.08,-0.05,-0.02,-0.03,-0.01,0.01,-0.09,-0.09,...,0.08,0.01,0.01,0.01,0.01,0.0,0.0,-0.01,-0.02,0.03
2015-01-14,0.0,-0.08,0.07,0.02,-0.04,-0.01,-0.03,-0.11,0.04,0.12,...,0.08,-0.01,0.02,0.03,0.01,0.01,0.0,0.02,0.02,0.01
2015-01-15,0.0,0.18,-0.55,-0.1,0.18,0.32,-0.23,-0.12,0.22,0.06,...,-0.04,-0.07,-0.11,-0.06,-0.11,-0.05,0.06,-0.05,0.03,0.0


In [None]:
# Split into training and validation sets
# Every 9th day goes to validation, rest goes to training
total_days = len(n50_daily_price_movements_filtered)

# Create boolean masks for train/validation split
validation_mask = [(i % 9 == 8) for i in range(total_days)]  # Every 9th day (0-indexed, so 8th position)
training_mask = [not val for val in validation_mask]

# Split the datasets
train_price_movements = n50_daily_price_movements_filtered[training_mask]
val_price_movements = n50_daily_price_movements_filtered[validation_mask]

train_daily_opens = n50_daily_opens_filtered[training_mask]
val_daily_opens = n50_daily_opens_filtered[validation_mask]



Dataset split summary:
Total filtered days: 2221
Training days: 1975 (88.9%)
Validation days: 246 (11.1%)

Training set shape:
Price movements: (1975, 375)
Daily opens: (1975, 375)

Validation set shape:
Price movements: (246, 375)
Daily opens: (246, 375)

Sample training dates (first 5):
[datetime.date(2015, 1, 9), datetime.date(2015, 1, 12), datetime.date(2015, 1, 13), datetime.date(2015, 1, 14), datetime.date(2015, 1, 15)]

Sample validation dates (first 5):
[datetime.date(2015, 1, 22), datetime.date(2015, 2, 5), datetime.date(2015, 2, 19), datetime.date(2015, 3, 4), datetime.date(2015, 3, 18)]

Validation check:
Overlap between train and validation: 0 days
Total unique dates: 2221


time,09:15,09:16,09:17,09:18,09:19,09:20,09:21,09:22,09:23,09:24,...,15:20,15:21,15:22,15:23,15:24,15:25,15:26,15:27,15:28,15:29
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,0.0,0.09,-0.06,0.08,0.08,-0.0,0.0,0.02,-0.09,0.0,...,-0.01,0.02,0.01,0.01,0.03,-0.0,-0.04,-0.01,0.01,-0.03
2015-01-12,0.0,-0.45,0.01,0.04,0.06,0.05,-0.02,0.03,0.06,-0.09,...,0.04,-0.04,0.02,0.0,-0.01,0.04,0.01,-0.01,0.0,-0.01
2015-01-13,0.0,0.11,-0.08,-0.05,-0.02,-0.03,-0.01,0.01,-0.09,-0.09,...,0.08,0.01,0.01,0.01,0.01,0.0,0.0,-0.01,-0.02,0.03
2015-01-14,0.0,-0.08,0.07,0.02,-0.04,-0.01,-0.03,-0.11,0.04,0.12,...,0.08,-0.01,0.02,0.03,0.01,0.01,0.0,0.02,0.02,0.01
2015-01-15,0.0,0.18,-0.55,-0.1,0.18,0.32,-0.23,-0.12,0.22,0.06,...,-0.04,-0.07,-0.11,-0.06,-0.11,-0.05,0.06,-0.05,0.03,0.0


In [None]:
# Save the datasets to CSV files in the dataset directory
import os

# Create dataset directory if it doesn't exist
os.makedirs('dataset', exist_ok=True)

# Save training datasets
train_price_movements.to_csv('dataset/train_price_movements.csv')
train_daily_opens.to_csv('dataset/train_daily_opens.csv')

# Save validation datasets
val_price_movements.to_csv('dataset/val_price_movements.csv')
val_daily_opens.to_csv('dataset/val_daily_opens.csv')

print("Datasets saved successfully!")


Datasets saved successfully!

Saved files:
- dataset/train_price_movements.csv ((1975, 375))
- dataset/train_daily_opens.csv ((1975, 375))
- dataset/val_price_movements.csv ((246, 375))
- dataset/val_daily_opens.csv ((246, 375))
- dataset/train_price_movements.csv: 3.77 MB
- dataset/train_daily_opens.csv: 5.81 MB
- dataset/val_price_movements.csv: 0.47 MB
- dataset/val_daily_opens.csv: 0.72 MB
