In [10]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

# Reproducibility
# The script draws from two independent generators: NumPy's global RNG
# (np.random.randint / np.random.normal) and the standard-library `random`
# module (random.sample / random.choice / random.randint). Both must be
# seeded, otherwise the generated rows differ from run to run.
np.random.seed(42)
random.seed(42)

# Possible values
# Pool of 500 train numbers drawn from 12000..12999 (upper bound exclusive);
# duplicates are possible.
train_numbers = np.random.randint(12000, 13000, 500)
train_types = ['Express', 'Superfast', 'Rajdhani', 'Shatabdi', 'Mail', 'Vande Bharat', 'Duranto']
# Station codes used as journey endpoints.
stations = ['NDLS', 'HWH', 'PNBE', 'UMB', 'CSMT', 'CNB', 'DDU', 'MAS', 'BZA', 'SBC']
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Accumulates one list per generated journey.
data = []

# Predefined realistic distances (approx in km)
# Keyed by (source, destination); every pair is listed in both directions
# with the same value so lookups are direction-independent.
route_distance = {
    ('NDLS', 'HWH'): 1450,
    ('HWH', 'NDLS'): 1450,
    ('NDLS', 'PNBE'): 1000,
    ('PNBE', 'NDLS'): 1000,
    ('CSMT', 'MAS'): 1270,
    ('MAS', 'CSMT'): 1270,
    ('NDLS', 'UMB'): 250,
    ('UMB', 'NDLS'): 250,
    ('CNB', 'DDU'): 340,
    ('DDU', 'CNB'): 340,
    ('SBC', 'MAS'): 360,
    ('MAS', 'SBC'): 360,
    ('BZA', 'MAS'): 430,
    ('MAS', 'BZA'): 430,
}


# Distances drawn for routes that are not in route_distance, keyed by the
# unordered station pair so that A->B and B->A share one value and every row
# for the same route reports the same distance.
fallback_distance = {}

# Generate 500 synthetic journeys; each iteration appends one row to `data`.
for _ in range(500):
    # Two distinct stations, so source != destination.
    source, destination = random.sample(stations, 2)
    train_type = random.choice(train_types)
    day = random.choice(days)
    month = random.randint(1, 12)

    # Fixed distance per route: use the predefined table when available,
    # otherwise draw a distance once and reuse it for that station pair.
    route = (source, destination)
    if route in route_distance:
        distance = route_distance[route]
    else:
        pair = frozenset(route)
        if pair not in fallback_distance:
            fallback_distance[pair] = random.randint(300, 1800)
        distance = fallback_distance[pair]

    # Base delay: normally distributed noise (mean 10, std 8); may be negative.
    base_delay = np.random.normal(10, 8)

    # Train type effect
    # Membership test, not equality: comparing the string to a list with
    # `==` is always False and would silently drop the +20 penalty.
    if train_type in ('Mail', 'Express'):
        base_delay += 20
    elif train_type in ('Rajdhani', 'Vande Bharat', 'Duranto'):
        base_delay -= 8
    # 'Superfast' and 'Shatabdi' get no adjustment.

    # Weekend effect
    if day in ('Saturday', 'Sunday'):
        base_delay += 10

    # Distance proportional delay: 5 minutes per 500 km.
    distance_delay = (distance / 500) * 5

    # Seasonal effect
    if month in (12, 1):              # Winter fog
        seasonal_delay = 30
    elif month in (7, 8, 9):          # Monsoon
        seasonal_delay = 20
    elif month in (4, 5, 6):          # Summer
        seasonal_delay = 5
    else:
        seasonal_delay = 0

    # Total delay, truncated to whole minutes and clamped so it is never
    # negative.
    delay = base_delay + distance_delay + seasonal_delay
    delay = max(0, int(delay))

    # Field order must match the DataFrame column order used downstream.
    data.append([
        random.choice(train_numbers),
        train_type,
        source,
        destination,
        distance,
        day,
        month,
        delay,
    ])


# Column names, in the same order as the fields appended to each row of `data`.
columns = [
    'train_number',
    'train_type',
    'source',
    'destination',
    'distance_km',
    'day_of_week',
    'month',
    'delay_minutes',
]

df = pd.DataFrame(data, columns=columns)

# Save CSV
# to_csv does not create missing directories and raises OSError instead, so
# make sure the target folder exists before writing.
output_path = Path("../data/raw/train_delay.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Preview the first rows (rendered as the notebook cell output).
df.head()


Unnamed: 0,train_number,train_type,source,destination,distance_km,day_of_week,month,delay_minutes
0,12574,Shatabdi,MAS,NDLS,1086,Monday,7,45
1,12794,Superfast,NDLS,CNB,693,Thursday,3,8
2,12951,Superfast,PNBE,HWH,843,Sunday,8,37
3,12957,Vande Bharat,HWH,DDU,1283,Saturday,4,23
4,12452,Rajdhani,CSMT,SBC,1476,Saturday,10,18
