In [13]:
import time as time
from random import random

from pandas import DataFrame, concat, read_csv

In [14]:
# This script assumes that the time gap between any two stations always stays the same.

In [15]:
# Stations shared by both lines, from CSMT up to KALYAN where the lines split
_trunk_stations = [
  'CSMT', 'MASJID', 'SANDHURST ROAD', 'BYCULLA', 'CHINCHPOKLI', 'CURREY ROAD',
  'PAREL', 'DADAR', 'MATUNGA', 'SION', 'KURLA', 'VIDYAVIHAR', 'GHATKOPAR',
  'VIKHROLI', 'KANJUR MARG', 'BHANDUP', 'NAHUR', 'MULUND', 'THANE', 'KALVA',
  'MUMBRA', 'DIVA JN', 'KOPAR', 'DOMBIVLI', 'THAKURLI', 'KALYAN',
]

# Stations after KALYAN on the branch that terminates at KASARA
_kasara_branch = [
  'SHAHAD', 'AMBIVLI', 'TITWALA', 'KHADAVLI', 'VASIND', 'ASANGAON', 'ATGAON',
  'THANSIT', 'KHARDI', 'UMBERMALI', 'KASARA',
]

# Stations after KALYAN on the branch that terminates at KHOPOLI
_khopoli_branch = [
  'VITHALWADI', 'ULHAS NAGAR', 'AMBERNATH', 'BADLAPUR', 'VANGANI', 'SHELU',
  'NERAL', 'BHIVPURI ROAD', 'KARJAT', 'PALASDHARI', 'KELAVLI', 'DOLAVLI',
  'LOWJEE', 'KHOPOLI',
]

# Full end-station to end-station routes; each entry is its own list object
end_to_end_routes = [
  _trunk_stations + _kasara_branch,
  _trunk_stations + _khopoli_branch,
]

In [16]:
# Gaps between consecutive stations on the part both lines have in common
# (25 gaps for the 26 stations CSMT .. KALYAN)
_trunk_gaps = [
  3, 2, 3, 2, 2, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 5, 4, 6, 4, 5, 3, 4, 7,
]

# Gaps from KALYAN onwards along the KASARA branch
_kasara_gaps = [4, 3, 6, 7, 7, 9, 9, 4, 6, 5, 13]

# Gaps from KALYAN onwards along the KHOPOLI branch
_khopoli_gaps = [4, 3, 4, 7, 9, 4, 4, 7, 9, 5, 7, 3, 4, 6]

# Time between stations: entry k of a list is the gap between station k and
# station k + 1 of the route at the same position in end_to_end_routes
time_between_stations = [
  _trunk_gaps + _kasara_gaps,
  _trunk_gaps + _khopoli_gaps,
]

In [17]:
# Lookup of (start, end) station pairs -> route info; starts out empty and is
# filled for every station pair that shares an end-to-end route
routes = {}

In [18]:
# Lookup of adjacent station pairs -> time gap between them; starts out empty
time_delay = {}

In [19]:
# Walk every end-to-end line together with its position in the master list
for line_no, line_stations in enumerate(end_to_end_routes):

  # Consider every ordered pair of stations on this line
  for src_pos, src in enumerate(line_stations):
    for dst_pos, dst in enumerate(line_stations):

      # A station has no route to itself
      if src == dst:
        continue

      # Remember which line serves the pair and where each station sits on
      # that line, for both directions of travel
      routes[src, dst] = line_no, src_pos, dst_pos
      routes[dst, src] = line_no, dst_pos, src_pos

      # Neighbouring stations also get their time gap recorded; the key is an
      # unordered pair so the lookup works whichever way the train travels
      if abs(src_pos - dst_pos) == 1:
        time_delay[frozenset([src, dst])] = (
          time_between_stations[line_no][min(src_pos, dst_pos)]
        )

In [20]:
# Build the time table: for every train listed in the input file, write one
# row per station on its route together with the scheduled time at that station
with open('train_data.csv', 'w') as out:

  # Write header in output
  out.write('start,end,start_time,speed,station,time\n')

  # Open the input file
  with open('input_for_generate_train_data.txt') as inp:

    # For all lines in the file (read lazily, one line at a time)
    for line in inp:

      # Skip blank lines, e.g. an empty line at the end of the file
      line = line.strip()
      if not line:
        continue

      # Extract start, end and departure time (HHMM) from line.
      # The raw departure must NOT be bound to the name `time`: that would
      # shadow the `time` module imported at the top of the notebook and break
      # every later cell that calls time.mktime / time.strftime.
      start, end, departure = (field.strip() for field in line.split(','))
      start, end = start.upper(), end.upper()

      # Extract route info (raises KeyError when the two stations do not
      # share an end-to-end route)
      a, i, j = routes[start, end]

      # Stations in travel order: a forward slice when moving towards higher
      # indices, otherwise the same span reversed
      if i < j:
        route = end_to_end_routes[a][i: j+1]
      else:
        route = end_to_end_routes[a][j: i+1][::-1]

      # Format the time
      start_time = f'{departure[:2]}:{departure[2:]}'

      # Train identifier; the speed column is always 'S'
      # NOTE(review): presumably 'S' means a slow train - confirm
      train_id = f'{start},{end},{start_time},S'

      # Write info for first station
      out.write(f'{train_id},{start},{start_time}\n')

      # Extract hour and minute from time
      hour, minute = int(departure[:2]), int(departure[2:])

      # Store the previous station
      prev_station = start

      # For all stations except for first
      for station in route[1:]:

        # Add minute delay
        minute += time_delay[frozenset([station, prev_station])]

        # Current station becomes previous_station for next station
        prev_station = station

        # Carry whole hours out of the minutes (works for any gap length) and
        # wrap the hour around midnight
        carry, minute = divmod(minute, 60)
        hour = (hour + carry) % 24

        # Write output
        out.write(f'{train_id},{station},{hour:02}:{minute:02}\n')

In [None]:
# Maximum delay (in minutes) that a train can be late for
MAX_DELAY = 3

# Relative weight of each possible delay: MAX_DELAY + 1 for the first delay
# value, decreasing by one down to 1 for the last
parts = list(range(MAX_DELAY + 1, 0, -1))

# Delay values, position by position matching the weights above
delay_list = list(range(MAX_DELAY+1))

# Sum all the parts
total = sum(parts)

# Cumulative probability thresholds, one per delay value.
# The integer weights are accumulated first and divided once per entry:
# summing the float fractions instead drifts (0.4 + 0.3 + 0.2 + 0.1 gives
# 0.9999999999999999), which leaves a sliver of [0, 1) covered by no threshold.
prob_threshold = []
cumulative = 0

# For all parts
for p in parts:

  # Append the cumulative threshold to the list
  cumulative += p
  prob_threshold.append(cumulative / total)

In [None]:
# This function adds random delay to the time passed
def random_delay(y, mo, d, h, mi, thresholds=None, delays=None):
  """Return the given local date/time as epoch seconds plus a random delay.

  A number r is drawn from [0, 1); the delay is the entry of `delays` whose
  cumulative threshold is the first one greater than r.

  Args:
    y, mo, d: year, month and day of the scheduled time.
    h, mi: hour and minute of the scheduled time (ints or numeric strings).
    thresholds: increasing cumulative probabilities, one per delay value.
      Defaults to the notebook-level `prob_threshold`.
    delays: delay in minutes for each threshold.
      Defaults to the notebook-level `delay_list`.

  Returns:
    Seconds since the epoch (float) of the scheduled local time plus the
    drawn delay.
  """

  # Fall back to the notebook-level distribution when none is supplied
  if thresholds is None:
    thresholds = prob_threshold
  if delays is None:
    delays = delay_list

  # Pick a random number in [0, 1)
  r = random()

  # Default to the last delay so the result is always defined, even when
  # rounding leaves the final threshold slightly below 1 and r lands above it
  delay = delays[-1]

  # The first threshold that r falls under decides the delay
  for candidate, prob in zip(delays, thresholds):
    if r < prob:
      delay = candidate
      break

  # Return the time after adding delay (minutes -> seconds)
  scheduled = time.mktime(
    time.strptime(f'{y} {mo} {d} {h} {mi}', '%Y %m %d %H %M')
  )
  return scheduled + delay * 60

In [None]:
# Read train_data csv as time table tt
tt = read_csv('train_data.csv')

# Year is 2020
y = 2020

# Number of days in each simulated month (2020 is a leap year, so Feb has 29)
days_in_month = {1: 31, 2: 29, 3: 31, 4: 30}

# One frame per simulated date. They are joined once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
daily_frames = []

# Iterate month from Jan to Apr
for mo in range(1, 5):

  # Iterate for all days in the month
  for d in range(1, days_in_month[mo] + 1):

    print(f'{y} {mo} {d}')

    # Make a copy of time table
    df = tt.copy()

    # Add random delay to time table time
    # NOTE(review): time table times that wrapped past midnight are stamped
    # with this same calendar date rather than the following one - confirm
    # whether that is intended
    df['actual_time'] = df['time'].apply(
                          lambda t: random_delay(y, mo, d, *t.split(':'))
                        )

    # Make actual time human readable
    df['actual_time_str'] = df['actual_time'].apply(
                              lambda t: time.strftime(
                                '%Y/%m/%d %H:%M', time.localtime(t)
                              )
                            )

    # Keep this date's data for the final join
    daily_frames.append(df)

# Join all dates into the final output data with a fresh 0..n-1 index
final_data = concat(daily_frames, ignore_index=True)

In [None]:
# Add random crowd data: one independent draw from [0, 1) per row.
# Assigning a bare random() would broadcast a single number to every row.
final_data['crowd'] = [random() for _ in range(len(final_data))]

# Store the final data in the csv
final_data.to_csv('train_log.csv', index=False)