# Data Analysis of the Indian Railway Network (IRN)

In [98]:
import typing

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so that NumPy-based random
# draws are repeatable from one run of the notebook to the next.
np.random.seed(7)

## Preprocessing the table

In [118]:
# Relative path to the train details CSV.
PATH = "../data/Train_details_22122017.csv"
# Load the table, taking the column names from the first line of the file.
# NOTE(review): the output recorded under this cell looks like the tail of a
# warning raised by this call — confirm which warning it is and, if it is a
# mixed-dtype warning, consider passing explicit dtypes here.
df = pd.read_csv(PATH, header=0)

  df = pd.read_csv(PATH, header=0)


In [120]:
# Count the rows that contain at least one missing value. Summing the boolean
# row mask yields 0 when the table has no NA at all, instead of failing on a
# missing `True` key.
print("Rows with at least one NA value:", df.isna().any(axis=1).sum())

Rows with at least one NA value: 10


In [121]:
# Only a handful of rows contain missing values, so discard them outright.
# Dropping rows leaves gaps in the index (e.g. removing row 2 gives
# 0, 1, 3, 4, ...), so rebuild it as a contiguous 0..n-1 range.
df = df.dropna(how="any").reset_index(drop=True)

In [122]:
# Preview the first five rows to check the table after dropping the NA rows.
df.head()

Unnamed: 0,Train No,Train Name,SEQ,Station Code,Station Name,Arrival time,Departure Time,Distance,Source Station,Source Station Name,Destination Station,Destination Station Name
0,107,SWV-MAO-VLNK,1,SWV,SAWANTWADI R,0:00:00,10:25:00,0,SWV,SAWANTWADI ROAD,MAO,MADGOAN JN.
1,107,SWV-MAO-VLNK,2,THVM,THIVIM,11:06:00,11:08:00,32,SWV,SAWANTWADI ROAD,MAO,MADGOAN JN.
2,107,SWV-MAO-VLNK,3,KRMI,KARMALI,11:28:00,11:30:00,49,SWV,SAWANTWADI ROAD,MAO,MADGOAN JN.
3,107,SWV-MAO-VLNK,4,MAO,MADGOAN JN.,12:10:00,0:00:00,78,SWV,SAWANTWADI ROAD,MAO,MADGOAN JN.
4,108,VLNK-MAO-SWV,1,MAO,MADGOAN JN.,0:00:00,20:30:00,0,MAO,MADGOAN JN.,SWV,SAWANTWADI ROAD


In [123]:
# Cast the whole-number columns to the int datatype.
df = df.astype({name: int for name in ("Train No", "SEQ", "Distance")})
# Parse the "H:M:S" strings of the time columns and keep only the
# time-of-day component.
df = df.assign(**{
    name: pd.to_datetime(df[name], format="%H:%M:%S").dt.time
    for name in ("Arrival time", "Departure Time")
})

## Extract graph

In [124]:
def generate_graph(dataframe: pd.DataFrame, distance_weighted: bool = False) -> nx.DiGraph:
    """Build a directed graph of stations from a train timetable.

    Two consecutive rows that share the same "Train No" are treated as
    consecutive stops of one train, and a directed edge is added from the
    earlier stop to the later one. Rows are taken in the order they appear;
    the "SEQ" column is not consulted, so the table is assumed to already be
    ordered by stop within each train.

    Args:
        dataframe: Table with "Station Code", "Train No" and "Distance"
            columns. "Distance" is read as the cumulative distance from the
            train's first stop (in the table preview it runs 0, 32, 49, 78
            along a single train).
        distance_weighted: If True, an edge's weight is the distance between
            the two stops, i.e. the difference of their cumulative
            distances; when the same edge is met again the first weight is
            kept. If False, the weight counts how many times the edge occurs
            in the table (#trains).

    Returns:
        A directed graph whose nodes are station codes and whose edges carry
        a "weight" attribute as described above.
    """
    # Create an empty directed graph.
    graph = nx.DiGraph()

    # Work on plain arrays so that rows are addressed by position. Indexing
    # a Series with [i] is label-based and would fail (or pick the wrong row)
    # if the frame's index is not a contiguous 0..n-1 range.
    stations = dataframe["Station Code"].to_numpy()
    train_numbers = dataframe["Train No"].to_numpy()
    distances = dataframe["Distance"].to_numpy()

    # Walk over consecutive row pairs; the first row has no predecessor.
    for i in range(1, len(stations)):
        # A change of train number means the two rows are not consecutive
        # stops of the same train, so they do not form an edge.
        if train_numbers[i] != train_numbers[i - 1]:
            continue

        previous_station = stations[i - 1]
        current_station = stations[i]

        if distance_weighted:
            # "Distance" is cumulative along the train's route, so the length
            # of this hop is the difference between the two readings.
            edge_weight = distances[i] - distances[i - 1]
        else:
            # Each occurrence contributes one train to the edge.
            edge_weight = 1

        if not graph.has_edge(previous_station, current_station):
            # First time this edge is seen: create it with its weight.
            graph.add_edge(previous_station, current_station, weight=edge_weight)
        elif not distance_weighted:
            # Edge already present: count one more train. Distance weights
            # are left at the first value seen.
            graph.edges[previous_station, current_station]["weight"] += edge_weight
    return graph

In [128]:
# Build the graph whose edge weights count how many times each
# station-to-station edge occurs (distance_weighted=False), then print it.
irn = generate_graph(dataframe=df, distance_weighted=False)
print(irn)

DiGraph with 8147 nodes and 28189 edges


In [129]:
# Build the graph again from the same table, this time with edge weights
# derived from the "Distance" column (distance_weighted=True), then print it.
irn_dist = generate_graph(dataframe=df, distance_weighted=True)
print(irn_dist)

DiGraph with 8147 nodes and 28189 edges


## Exploratory analysis