## Importing libraries

In [22]:
import os
import pandas as pd
import zipfile


## Defining the functions that load the bus datasets

In [23]:
# Relative path of the folder that holds every input file used below.
data_folder = "data"


# 1. Bus routes (plain CSV)
def load_bus_routes_data():
    """Read ``bus_routes_data.csv`` from ``data_folder``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after printing an
        error message) when the file does not exist.
    """
    csv_path = os.path.join(data_folder, "bus_routes_data.csv")

    # Guard clause: report the missing file and stop here.
    if not os.path.exists(csv_path):
        print(f"Error: {csv_path} not found.")
        return None

    routes = pd.read_csv(csv_path)
    print("Bus Routes Data Loaded:")
    return routes

# 2. Bus stops (plain CSV)
def load_bus_stops_data():
    """Read ``bus_stops_data.csv`` from ``data_folder``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after printing an
        error message) when the file does not exist.
    """
    csv_path = os.path.join(data_folder, "bus_stops_data.csv")

    # Guard clause: report the missing file and stop here.
    if not os.path.exists(csv_path):
        print(f"Error: {csv_path} not found.")
        return None

    stops = pd.read_csv(csv_path)
    print("Bus Stops Data Loaded:")
    return stops

# 3. Load Passenger Volume by Bus Stops data from ZIP file
def load_passenger_volume_bus_stops():
    """Load the passenger-volume-by-bus-stop CSV stored inside its ZIP archive.

    Looks for ``transport_node_bus_202408.zip`` in ``data_folder`` and reads
    the member ``transport_node_bus_202408.csv`` straight from the archive,
    without extracting it to disk.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after printing an error
        message) when the archive is missing, is not a valid ZIP file, or
        does not contain the expected CSV member.
    """
    zip_path = os.path.join(data_folder, "transport_node_bus_202408.zip")
    csv_file_name = "transport_node_bus_202408.csv"

    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found.")
        return None

    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            # ZipFile.open raises KeyError for an unknown member; check first
            # so this case is reported like any other missing input.
            if csv_file_name not in z.namelist():
                print(f"Error: {csv_file_name} not found inside {zip_path}.")
                return None
            with z.open(csv_file_name) as csv_file:
                passenger_volume_df = pd.read_csv(csv_file)
    except zipfile.BadZipFile:
        # A truncated or corrupt download should not surface as a raw traceback.
        print(f"Error: {zip_path} is not a valid ZIP archive.")
        return None

    print("Passenger Volume by Bus Stops Data Loaded:")
    return passenger_volume_df

# 4. Load Origin-Destination Bus Stops data from ZIP file
def load_od_volume_bus_stops():
    """Load the origin-destination bus-stop CSV stored inside its ZIP archive.

    Looks for ``origin_destination_bus_202408.zip`` in ``data_folder`` and
    reads the member ``origin_destination_bus_202408.csv`` straight from the
    archive, without extracting it to disk.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after printing an error
        message) when the archive is missing, is not a valid ZIP file, or
        does not contain the expected CSV member.
    """
    zip_path = os.path.join(data_folder, "origin_destination_bus_202408.zip")
    csv_file_name = "origin_destination_bus_202408.csv"

    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found.")
        return None

    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            # ZipFile.open raises KeyError for an unknown member; check first
            # so this case is reported like any other missing input.
            if csv_file_name not in z.namelist():
                print(f"Error: {csv_file_name} not found inside {zip_path}.")
                return None
            with z.open(csv_file_name) as csv_file:
                od_volume_df = pd.read_csv(csv_file)
    except zipfile.BadZipFile:
        # A truncated or corrupt download should not surface as a raw traceback.
        print(f"Error: {zip_path} is not a valid ZIP archive.")
        return None

    print("Origin-Destination Bus Stops Data Loaded:")
    return od_volume_df


## Load the data by calling the loader functions

In [24]:
print("Loading data files from the 'data' folder...\n")

# Run each loader once. Every call returns a DataFrame, or None when the
# loader could not find its input.
bus_routes_df = load_bus_routes_data()
bus_stops_df = load_bus_stops_data()
passenger_volume_df = load_passenger_volume_bus_stops()
od_volume_df = load_od_volume_bus_stops()

# Report the (rows, columns) shape of every dataset that actually loaded.
for label, frame in (
    ("Bus Routes", bus_routes_df),
    ("Bus Stops", bus_stops_df),
    ("Passenger Volume", passenger_volume_df),
    ("OD Bus Stops", od_volume_df),
):
    if frame is not None:
        print(f"{label} DataFrame shape: {frame.shape}")

Loading data files from the 'data' folder...

Bus Routes Data Loaded:
Bus Stops Data Loaded:
Passenger Volume by Bus Stops Data Loaded:
Origin-Destination Bus Stops Data Loaded:
Bus Routes DataFrame shape: (25500, 12)
Bus Stops DataFrame shape: (5137, 5)
Passenger Volume DataFrame shape: (199800, 7)
OD Bus Stops DataFrame shape: (5760081, 7)


## Examine the head of the data

In [25]:
print("\nData Samples:")

# --- Bus routes ---
print("Bus Routes Data:")
print(bus_routes_df.head())
# NOTE: the routes table has one row per (ServiceNo, Direction, StopSequence),
# so its row count is the number of route-stop records, not the number of
# routes. The variable name and value are kept unchanged in case other cells
# read `total_bus_routes`; only the printed label is corrected.
total_bus_routes = bus_routes_df.shape[0]
print(f"Total Bus Route Records (rows): {total_bus_routes}")
# Number of distinct service numbers that appear in the routes table.
total_bus_services = bus_routes_df["ServiceNo"].nunique()
print(f"Distinct Bus Services: {total_bus_services}")

# --- Bus stops ---
print("Bus Stops Data:")
print(bus_stops_df.head())
total_bus_stops = bus_stops_df.shape[0]
print(f"Total Bus Stops: {total_bus_stops}")

# --- Passenger volume per bus stop ---
print("Passenger Volume by Bus Stops Data:")
print(passenger_volume_df.head())

# --- Origin-destination volumes ---
print("Origin-Destination Bus Stops Data:")
print(od_volume_df.head())


Data Samples:
Bus Routes Data:
  ServiceNo Operator  Direction  StopSequence  BusStopCode  Distance  \
0        10     SBST          1             1        75009       0.0   
1        10     SBST          1             2        76059       0.6   
2        10     SBST          1             3        76069       1.1   
3        10     SBST          1             4        96289       2.3   
4        10     SBST          1             5        96109       2.7   

  WD_FirstBus WD_LastBus SAT_FirstBus SAT_LastBus SUN_FirstBus SUN_LastBus  
0        0500       2300         0500        2300         0500        2300  
1        0502       2302         0502        2302         0502        2302  
2        0504       2304         0504        2304         0503        2304  
3        0508       2308         0508        2309         0507        2308  
4        0509       2310         0509        2311         0508        2309  
Total Bus Routes: 25500
Bus Stops Data:
   BusStopCode       RoadName    

## Tagging bus stops
We noticed that bus stops whose description contains the word "Stn" are usually linked to an MRT station, so we flag those stops with an `is_mrt` column and count them.

In [29]:
# Flag stops whose Description mentions "stn" (case-insensitive).
# na=False treats a missing Description as "not MRT"; without it the result
# holds NaN for those rows and the column is no longer strictly boolean.
bus_stops_df["is_mrt"] = bus_stops_df["Description"].str.contains("stn", case=False, na=False)
count_mrt = bus_stops_df["is_mrt"].sum()
print(f"Number of Bus Stops linked to MRT stations: {count_mrt}")

# Take the denominator from the frame being tagged rather than from a
# variable set in an earlier display cell, and avoid dividing by zero when
# the frame is empty.
tagged_stop_count = len(bus_stops_df)
percentage_mrt = count_mrt / tagged_stop_count * 100 if tagged_stop_count else 0.0
print(f"Percentage of Bus Stops linked to MRT stations: {percentage_mrt:.2f}%")

Number of Bus Stops linked to MRT stations: 8
Percentage of Bus Stops linked to MRT stations: 0.16%
