<a href="https://colab.research.google.com/github/Ahsan97Javed/gtfs-batch-pipeline/blob/main/preprocessing_gtfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GTFS Batch Processing Pipeline — Preprocessing Microservice

## 1. Mount Google Drive & Set Paths


In [None]:
from google.colab import drive
import os
import pandas as pd

# Mount the user's Google Drive inside the Colab VM so the GTFS folders below
# are reachable as ordinary file-system paths.
drive.mount('/content/drive')

# Input folder: the GTFS .txt files that the load step reads.
validated_path = '/content/drive/My Drive/GTFS_VALIDATED'
# Output folder: where the cleaned copies are written by the save step.
cleaned_path = '/content/drive/My Drive/GTFS_CLEANED'
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(cleaned_path, exist_ok=True)


Mounted at /content/drive


## 2. Load Validated GTFS Files


In [None]:
# Columns that hold identifiers or codes rather than quantities. They must be
# read as text: pandas' default type inference would turn "0012" into 12, and
# an ID column containing blanks into floats that are later saved as "12.0".
# Any column whose name ends in "_id" is treated the same way.
TEXT_COLUMNS = {'parent_station', 'stop_code', 'route_short_name', 'trip_short_name'}


def _text_dtypes(csv_path):
    """Return a ``{column: str}`` dtype mapping for the identifier columns of a CSV.

    Only the header row is read (``nrows=0``), so this is cheap even for very
    large files. Columns that are not identifiers are left out of the mapping
    and keep pandas' normal type inference.
    """
    header = pd.read_csv(csv_path, nrows=0).columns
    return {
        col: str
        for col in header
        if str(col).strip().endswith('_id') or str(col).strip() in TEXT_COLUMNS
    }


# Sorted so that load order (and therefore the log below) is reproducible;
# os.listdir() returns entries in arbitrary order.
files = sorted(os.listdir(validated_path))

# Maps file name (e.g. 'stops.txt') -> DataFrame with that file's rows.
dfs = {}
for fname in files:
    if not fname.endswith('.txt'):
        continue
    fpath = os.path.join(validated_path, fname)
    # NOTE(review): text IDs use more memory than integer IDs; on a very large
    # stop_times.txt this may matter - confirm against the available RAM.
    df = pd.read_csv(fpath, dtype=_text_dtypes(fpath) or None)
    dfs[fname] = df
    print(f"Loaded {fname}, shape: {df.shape}")


Loaded calendar.txt, shape: (5553, 10)
Loaded feed_info.txt, shape: (1, 8)
Loaded stops.txt, shape: (677435, 6)
Loaded routes.txt, shape: (25081, 5)
Loaded calendar_dates.txt, shape: (13229, 3)
Loaded agency.txt, shape: (451, 5)
Loaded trips.txt, shape: (1618937, 3)
Loaded attributions.txt, shape: (3, 8)
Loaded stop_times.txt, shape: (32966240, 7)


## 3. Data Cleaning & Preprocessing

- Handle missing values
- Standardize column types
- Normalize timestamps/time columns
- Remove duplicates
- Ensure data consistency


In [None]:
import numpy as np

# Preprocessing per GTFS file.
#
# Layout of this cell: small column-level helpers, then one cleaner per
# standard GTFS file, then the loop that applies them to every table in `dfs`.
# The helpers modify the frame they are given in place; the loop always hands
# them a frame produced by drop_duplicates()/dropna(), never the object that
# is stored in `dfs`.


def _strip_strings(frame):
    """Trim surrounding whitespace from every string cell of `frame` (in place).

    Numeric columns are skipped because they cannot contain strings. In the
    remaining columns, non-string cells such as NaN are left untouched.
    Returns `frame` for convenience.
    """
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = frame[col].map(lambda v: v.strip() if isinstance(v, str) else v)
    return frame


def _drop_missing(frame, required, label):
    """Remove rows where any column in `required` is NaN or '' and print the count.

    `label` is the text used in the log line after "rows missing".
    Raises KeyError if a required column is absent from `frame`.
    """
    keep = pd.Series(True, index=frame.index)
    for col in required:
        keep &= frame[col].notna() & (frame[col] != "")
    removed = int((~keep).sum())
    print(f" - Removed {removed} rows missing {label}.")
    if removed == 0:
        return frame
    # .copy() detaches the result from `frame`, so the column assignments done
    # by the cleaners do not trigger SettingWithCopyWarning.
    return frame[keep].copy()


def _fill_blank(frame, columns):
    """Replace missing values with '' in each of `columns` that exists."""
    for col in columns:
        if col in frame.columns:
            frame[col] = frame[col].fillna('')


def _fill_unknown(frame, columns):
    """Replace missing or empty values with 'Unknown' in each existing column."""
    for col in columns:
        if col in frame.columns:
            frame[col] = frame[col].replace('', 'Unknown').fillna('Unknown')


def _to_int(frame, columns, default):
    """Coerce each existing column to int; unparsable or missing -> `default`.

    Going through pd.to_numeric keeps the column one clean integer dtype, so
    the saved CSV contains "1" rather than a mix of "1.0" and "0".
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(default).astype(int)


def _pad_time(value):
    """Zero-pad every ':'-separated part to two digits: '7:5:3' -> '07:05:03'.

    Hours are not range-checked, so values past midnight such as '25:10:00'
    pass through unchanged.
    """
    text = str(value)
    if text == '':
        return text
    return ':'.join(part.zfill(2) for part in text.split(':'))


def _normalize_times(frame, columns):
    """Pad the time columns to HH:MM:SS, leaving missing times missing.

    na_action='ignore' matters: converting the whole column with astype(str)
    would turn NaN into the text 'nan', which would then be written to the CSV.
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = frame[col].map(_pad_time, na_action='ignore')


def _normalize_dates(frame, columns):
    """Store dates as 8-character YYYYMMDD strings, leaving missing dates missing.

    Slicing to 8 characters also repairs dates that were read as floats
    ('20240101.0' -> '20240101').
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = frame[col].map(lambda v: str(v)[:8], na_action='ignore')


def _clean_stops(frame):
    """stops.txt: require stop_id, numeric coordinates, named stops."""
    frame = _drop_missing(frame, ['stop_id'], 'required stop_id')
    # Invalid coordinates become NaN instead of aborting the run.
    for col in ('stop_lat', 'stop_lon'):
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')
    if 'stop_name' in frame.columns:
        frame['stop_name'] = frame['stop_name'].replace('', 'Unknown Stop').fillna('Unknown Stop')
    _fill_blank(frame, ['parent_station'])
    # Blank location_type means 0 (stop/platform) in GTFS, so 0 is a lossless default.
    _to_int(frame, ['location_type'], default=0)
    return frame


def _clean_routes(frame):
    """routes.txt: require route_id, integer route_type (-1 marks invalid)."""
    frame = _drop_missing(frame, ['route_id'], 'required route_id')
    _to_int(frame, ['route_type'], default=-1)
    _fill_blank(frame, ['route_short_name', 'agency_id', 'route_long_name'])
    return frame


def _clean_stop_times(frame):
    """stop_times.txt: require trip_id and stop_id, padded times, integer enums."""
    frame = _drop_missing(frame, ['trip_id', 'stop_id'], 'trip_id or stop_id')
    _normalize_times(frame, ['arrival_time', 'departure_time'])
    # Blank pickup/drop-off type means 0 (regularly scheduled) in GTFS.
    _to_int(frame, ['pickup_type', 'drop_off_type'], default=0)
    return frame


def _clean_trips(frame):
    """trips.txt: require trip_id, blank out missing route_id/service_id."""
    frame = _drop_missing(frame, ['trip_id'], 'required trip_id')
    _fill_blank(frame, ['route_id', 'service_id'])
    return frame


def _clean_agency(frame):
    """agency.txt: require agency_id, blank out missing descriptive fields."""
    frame = _drop_missing(frame, ['agency_id'], 'required agency_id')
    _fill_blank(frame, ['agency_name', 'agency_url', 'agency_timezone', 'agency_lang'])
    return frame


def _clean_calendar(frame):
    """calendar.txt: require service_id, 0/1 weekday flags, YYYYMMDD dates."""
    frame = _drop_missing(frame, ['service_id'], 'required service_id')
    _to_int(
        frame,
        ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'],
        default=0,
    )
    _normalize_dates(frame, ['start_date', 'end_date'])
    return frame


def _clean_calendar_dates(frame):
    """calendar_dates.txt: require service_id and date, integer exception_type."""
    frame = _drop_missing(frame, ['service_id', 'date'], 'required service_id or date')
    # NOTE(review): an unparsable exception_type is treated as 1 (kept from the
    # original logic) - confirm that this default is acceptable downstream.
    _to_int(frame, ['exception_type'], default=1)
    _normalize_dates(frame, ['date'])
    return frame


def _clean_feed_info(frame):
    """feed_info.txt: 'Unknown' publisher fields, YYYYMMDD dates, rest blank."""
    _fill_unknown(frame, ['feed_publisher_name', 'feed_publisher_url'])
    _normalize_dates(frame, ['feed_start_date', 'feed_end_date'])
    return frame.fillna('')


def _clean_attributions(frame):
    """attributions.txt: 'Unknown' organization, 0/1 role flags, rest blank."""
    _fill_unknown(frame, ['organization_name'])
    # Blank role flag means 0 (organization does not have the role) in GTFS.
    _to_int(frame, ['is_producer', 'is_operator', 'is_authority'], default=0)
    return frame.fillna('')


# File name -> cleaner. Files not listed here get the generic treatment below.
_CLEANERS = {
    'stops.txt': _clean_stops,
    'routes.txt': _clean_routes,
    'stop_times.txt': _clean_stop_times,
    'trips.txt': _clean_trips,
    'agency.txt': _clean_agency,
    'calendar.txt': _clean_calendar,
    'calendar_dates.txt': _clean_calendar_dates,
    'feed_info.txt': _clean_feed_info,
    'attributions.txt': _clean_attributions,
}

# Iterate over a snapshot because the loop reassigns entries of `dfs`.
for fname, df in list(dfs.items()):
    print(f"\nCleaning {fname}:")

    # Remove exact duplicates
    before = df.shape[0]
    df = df.drop_duplicates()
    print(f" - Dropped {before - df.shape[0]} duplicate rows.")

    # Remove empty rows (every column NA)
    before = df.shape[0]
    df = df.dropna(how='all')
    print(f" - Dropped {before - df.shape[0]} fully empty rows.")

    # Strip whitespace from all string cells
    df = _strip_strings(df)

    # File-specific cleaning, or the generic fallback for extension files
    cleaner = _CLEANERS.get(fname)
    if cleaner is not None:
        df = cleaner(df)
    else:
        df = df.fillna('')
        print(" - Cleaned generic GTFS extension file.")

    dfs[fname] = df


Cleaning calendar.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.
 - Removed 0 rows missing required service_id.

Cleaning feed_info.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.

Cleaning stops.txt:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


 - Removed 0 rows missing required stop_id.

Cleaning routes.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.
 - Removed 0 rows missing required route_id.

Cleaning calendar_dates.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.
 - Removed 0 rows missing required service_id or date.

Cleaning agency.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.
 - Removed 0 rows missing required agency_id.

Cleaning trips.txt:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


 - Removed 0 rows missing required trip_id.

Cleaning attributions.txt:
 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.

Cleaning stop_times.txt:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


 - Dropped 0 duplicate rows.
 - Dropped 0 fully empty rows.
 - Removed 0 rows missing trip_id or stop_id.


## 4. Save Cleaned Data

In [None]:
# Write every cleaned table back out under its original file name,
# without the pandas row index (GTFS files have no index column).
for fname in dfs:
    target = os.path.join(cleaned_path, fname)
    dfs[fname].to_csv(target, index=False)
print(f"All cleaned files saved to {cleaned_path}")


All cleaned files saved to /content/drive/My Drive/GTFS_CLEANED
