Istanbul Public Transportation Data Analysis Code

# Cleaning GTFS data
This notebook processes the raw GTFS `stops`, `routes`, `trips`, and `stop_times` files.
It cleans text fields, validates stop coordinates, and converts arrival and departure times to seconds for analysis.
Note: This dataset is publicly available and used for data analysis purposes.

In [None]:
# Setup Libraries 

import pandas as pd

# Shared text-cleaning helper
def clean_text(x):
    """Return ``x`` as a string with surrounding whitespace removed.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged so they can still be detected by a later ``dropna``.
    Every other value is converted with ``str`` before stripping.
    """
    return x if pd.isna(x) else str(x).strip()

# Helper function to convert time fields to seconds
def time_to_seconds(t):
    """Convert an ``"H:M:S"`` time string to a number of seconds.

    Args:
        t: Text with exactly three colon-separated integer parts. The hour
            part is not capped, so values such as ``"25:10:00"`` are accepted.

    Returns:
        ``h * 3600 + m * 60 + s`` as an int, or ``None`` when ``t`` cannot be
        parsed (not a string, wrong number of parts, or a non-integer part).
    """
    try:
        h, m, s = map(int, t.split(":"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, float NaN, ...).
        # ValueError: wrong number of parts or a part that is not an integer.
        # TypeError: .split rejected the separator (e.g. bytes input).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate
        # instead of being silently turned into a missing value.
        return None
    return h * 3600 + m * 60 + s

# Reaching this line means the import and helper definitions above ran without error.
print("Setup completed.")

In [None]:
# Stops Data Preprocessing
#
# Reads stops.csv, trims text fields, drops rows without usable identifiers or
# coordinates, normalises location_type, and writes stops_cleaned.csv.

# 1) Read CSV, skipping lines the parser cannot handle
# NOTE(review): latin1 never fails to decode but cannot represent Turkish
# letters such as "ş", "ğ" or "ı" — confirm the real encoding of stops.csv,
# otherwise stop names may be mis-decoded here.
# NOTE(review): on_bad_lines="skip" discards malformed rows without reporting
# how many were lost.
df_stops = pd.read_csv("stops.csv", encoding="latin1", on_bad_lines="skip")

# 2) Clean text fields (columns that are absent from the file are skipped)
text_cols = ["stop_id", "stop_code", "stop_name", "stop_desc"]
for col in text_cols:
    if col in df_stops.columns:
        df_stops[col] = df_stops[col].apply(clean_text)

# 3) Force coordinates to numbers: a single malformed value makes pandas load
#    the whole column as text, which would break the range check below.
#    Unparseable values become NaN and are removed by the dropna in step 4.
for col in ["stop_lat", "stop_lon"]:
    df_stops[col] = pd.to_numeric(df_stops[col], errors="coerce")

# 4) Drop rows missing mandatory fields & remove duplicates
df_stops = df_stops.dropna(subset=["stop_id", "stop_name", "stop_lat", "stop_lon"])
df_stops = df_stops.drop_duplicates(subset=["stop_id"])

# 5) Standardize location_type and validate coordinates
if "location_type" in df_stops.columns:
    # Missing or unparseable values fall back to 0 before the integer cast.
    df_stops["location_type"] = (
        pd.to_numeric(df_stops["location_type"], errors="coerce")
        .fillna(0)
        .astype(int)
    )
else:
    # Column absent from the file: use the same default as for missing values
    # so the cleaned output always has the column.
    df_stops["location_type"] = 0

df_stops = df_stops[
    (df_stops["stop_lat"].between(-90, 90)) &
    (df_stops["stop_lon"].between(-180, 180))
]

# 6) Export cleaned data
df_stops.to_csv("stops_cleaned.csv", index=False)
print(f"Stops cleaning completed. Row count: {len(df_stops)}")

In [None]:
# Routes Data Preprocessing
#
# Reads routes.csv, normalises column names, removes rows without a route_id
# or route_type, de-duplicates on route_id, drops presentation-only columns,
# and writes routes_cleaned.csv.

df = pd.read_csv("routes.csv")
df.columns = df.columns.str.strip().str.lower()

# Drop rows with missing keys BEFORE casting: astype(str) turns NaN into the
# literal string "nan", which dropna can no longer recognise as missing.
df = df.dropna(subset=["route_id", "route_type"])

# Route validation
df["route_id"] = df["route_id"].astype(str)
df["route_type"] = df["route_type"].astype("category")

# Trim text columns, leaving missing values missing instead of writing "nan"
text_cols = ["route_short_name", "route_long_name", "route_desc"]
for col in text_cols:
    if col in df.columns:
        is_missing = df[col].isna()
        df[col] = df[col].where(is_missing, df[col].astype(str).str.strip())

# Remove duplicates (compared on the string form of route_id)
df = df.drop_duplicates(subset=["route_id"])

# Remove visual columns
non_analytic = ["route_url", "route_color", "route_text_color"]
df = df.drop(columns=[c for c in non_analytic if c in df.columns])

# Save
df.to_csv("routes_cleaned.csv", index=False)
print(f"Routes processed. Row count: {len(df)}")

In [None]:
# Trips Data Preprocessing
#
# Reads trips.csv, normalises column names, removes rows without a trip_id or
# route_id, de-duplicates on trip_id, drops accessibility columns, and writes
# trips_cleaned.csv.

df = pd.read_csv("trips.csv")
df.columns = df.columns.str.strip().str.lower()

# Drop invalid records BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which dropna (and the assert below) can never fire.
df = df.dropna(subset=["trip_id", "route_id"])

# Type casting (missing values, e.g. in service_id, stay missing rather than
# becoming the text "nan")
for col in ["route_id", "service_id", "trip_id"]:
    if col in df.columns:
        df[col] = df[col].where(df[col].isna(), df[col].astype(str))

if "direction_id" in df.columns:
    df["direction_id"] = df["direction_id"].astype("category")

# Remove duplicates (compared on the string form of trip_id)
df = df.drop_duplicates(subset=["trip_id"])

# Final cleanup
non_essential = ["wheelchair_accessible", "bikes_allowed"]
df = df.drop(columns=[c for c in non_essential if c in df.columns])

# Relationship validation (assert): every remaining trip must reference a route
assert df["route_id"].notnull().all(), "trips with a missing route_id remain after cleaning"

# Save
df.to_csv("trips_cleaned.csv", index=False)
print(f"Trips processed. Row count: {len(df)}")

In [None]:
# Stop Times Data Preprocessing
#
# Reads stop_times.csv, normalises column names, removes rows without a
# trip_id, stop_id or numeric stop_sequence, converts the time columns to
# seconds, de-duplicates on (trip_id, stop_sequence), and writes
# stop_times_cleaned.csv.

df = pd.read_csv("stop_times.csv")
df.columns = df.columns.str.strip().str.lower()

df["stop_sequence"] = pd.to_numeric(df["stop_sequence"], errors="coerce")

# Mandatory cleanup BEFORE casting ids: astype(str) turns NaN into the literal
# string "nan", which dropna can no longer recognise as missing.
df = df.dropna(subset=["trip_id", "stop_id", "stop_sequence"])

for col in ["trip_id", "stop_id"]:
    df[col] = df[col].astype(str)

# Time conversion and segment travel time
time_cols = ["arrival_time", "departure_time"]
for col in time_cols:
    if col in df.columns:
        # to_numeric guarantees a numeric column even when every value failed
        # to parse (an all-None result would otherwise have object dtype and
        # break the subtraction below).
        df[f"{col}_sec"] = pd.to_numeric(
            df[col].astype(str).apply(time_to_seconds), errors="coerce"
        )

# Only computed when both time columns were present in the file.
# NOTE(review): departure minus arrival on the same row is the time spent at
# that stop; confirm this is the metric meant by "segment_travel_time".
if {"arrival_time_sec", "departure_time_sec"}.issubset(df.columns):
    elapsed = df["departure_time_sec"] - df["arrival_time_sec"]
    # Negative durations are treated as invalid and stored as missing.
    df["segment_travel_time"] = elapsed.where(elapsed >= 0)

df = df.drop_duplicates(subset=["trip_id", "stop_sequence"])

# Drop original time strings
df = df.drop(columns=[c for c in time_cols if c in df.columns])

# Save
df.to_csv("stop_times_cleaned.csv", index=False)
print(f"Stop Times processed. Row count: {len(df)}")