### Configurations

In [11]:

# Define the path to the GTFS directory
# original_gtfs_dir = '../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018-20250626T135819Z-1-001/GTFS_Recap_-_Fall_2018' # Assuming the GTFS files are in this directory
original_gtfs_dir = '../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018' # Assuming the GTFS files are in this directory

new_gtfs_dir = '../4--Updated_GTFS_Files' # Directory where the updated GTFS files will be saved

# Copy Old GTFS to Updated GTFS Directory

In [12]:
import pandas as pd
import os
import shutil # Import the shutil module for high-level file operations

try:
    # --- Step 1: Copy the entire original GTFS directory to the new location ---
    if os.path.exists(new_gtfs_dir):
        print(f"Warning: '{new_gtfs_dir}' already exists. Deleting its contents to ensure a fresh copy.")
        shutil.rmtree(new_gtfs_dir) # Remove the directory and its contents

    print(f"Copying '{original_gtfs_dir}' to '{new_gtfs_dir}'...")
    shutil.copytree(original_gtfs_dir, new_gtfs_dir)
    print("Copy complete.")

    # Now, all subsequent operations will be on the files in new_gtfs_dir

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please ensure the 'MBTA_2018' directory exists and contains 'calendar.txt' (and optionally 'calendar_dates.txt').")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Copying '../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018' to '../4--Updated_GTFS_Files'...
Copy complete.


## Replace STOP ids in updated GTFS from MBTA_Stopid_lookup_updated.csv

In [13]:
# Define CSV paths (relative to the current directory)
matched_files_csv = os.path.join("..", "2--Stopid_Processor", "matched_files_with_stop_id_columns.csv")
lookup_csv = os.path.join("..", "2--Stopid_Processor", "MBTA_Stopid_lookup_updated.csv")

# Read the CSVs if not already loaded
matched_files_df = pd.read_csv(matched_files_csv)
print(f"Loaded '{matched_files_csv}' with {len(matched_files_df)} entries.")

lookup_df = pd.read_csv(lookup_csv)
print(f"Loaded '{lookup_csv}' with {len(lookup_df)} entries.")

# Create mapping dictionary from lookup_df using 'stop_id' and 'stop_id_update'
if 'stop_id' in lookup_df.columns and 'stop_id_update' in lookup_df.columns:
    mapping_dict = dict(zip(lookup_df['stop_id'], lookup_df['stop_id_update']))
else:
    mapping_dict = dict(zip(lookup_df.iloc[:, 1], lookup_df.iloc[:, 5]))

# Log list to accumulate summary info.
log_lines = []
log_lines.append("Stop ID Update Log\n")
log_lines.append("="*60 + "\n")

# Process each file listed in the matched files CSV.
# matched_files_df is assumed to have columns 'file_name' and 'stop_id_columns'
for _, row in matched_files_df.iterrows():
    file_name = row['file_name']
    # Split the stop id column names in case multiple are provided as comma-separated
    stop_id_columns = [col.strip() for col in row['stop_id_columns'].split(',')]
    file_path = os.path.join(new_gtfs_dir, file_name)
    
    if not os.path.exists(file_path):
        log_lines.append(f"File '{file_path}' not found. Skipping.\n")
        continue
    
    # Correct the variable name: use file_path instead of non-existent file_canpath.
    df = pd.read_csv(file_path)
    file_updated = False
    log_lines.append(f"\nProcessing file: {file_name}\n")
    for col in stop_id_columns:
        if col in df.columns:
            # Convert the values to string and update using mapping_dict
            original_col = df[col].astype(str)
            updated_col = original_col.map(mapping_dict).fillna(original_col)
            # Create a boolean mask for rows where the stop id was changed.
            mask = original_col != updated_col
            changed_count = mask.sum()
            if changed_count > 0:
                file_updated = True
                # Create a summary of unique changes.
                changes = pd.DataFrame({'old': original_col[mask], 'new': updated_col[mask]})
                unique_changes = changes.drop_duplicates().sort_values(by='old')
                log_lines.append(f"In column '{col}': {changed_count} rows changed.\n")
                log_lines.append(unique_changes.to_string(index=False) + "\n")
            else:
                log_lines.append(f"In column '{col}': no changes made.\n")
            
            # Update the dataframe column.
            df[col] = updated_col
        else:
            log_lines.append(f"Column '{col}' not found in '{file_name}'.\n")
    
    if file_updated:
        df.to_csv(file_path, index=False)
        log_lines.append(f"File '{file_name}' updated and saved successfully.\n")
    else:
        log_lines.append(f"No updates made to '{file_name}'.\n")

# Write the log file in the current directory.
log_file_path = os.path.join(".", "stopid_update_log.txt")
with open(log_file_path, "w") as f:
    f.writelines(log_lines)
print(f"Stop ID update log created at '{log_file_path}'.")


Loaded '..\2--Stopid_Processor\matched_files_with_stop_id_columns.csv' with 5 entries.
Loaded '..\2--Stopid_Processor\MBTA_Stopid_lookup_updated.csv' with 10904 entries.


  exec(code_obj, self.user_global_ns, self.user_ns)


Stop ID update log created at '.\stopid_update_log.txt'.


## Update 2018 GTFS year to match Scenario GTFS date/year

In [14]:
print(f"Attempting to prepare and update GTFS calendar files from '{original_gtfs_dir}' for '{new_gtfs_dir}'.\n")

# --- Update calendar.txt ---
# We now operate on the files in the new_gtfs_dir
calendar_path = os.path.join(new_gtfs_dir, 'calendar.txt')

if not os.path.exists(calendar_path):
    raise FileNotFoundError(f"'{calendar_path}' not found in the new directory. Copy might have failed or original is missing.")

calendar_df = pd.read_csv(calendar_path)
print(f"Loaded calendar.txt from '{new_gtfs_dir}' with {len(calendar_df)} entries.")

# Function to update the year in a date string
def update_year_to_2024(date_str):
    if pd.isna(date_str):
        return date_str
    date_str = str(date_str) # Ensure it's a string
    if len(date_str) == 8 and date_str.isdigit():
        # Assuming YYYYMMDD format
        original_year = date_str[:4]
        if original_year == '2018': # Only replace if it's 2018
            return '2024' + date_str[4:]
    return date_str

# Apply the update function to start_date and end_date columns
calendar_df['start_date'] = calendar_df['start_date'].apply(update_year_to_2024)
calendar_df['end_date'] = calendar_df['end_date'].apply(update_year_to_2024)

# Save the updated calendar.txt back to the new directory (overwriting the copied one)
calendar_df.to_csv(calendar_path, index=False)
print(f"Updated calendar.txt saved to '{calendar_path}'.")

# --- Update calendar_dates.txt ---
calendar_dates_path = os.path.join(new_gtfs_dir, 'calendar_dates.txt')

if os.path.exists(calendar_dates_path):
    calendar_dates_df = pd.read_csv(calendar_dates_path)
    print(f"Loaded calendar_dates.txt from '{new_gtfs_dir}' with {len(calendar_dates_df)} entries.")

    # Apply the update function to the 'date' column
    calendar_dates_df['date'] = calendar_dates_df['date'].apply(update_year_to_2024)

    # Save the updated calendar_dates.txt back to the new directory (overwriting the copied one)
    calendar_dates_df.to_csv(calendar_dates_path, index=False)
    print(f"Updated calendar_dates.txt saved to '{calendar_dates_path}'.")
else:
    print(f"'{calendar_dates_path}' not found in the new directory. Skipping update for this file (it's optional).")

print(f"\nGTFS calendar files successfully updated to 2024 and saved in the '{new_gtfs_dir}' directory.")
print("Your original GTFS files in 'MBTA_2018' remain unchanged.")

Attempting to prepare and update GTFS calendar files from '../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018' for '../4--Updated_GTFS_Files'.

Loaded calendar.txt from '../4--Updated_GTFS_Files' with 119 entries.
Updated calendar.txt saved to '../4--Updated_GTFS_Files\calendar.txt'.
Loaded calendar_dates.txt from '../4--Updated_GTFS_Files' with 618 entries.
Updated calendar_dates.txt saved to '../4--Updated_GTFS_Files\calendar_dates.txt'.

GTFS calendar files successfully updated to 2024 and saved in the '../4--Updated_GTFS_Files' directory.
Your original GTFS files in 'MBTA_2018' remain unchanged.


### Identify in stops.txt, identify and remove stops with missing Lat Long

In [15]:
import pandas as pd
import os

print(f"Attempting to remove stops with missing latitude/longitude from stops.txt in '{new_gtfs_dir}'.\n")

try:
    # Define the path to stops.txt
    stops_path = os.path.join(new_gtfs_dir, 'stops.txt')

    if not os.path.exists(stops_path):
        raise FileNotFoundError(f"'{stops_path}' not found. Please ensure the GTFS files are in the correct directory.")

    # Load stops.txt
    stops_df = pd.read_csv(stops_path)
    print(f"Loaded stops.txt with {len(stops_df)} entries.")

    # Identify rows where 'stop_lat' or 'stop_lon' are NaN (missing)
    # The .isna() method checks for NaN values.
    # We use | (OR) to find rows where EITHER lat OR lon is missing.
    stops_with_missing_latlong = stops_df[stops_df['stop_lat'].isna() | stops_df['stop_lon'].isna()]
    num_missing = len(stops_with_missing_latlong)

    if num_missing > 0:
        print(f"\nFound {num_missing} stops with missing latitude or longitude.")
        print("These stops will be removed:")
        # Displaying a sample of the stops to be removed
        print(stops_with_missing_latlong[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']].head())

        # Remove rows where 'stop_lat' or 'stop_lon' are NaN
        # The ~ (NOT) operator inverts the boolean mask, keeping rows where NEITHER is NaN.
        cleaned_stops_df = stops_df.dropna(subset=['stop_lat', 'stop_lon'])
        num_kept = len(cleaned_stops_df)

        print(f"\nRemoved {num_missing} rows. Keeping {num_kept} rows.")

        # Overwrite the original stops.txt with the cleaned DataFrame
        cleaned_stops_df.to_csv(stops_path, index=False)
        print(f"Cleaned stops.txt saved successfully, overwriting the original file at '{stops_path}'.")
    else:
        print("\nNo stops with missing latitude or longitude were found. No changes made to stops.txt.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please ensure the 'MBTA_2018_gtfs' directory exists and contains 'stops.txt'.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



Attempting to remove stops with missing latitude/longitude from stops.txt in '../4--Updated_GTFS_Files'.

Loaded stops.txt with 9237 entries.

Found 588 stops with missing latitude or longitude.
These stops will be removed:
     stop_id stop_name  stop_lat  stop_lon
7800   C1260    Andrew       NaN       NaN
7801   C1261    Andrew       NaN       NaN
7802   C1271    Andrew       NaN       NaN
7803   C1343    Andrew       NaN       NaN
7804   C1344    Andrew       NaN       NaN

Removed 588 rows. Keeping 8649 rows.
Cleaned stops.txt saved successfully, overwriting the original file at '../4--Updated_GTFS_Files\stops.txt'.
