In [16]:
from gtfslite import GTFS
import pandas as pd
from os import listdir

# Conclusion

The files for: 
- Buses EMT
- Metro
- Metro Ligero

are workable and correct.  
The files for:
- Buses Interurbanos
- Buses Urbanos

are each split into multiple zip files. The two downloads contain identical data: both include the interurbano and the urbano (non-EMT) routes.

The file for:
- Cercanias  
is incomplete (**fixed by getting the feed directly from Renfe**).

# Verifying each component

For the project we need GTFS data for every part of the Madrid transit network, i.e.:

- buses EMT
- buses Urbanos no-EMT
- buses Interurbanos
- Metro
- Metro Ligero
- Cercanias

## Buses EMT

In [None]:
# Read the EMT bus feed, then show its summary as the cell output.
busesEMT = GTFS.load_zip("./busesEMT_9may2025")
busesEMT.summary()

agencies            [Empresa Municipal de Transportes]
total_stops                                       4883
total_routes                                       238
total_trips                                      76150
total_stops_made                               1930926
first_date                                    20250726
last_date                                     20251231
total_shapes                                    111133
dtype: object

## Buses Urbanos and Interurbanos

The zip files downloaded from CRTM for each of these are identical. Each one contains many inner zip files, and every inner zip is a separate GTFS feed covering distinct urbano (non-EMT) and interurbano bus routes.

In [18]:
# Directory holding one GTFS zip per group of urbano (non-EMT) / interurbano routes.
interurbanos_urbanos_dir = './busesInterurbanos_Urbanos_all_16july2025/'

# os.listdir returns entries in arbitrary order; sorting makes the order of the
# loaded feeds (and therefore any positional indexing into this list) identical
# on every run and on every machine.
busesInterurbanosYUrbanos = [
    GTFS.load_zip(interurbanos_urbanos_dir + zip_name)
    for zip_name in sorted(listdir(interurbanos_urbanos_dir))
]
busesInterurbanosYUrbanos

[<gtfslite.gtfs.GTFS at 0x7fca1cd43680>,
 <gtfslite.gtfs.GTFS at 0x7fc9fb80fcb0>,
 <gtfslite.gtfs.GTFS at 0x7fc9fb8f9490>,
 <gtfslite.gtfs.GTFS at 0x7fc9fb18cce0>,
 <gtfslite.gtfs.GTFS at 0x7fc9fab9b5f0>,
 <gtfslite.gtfs.GTFS at 0x7fca008e77d0>,
 <gtfslite.gtfs.GTFS at 0x7fc9fa76f770>,
 <gtfslite.gtfs.GTFS at 0x7fc9fa5f8680>,
 <gtfslite.gtfs.GTFS at 0x7fc9f20d77d0>,
 <gtfslite.gtfs.GTFS at 0x7fc9f14f5a30>,
 <gtfslite.gtfs.GTFS at 0x7fc9f129e810>,
 <gtfslite.gtfs.GTFS at 0x7fc9f0c2ffb0>,
 <gtfslite.gtfs.GTFS at 0x7fc9f0c48b90>,
 <gtfslite.gtfs.GTFS at 0x7fc9f0b97b60>,
 <gtfslite.gtfs.GTFS at 0x7fc9f060d0d0>,
 <gtfslite.gtfs.GTFS at 0x7fc9eefe39e0>,
 <gtfslite.gtfs.GTFS at 0x7fc9ee5e4800>,
 <gtfslite.gtfs.GTFS at 0x7fc9ee21bf50>,
 <gtfslite.gtfs.GTFS at 0x7fc9edf9fda0>,
 <gtfslite.gtfs.GTFS at 0x7fc9ede2cef0>,
 <gtfslite.gtfs.GTFS at 0x7fc9ed2fbf50>,
 <gtfslite.gtfs.GTFS at 0x7fc9ecb5be30>,
 <gtfslite.gtfs.GTFS at 0x7fc9edf9d6d0>]

## Cercanias

In [None]:
# Read the Cercanias feed obtained from Renfe, then show its summary.
cercanias = GTFS.load_zip("./cercanias_23july2025_fromRenfe")
cercanias.summary()

agencies            [Renfe Cercanias]
total_stops                      1147
total_routes                      477
total_trips                    122303
total_stops_made              1730491
first_date                   20250722
last_date                    20250821
dtype: object

To read the Cercanias feed from Renfe, the change indicated below must be made to the `gtfs.py` file in gtfs-lite. Without it, loading the calendar file raises an exception because `.str` is used on a non-string column.

In [None]:
def _load_clean_feed(filepath, optional=False, dtype=None, **pandas_kwargs):
    """Load a feed cleanly by stripping column names.

    Loads a feed. If the feed is empty (produces an empty dataframe) and the
    item is optional, a None is returned, otherwise an error is raised.

    Keyword arguments can be passd also to make parsing easier.

    Parameters
    ----------
    filepath : str
        path to the file

    Returns
    -------
    pd.DataFrame or None
        A dataframe that is loaded.
    """
    try:
        df = pd.read_csv(filepath, dtype=dtype, **pandas_kwargs)
        df.columns = df.columns.str.strip()
        if df.empty:
            if optional:
                return None
            else:
                raise pd.errors.EmptyDataError("This file is empty")
        # Strip all column whitespace on load
        if dtype is not None:
            for c in df.columns:
                try:
                    if dtype[c] is str:
                        df[c] = df[c].astype(str).str.strip() # THIS CHANGE RIGHT HERE !!!!!!
                except KeyError:
                    pass
        return df
    except pd.errors.EmptyDataError:
        if optional:
            return None
        else:
            raise

# Try the patched loader on the unzipped Cercanias calendar file: the service id
# and both dates are requested as strings, the seven weekday flags as integers.
_load_clean_feed(
    "./cercanias_unzipped/calendar.txt",
    dtype={
        "service_id": str,
        **dict.fromkeys(
            (
                "monday",
                "tuesday",
                "wednesday",
                "thursday",
                "friday",
                "saturday",
                "sunday",
            ),
            int,
        ),
        "start_date": str,
        "end_date": str,
    },
)

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,PR32M,0,1,0,0,0,0,0,20250722,20250722
1,PR33X,0,0,1,0,0,0,0,20250723,20250723
2,PR34J,0,0,0,1,0,0,0,20250724,20250724
3,PR35V,0,0,0,0,1,0,0,20250725,20250725
4,PR36S,0,0,0,0,0,1,0,20250726,20250726
...,...,...,...,...,...,...,...,...,...,...
398,7058D,0,0,0,0,0,0,1,20250817,20250817
399,7059L,1,0,0,0,0,0,0,20250818,20250818
400,7060M,0,1,0,0,0,0,0,20250819,20250819
401,7061X,0,0,1,0,0,0,0,20250820,20250820


## Metro

In [7]:
# Read the Metro feed, then show its summary as the cell output.
metro = GTFS.load_zip("./metro_30may2025")
metro.summary()

agencies            [Consorcio Regional de Transportes de Madrid]
total_stops                                                  1050
total_routes                                                   13
total_trips                                                   120
total_stops_made                                             2216
first_date                                               20250101
last_date                                                20260527
total_shapes                                                57474
dtype: object

## Metro ligero

In [8]:
# Read the Metro Ligero feed, then show its summary as the cell output.
metroLigero = GTFS.load_zip("./metroLigero_26feb2025")
metroLigero.summary()

agencies            [Consorcio Regional de Transportes de Madrid]
total_stops                                                    73
total_routes                                                    4
total_trips                                                  3001
total_stops_made                                            38983
first_date                                               20250101
last_date                                                20260220
total_shapes                                                 4342
dtype: object