In [3]:
import duckdb
import pandas as pd
import requests
import json
from datetime import datetime, timedelta
import os
from pathlib import Path
import geopandas as gpd
import xml.etree.ElementTree as ET
import re


In [4]:
tree = ET.parse('RSS.xml')
root = tree.getroot()

print(tree)
items = []
for item in root.findall('./channel/item'):
    title = item.find('title').text.strip() if item.find('title') is not None else ""
    link = item.find('link').text.strip() if item.find('link') is not None else ""
    pub_date_raw = item.find('pubDate').text.strip() if item.find('pubDate') is not None else ""
    
    # Convertir fecha de publicación
    try:
        pub_date = datetime.strptime(pub_date_raw, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        pub_date = None

    filename = link.split('/')[-1]
    lower_link = link.lower()
    lower_filename = filename.lower()

    # --- Lógica de Extracción de Metadatos ---

    # A) Categoría Principal (estudios_completos, estudios_basicos, etc.)
    main_category = "Otros"
    if "estudios_completos" in lower_link:
        main_category = "Estudios Completos"
    elif "estudios_basicos" in lower_link:
        main_category = "Estudios Basicos"
    elif "estudios_rutas" in lower_link:
        main_category = "Estudios de Rutas"
    elif "zonificacion" in lower_link:
        main_category = "Zonificacion"

    # B) Tipo de Estudio (viajes, etapas, etc.)
    study_type = "Desconocido"
    if "viajes" in lower_filename:
        study_type = "Viajes"
    elif "etapas" in lower_filename:
        if "_c" in lower_filename or "carretera" in lower_link:
            study_type = "Etapas (Carretera)"
        else:
            study_type = "Etapas"
    elif "pernoctaciones" in lower_filename:
        study_type = "Pernoctaciones"
    elif "personas" in lower_filename:
        study_type = "Personas"
    elif "frecuencia" in lower_link or "frecuencia" in lower_filename:
        study_type = "Frecuencia"
    elif "calidad" in lower_link or "descartados" in lower_filename:
        study_type = "Calidad"
    elif "od_rutas" in lower_filename:
        study_type = "Matriz OD Rutas"
    elif "relaciones_tramos" in lower_filename:
        study_type = "Relaciones Tramos-Rutas"
    elif "tramos_info" in lower_filename:
        study_type = "Info Tramos OD"
    elif "zonificacion" in lower_link:
        study_type = "Geometria/Zonificacion"
    elif "agregados" in lower_filename:
        study_type = "Datos Agregados"

    # C) Zona (municipios, distritos, GAU)
    zone_type = "N/A" # Por defecto
    if "municipios" in lower_link or "municipios" in lower_filename:
        zone_type = "Municipios"
    elif "distritos" in lower_link or "distritos" in lower_filename:
        zone_type = "Distritos"
    elif "gau" in lower_link or "gau" in lower_filename:
        zone_type = "GAU"
    elif "rutas" in lower_link:
        zone_type = "Rutas"
    
    # D) Extracción de Fecha (YYYY, MM, DD) desde el nombre del archivo
    # Buscamos patrones como 20240101 (diario) o 202401 (mensual)
    year, month, day = None, None, None
    
    # Regex para YYYYMMDD
    date_match_daily = re.search(r'(\d{4})(\d{2})(\d{2})', filename)
    # Regex para YYYYMM (archivos mensuales o tar)
    date_match_monthly = re.search(r'(\d{4})(\d{2})', filename)
    
    if date_match_daily:
        year = int(date_match_daily.group(1))
        month = int(date_match_daily.group(2))
        day = int(date_match_daily.group(3))
    elif date_match_monthly:
        year = int(date_match_monthly.group(1))
        month = int(date_match_monthly.group(2))
        # day se queda como None para mensuales

    items.append({
        "main_category": main_category, # Nueva columna solicitada
        "study_type": study_type,
        "zone_type": zone_type,
        "year": year,
        "month": month,
        "day": day,
        "publication_date": pub_date,
        "filename": filename,
        "source_url": link
    })
df_catalog = pd.DataFrame(items)

<xml.etree.ElementTree.ElementTree object at 0x0000022D65B0EED0>


In [1]:
import duckdb
import pandas as pd
import requests
import json
from datetime import datetime, timedelta
import os
from pathlib import Path
import geopandas as gpd
import xml.etree.ElementTree as ET
import re
from pathlib import Path


In [2]:
con = duckdb.connect()
con.sql("INSTALL ducklake; LOAD ducklake;")
con.sql("INSTALL spatial; LOAD spatial;")

In [None]:
# This for Detaching from ducklake in case
con.sql(f"""
USE memory;
DETACH my_ducklake;
    """)

BinderException: Binder Error: Failed to detach database with name "my_ducklake": database not found

In [None]:
# Ataching to local duck lake
con.sql(f"""
ATTACH 'ducklake:my_ducklake.ducklake' AS my_ducklake;

USE my_ducklake;
    """)

In [5]:
con.sql("CREATE SCHEMA IF NOT EXISTS bronze")

In [None]:
# Inserting the catalog from where we will get the urls for the data
con.sql("CREATE OR REPLACE TABLE bronze.catalog AS SELECT * FROM df_catalog")

In [6]:
files_df = con.sql("""
    SELECT *
    FROM bronze.catalog 
   
""").df()
files_df

Unnamed: 0,main_category,study_type,zone_type,year,month,day,publication_date,filename,source_url
0,Estudios Completos,Viajes,,2023.0,12.0,17.0,2025-11-18 12:15:51,20231217_viajes.csv.gz,https://movilidad-opendata.mitma.es/estudios_c...
1,Estudios Completos,Viajes,,2023.0,12.0,16.0,2025-11-18 12:15:50,20231216_viajes.csv.gz,https://movilidad-opendata.mitma.es/estudios_c...
2,Estudios Completos,Viajes,,2023.0,12.0,13.0,2025-11-18 12:15:46,20231213_viajes.csv.gz,https://movilidad-opendata.mitma.es/estudios_c...
3,Estudios Completos,Viajes,,2023.0,12.0,14.0,2025-11-18 12:15:46,20231214_viajes.csv.gz,https://movilidad-opendata.mitma.es/estudios_c...
4,Estudios Completos,Viajes,,2023.0,12.0,15.0,2025-11-18 12:15:46,20231215_viajes.csv.gz,https://movilidad-opendata.mitma.es/estudios_c...
...,...,...,...,...,...,...,...,...,...
13378,Estudios Basicos,Desconocido,Distritos,,,,2022-12-20 15:07:40,,https://movilidad-opendata.mitma.es/estudios_b...
13379,Estudios Basicos,Calidad,,,,,2022-12-20 15:00:05,,https://movilidad-opendata.mitma.es/estudios_b...
13380,Estudios Basicos,Calidad,,,,,2022-12-20 14:59:42,,https://movilidad-opendata.mitma.es/estudios_b...
13381,Estudios Basicos,Desconocido,,,,,2022-12-20 14:59:32,,https://movilidad-opendata.mitma.es/estudios_b...


In [7]:
con.sql("""
    SELECT * FROM bronze.trips WHERE part_month = '2'
""")

┌──────────┬─────────┬───────────┬────────────────┬──────────┬─────────────────┬──────────────────────┬──────────────────┬───────────────────────┬───────────┬─────────┬─────────┬─────────┬─────────┬───────────────────────┬───────────┬────────────┬───────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│   date   │ period  │ id_origin │ id_destination │ distance │ activity_origin │ activity_destination │ study_origin_pos │ study_destination_pos │ residence │  rent   │   age   │   sex   │ n_trips │ trips_total_length_km │ part_year │ part_month │ zone_type │ source  │                                                             source_url                                                              │       ingestion_date       │
│ varchar  │ varchar │  varchar  │    varchar     │ varchar  │     varchar     │       varchar        │     varchar      │        varc

In [27]:
duckdb.sql(f"""
   FROM glob('my_ducklake.ducklake.files/**/*');
    FROM 'my_ducklake.ducklake.files/**/*.parquet' LIMIT 10;    
    """)

┌──────────┬──────────────────────┬───────────┬─────────┬──────────────────────┬──────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [46]:
con.sql("""
    SELECT * FROM bronze.catalog
""")

┌────────────────────┬────────────┬───────────┬────────┬────────┬────────┬─────────────────────┬────────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│   main_category    │ study_type │ zone_type │  year  │ month  │  day   │  publication_date   │                filename                │                                                                 source_url                                                                  │
│      varchar       │  varchar   │  varchar  │ double │ double │ double │    timestamp_ns     │                varchar                 │                                                                   varchar                                                                   │
├────────────────────┼────────────┼───────────┼────────┼────────┼────────┼─────────────────────┼────────────────────────────────────────┼───────────────────────

In [None]:
# TABLA viajes Bronze 
con.sql("""
    CREATE OR REPLACE TABLE bronze.trips (
        date VARCHAR,
        period VARCHAR,
        id_origin VARCHAR,
        id_destination VARCHAR,
        distance VARCHAR,
        activity_origin VARCHAR,
        activity_destination VARCHAR,
        study_origin_pos VARCHAR,
        study_destination_pos VARCHAR,
        residence VARCHAR,
        rent VARCHAR,
        age VARCHAR,
        sex VARCHAR,
        n_trips VARCHAR,
        trips_total_length_km VARCHAR,
        
        part_year VARCHAR,
        part_month VARCHAR,
        zone_type VARCHAR,
        source VARCHAR,
        source_url VARCHAR,
        ingestion_date TIMESTAMP
    );
""")
# con.sql("ALTER TABLE bronze.trips SET PARTITIONED BY (zone_type, source, part_year, part_month)")


In [None]:
# Funcion para ingerir datos viajes

def cargar_viajes_por_criterio(con, year, month, zone_type):
    # zone_type format : 'Municipio' 'Distritos' 'GAUS'

    # saca urls del catalogo
    files_df = con.sql(f"""
        SELECT source_url 
        FROM bronze.catalog 
        WHERE year = {year}
        AND month = {month}
        AND zone_type = '{zone_type}'
        AND main_category = 'Estudios Basicos'
        AND study_type = 'Viajes'
        AND filename LIKE '%.csv.gz'
    """).df()

    urls = files_df['source_url'].tolist()
    print(urls)
    #Insertar a la tabla 
    con.sql(f"""INSERT INTO bronze.trips
            SELECT 
                        -- 1. Transformación de Fecha (Corrección del VARCHAR '20230131')
                        fecha as date,
                        periodo as period,
                        origen as id_origin,
                        destino as id_destination ,
                        distancia as distance,
                        actividad_origen as activity_origin,
                        actividad_destino as activity_destination,
                        estudio_origen_posible as study_origin_pos,
                        estudio_destino_posible as study_destination_pos,
                        residencia as residence,
                        renta as rent,
                        edad as age,
                        sexo as sex,
                        viajes as n_trips,
                        viajes_km as trips_total_length_km,
                        -- 2. Metadatos (Tus columnas personalizadas)
                        {year} as part_year,
                        {month} as part_month,
                        '{zone_type}' as zone_type,  -- Valor fijo pasado por parámetro
                        'MITMA' as source,
                        filename as source_url,      -- Columna mágica generada por read_csv(..., filename=True)
                        current_timestamp as ingestion_date
                        
                    FROM read_csv(
                        {urls}, 
                        delim='|', 
                        header=True, 
                        filename=True,
                        union_by_name=True,
                        null_padding=True,
                        ignore_errors=True,
                        all_varchar=True -- Leemos todo como texto para evitar fallos de tipo antes de castear
                    )""")

In [49]:
cargar_viajes_por_criterio(con, 2023, 2, "Distritos")

['https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230228_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230227_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230226_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230225_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230224_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230223_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-02/20230222_Viajes_distritos.csv.gz', 'https://movilidad-opendata.mitma.es/estudios_basicos/

In [None]:

con.sql("""
    SELECT * FROM bronze.trips WHERE zone_type = 'Distritos' AND date = '20230101'  and period = '00' and id_origin = '38039' -- and activity_origin = 'casa' and activity_destination = 'frecuente' and id_destination = '24122_AM'
    ORDER BY n_trips DESC

""")

┌──────────┬─────────┬───────────┬────────────────┬──────────┬─────────────────┬──────────────────────┬──────────────────┬───────────────────────┬───────────┬─────────┬─────────┬─────────┬─────────┬───────────────────────┬───────────┬────────────┬───────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│   date   │ period  │ id_origin │ id_destination │ distance │ activity_origin │ activity_destination │ study_origin_pos │ study_destination_pos │ residence │  rent   │   age   │   sex   │ n_trips │ trips_total_length_km │ part_year │ part_month │ zone_type │ source  │                                                             source_url                                                              │       ingestion_date       │
│ varchar  │ varchar │  varchar  │    varchar     │ varchar  │     varchar     │       varchar        │     varchar      │        varc

In [8]:
con.sql("""
    SELECT * FROM bronze.trips WHERE part_month = '2'

""")

┌──────────┬─────────┬───────────┬────────────────┬──────────┬─────────────────┬──────────────────────┬──────────────────┬───────────────────────┬───────────┬─────────┬─────────┬─────────┬─────────┬───────────────────────┬───────────┬────────────┬───────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│   date   │ period  │ id_origin │ id_destination │ distance │ activity_origin │ activity_destination │ study_origin_pos │ study_destination_pos │ residence │  rent   │   age   │   sex   │ n_trips │ trips_total_length_km │ part_year │ part_month │ zone_type │ source  │                                                             source_url                                                              │       ingestion_date       │
│ varchar  │ varchar │  varchar  │    varchar     │ varchar  │     varchar     │       varchar        │     varchar      │        varc

In [None]:
def cargar_info_zona(con,zone_type):
    # zone_type = 'municipios','distritos','gaus'
    #Find in catalog zone type files
    zone_dic = {"municipios":"municiples","distritos":"districts","gaus":"GAUS"}
    # Crea tabla de info sobre las zonas
    con.sql(f"""
    CREATE OR REPLACE TABLE bronze.{zone_dic[zone_type]}_info (
        id_{zone_dic[zone_type]} VARCHAR,
        name_{zone_dic[zone_type]} VARCHAR,
        zone_type VARCHAR,
        source VARCHAR,
        source_url VARCHAR,
        ingestion_date TIMESTAMP,
        geometry GEOMETRY ,
        centroid GEOMETRY ,
            );
        """)
    con.sql(f"ALTER TABLE bronze.{zone_dic[zone_type]}_info SET PARTITIONED BY (zone_type)")
    
    file_info = con.sql(f"""
    SELECT source_url, filename 
    FROM bronze.catalog 
    WHERE (main_category = 'Zonificacion' OR main_category = 'Otros')
      AND (filename ILIKE '%{zone_type}%') 
      AND (filename ILIKE '%.shp' OR filename ILIKE '%.shx' OR filename ILIKE '%.dbf' OR filename ILIKE '%.prj' OR filename ILIKE '%.csv')
      --AND filename NOT ILIKE '%centroides%'
        """).df()
    print(len(file_info))
    #temp dowload data
    urls = file_info["source_url"]
    os.makedirs(f"temp_downloads/{zone_type}", exist_ok=True)
    for key, url in urls.items():
        filename = url.split('/')[-1]
        path = f"temp_downloads/{zone_type}/{filename}"
        # Descargamos solo si no existe para ahorrar tiempo
        if not os.path.exists(path):
            print(f"Descargando {filename}...")
            r = requests.get(url)
            with open(path, 'wb') as f:
                f.write(r.content)




    shp_path = f"temp_downloads/{zone_type}/zonificacion_{zone_type}.shp"
    shp_centroid_path = f"temp_downloads/{zone_type}/zonificacion_{zone_type}_centroides.shp"
    csv_path = f"temp_downloads/{zone_type}/nombres_{zone_type}.csv"
    output_parquet = f"datalake/bronze/zonificacion/{zone_dic[zone_type]}.parquet"
    os.makedirs("datalake/bronze/zonificacion", exist_ok=True)
    #Join tables into zonetype info table
    
    con.sql(f"""
            INSERT INTO bronze.{zone_dic[zone_type]}_info
                SELECT 
                    -- IDs y Nombres (Usamos el diccionario para acertar el nombre de columna)
                    CAST(t1.ID AS VARCHAR) as id_{zone_dic[zone_type]},
                    t2.name as name_{zone_dic[zone_type]},
                    
                    -- Geometría (Se guardará como binario WKB automáticamente)

                    '{zone_dic[zone_type]}' as zone_type,
                    'MITMA' as source,
                    '{urls}' as source_url,      -- Columna mágica generada por read_csv(..., filename=True)
                    current_timestamp as ingestion_date,
                    t1.geom as geometry,
                    t3.geom as centroid
                FROM st_read('{shp_path}') t1
                -- Left Join con el CSV de nombres
                LEFT JOIN read_csv('{csv_path}', delim='|', header=True, auto_detect=True,filename=True) t2 
                ON CAST(t1.ID AS VARCHAR) = CAST(t2.ID AS VARCHAR)
                LEFT JOIN st_read('{shp_centroid_path}') t3 ON CAST(t1.ID AS VARCHAR) = CAST(t3.ID AS VARCHAR)
            
           """)
      

In [19]:
cargar_info_zona(con,"gaus")
cargar_info_zona(con,"municipios")
cargar_info_zona(con,"distritos")

10
10
10


In [18]:
con.sql("DROP TABLE bronze.GAUS_info ")
con.sql("DROP TABLE bronze.municiples_info ")
con.sql("DROP TABLE bronze.districts_info ")

In [20]:
con.sql("SELECT * FROM bronze.districts_info WHERE name_districts LIKE '%Ali%'").df()

Unnamed: 0,id_districts,name_districts,zone_type,source,source_url,ingestion_date,geometry,centroid
0,301401,Alicante/Alacant distrito 01,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 157, 99, 47, 73, 135,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,301402,Alicante/Alacant distrito 02,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 233, 133, 47, 73, 215...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,301403,Alicante/Alacant distrito 03,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 97, 178, 47, 73, 172,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,301404,Alicante/Alacant distrito 04,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[5, 4, 0, 0, 0, 0, 0, 0, 125, 161, 46, 73, 108...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,301405,Alicante/Alacant distrito 05,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 243, 87, 47, 73, 182,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
5,301406,Alicante/Alacant distrito 06,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 216, 180, 46, 73, 183...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
6,301407,Alicante/Alacant distrito 07,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[2, 4, 0, 0, 0, 0, 0, 0, 217, 146, 46, 73, 205...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
7,301408,Alicante/Alacant distrito 08,districts,MITMA,0 https://movilidad-opendata.mitma.es/zonif...,2025-11-26 14:24:53.243352,"[5, 4, 0, 0, 0, 0, 0, 0, 73, 10, 44, 73, 252, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."


In [125]:
con.sql("SELECT * FROM __ducklake_metadata_my_ducklake.ducklake_data_file")

┌──────────────┬──────────┬────────────────┬──────────────┬────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────┬─────────────┬──────────────┬─────────────────┬─────────────┬──────────────┬──────────────┬────────────────┬───────────────────┬────────────┐
│ data_file_id │ table_id │ begin_snapshot │ end_snapshot │ file_order │                                                        path                                                         │ path_is_relative │ file_format │ record_count │ file_size_bytes │ footer_size │ row_id_start │ partition_id │ encryption_key │ partial_file_info │ mapping_id │
│    int64     │  int64   │     int64      │    int64     │   int64    │                                                       varchar                                                       │     boolean      │   varchar   │    int64     │      int64      │    int64    │    int64     │    int64    

In [91]:
con.sql("FROM my_ducklake.snapshots();")

┌─────────────┬───────────────────────────────┬────────────────┬─────────────────────────────────────────────────────────────────┬─────────┬────────────────┬───────────────────┐
│ snapshot_id │         snapshot_time         │ schema_version │                             changes                             │ author  │ commit_message │ commit_extra_info │
│    int64    │   timestamp with time zone    │     int64      │                     map(varchar, varchar[])                     │ varchar │    varchar     │      varchar      │
├─────────────┼───────────────────────────────┼────────────────┼─────────────────────────────────────────────────────────────────┼─────────┼────────────────┼───────────────────┤
│           0 │ 2025-11-26 00:22:11.288769+01 │              0 │ {schemas_created=[main]}                                        │ NULL    │ NULL           │ NULL              │
│           1 │ 2025-11-26 00:22:14.81608+01  │              1 │ {schemas_created=[bronze]}                   