# Data Generation and Insertion for Transport Database

This notebook replaces the old SQL insert scripts with Python scripts for generating and inserting synthetic data into the database, using the new English schema. It uses Faker and pandas for data generation and manipulation.

In [None]:
import faker
import random
import os
import unicodedata

# Initialize Faker for Colombian Spanish
fake_co = faker.Faker('es_CO')

# --- Configuration ---
output_file = "01_insert_locations.sql"

# Number of records (PDF mentioned 13 "Zonas de Operación")
num_locations = 13

# --- Generate Locations (Zonas de Operación) ---
print(f"Generating {num_locations} locations (operational zones)...")

location_names_base = [
    "Usaquén", "Chapinero", "Santa Fe", "San Cristóbal", "Usme", "Tunjuelito",
    "Bosa", "Kennedy", "Fontibón", "Engativá", "Suba", "Barrios Unidos", "Teusaquillo"
]
if len(location_names_base) > num_locations:
    location_names_base = random.sample(location_names_base, num_locations)
elif len(location_names_base) < num_locations:
    for i in range(num_locations - len(location_names_base)):
        location_names_base.append(f"Zona Operativa Adicional {i+1}")

all_records_strings = []
for i in range(num_locations):
    location_id_val = i + 1 # Python controla el ID
    
    name_val = location_names_base[i].replace("'", "''")
    description_val = f"Zona de operación principal: {name_val} en Bogotá D.C.".replace("'", "''")
    
    all_records_strings.append(
        f"({location_id_val}, '{name_val}', '{description_val}')"
    )

# --- Write to SQL file ---
try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO locations (location_id, name, description) OVERRIDING SYSTEM VALUE VALUES\n")
        
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for locations generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 13 locations (operational zones)...
SQL script for locations generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/01_insert_locations.sql


In [6]:
import random
import os

# --- Configuration ---
# Guarda el .sql en el mismo directorio que este script .py
output_file = "02_insert_concessionaires.sql"

# --- Generate Concessionaires ---
print("Generating concessionaire records...")


concessionaire_data_definitions = [
    {"id": 1, "name": "Bogotá Móvil Operación Sur BMO SUR S.A.S", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 2, "name": "Connexion Móvil S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 3, "name": "Capitalbus S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 4, "name": "SI18 Calle 80 S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 5, "name": "SI18 Norte S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 6, "name": "SI18 Suba S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 7, "name": "Somos Bogotá Usme S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"id": 8, "name": "Gmovil S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"id": 9, "name": "Consorcio Express S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"id": 10, "name": "Este Es Mi Bus S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"id": 11, "name": "ETIB S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"id": 12, "name": "Masivo Capital S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"id": 13, "name": "Organización Suma S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"id": 14, "name": "E-Somos Fontibón S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 15, "name": "Mueve Fontibón S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 16, "name": "ZMO Fontibón III S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 17, "name": "ZMO Fontíbón V S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 18, "name": "Emasivo 10 S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 19, "name": "Emasivo 16 S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 20, "name": "Operadora Distrital de Transporte La Rolita", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"id": 21, "name": "E-Somos Alimentación S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"id": 22, "name": "Gran Américas Usme S.A.S", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # Asumido UCE
    {"id": 23, "name": "Mueve Usme S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # Asumido UCE
    {"id": 24, "name": "Cable Movil de Bogota S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": False, "cable": True}, # Operador de Cable
    {"id": 25, "name": "Transportes Urbanos Integrados S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False}, # Genérico
    {"id": 26, "name": "Movilidad Estratégica del Oriente S.A.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False}, # Genérico
    {"id": 27, "name": "Conexión Capital S.P.A.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # Genérico
]

all_records_strings = []
for data in concessionaire_data_definitions:
    concessionaire_id_val = data["id"] # Python controla el ID
    name_val = data["name"].replace("'", "''")
    troncal_val = data["troncal"]
    zonal_uce_val = data["zonal_uce"]
    zonal_alim_val = data["zonal_alimentacion"]
    cable_val = data["cable"]
    
    all_records_strings.append(
        f"({concessionaire_id_val}, '{name_val}', {troncal_val}, {zonal_uce_val}, {zonal_alim_val}, {cable_val})"
    )

# --- Write to SQL file ---
try:
    with open(output_file, 'w', encoding='utf-8') as file:
        # IMPORTANTE: Añadir OVERRIDING SYSTEM VALUE aquí
        file.write("INSERT INTO concessionaires (concessionaire_id, name, operates_troncal, operates_zonal_uce, operates_zonal_alimentacion, operates_cable) OVERRIDING SYSTEM VALUE VALUES\n")
        
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for concessionaires generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating concessionaire records...
SQL script for concessionaires generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/02_insert_concessionaires.sql


In [7]:
import faker
import random
import os

fake_co = faker.Faker('es_CO')

output_file = "03_insert_depots.sql"


max_location_id = 13
max_concessionaire_id = 27

print("Generating depot records...")

depot_specs = [
    {"type": "TALLER", "count": 13, "name_prefix": "Patio Taller Principal"},
    {"type": "TRANSITORIO", "count": 32, "name_prefix": "Patio Transitorio"},
    {"type": "ELECTRICO", "count": 9, "name_prefix": "ElectroPatio"},
    {"type": "BAJAS_EMISIONES", "count": 4, "name_prefix": "Patio Eco"}
]
total_depots_to_generate = sum(spec["count"] for spec in depot_specs) # 58 depots
depot_id_counter = 0
all_records_strings = []

for spec in depot_specs:
    for i in range(spec["count"]):
        depot_id_counter += 1
        depot_id_val = depot_id_counter # Python controla el ID

        zone_name_part = fake_co.city_suffix().replace("'", "''")
        name_val = f"{spec['name_prefix']} {zone_name_part} {i+1}".replace("'", "''")
        
        address_val = f"{random.choice(['Calle', 'Carrera', 'Avenida'])} {random.randint(1,200)} # {random.randint(1,150)}-{random.randint(1,99)}, {fake_co.city().replace("'", "''")}"
        depot_type_val = spec["type"]
        
        capacity_vehicles_val = random.randint(50, 300)
        if depot_type_val == "TALLER":
            capacity_vehicles_val = random.randint(150, 500)
        elif depot_type_val == "ELECTRICO":
            capacity_vehicles_val = random.randint(80, 250)
            
        location_id_val = random.randint(1, max_location_id) if max_location_id > 0 else "NULL"
        
        concessionaire_id_val_str = "NULL"
        if max_concessionaire_id > 0 and random.random() > 0.3: # 70% chance of being assigned to a concessionaire
            concessionaire_id_val_str = str(random.randint(1, max_concessionaire_id))

        all_records_strings.append(
            f"({depot_id_val}, '{name_val}', '{address_val}', '{depot_type_val}', "
            f"{capacity_vehicles_val}, {location_id_val}, {concessionaire_id_val_str})"
        )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO depots (depot_id, name, address, depot_type, capacity_vehicles, location_id, concessionaire_id) OVERRIDING SYSTEM VALUE VALUES\n")
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for depots generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating depot records...
SQL script for depots generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/03_insert_depots.sql


In [9]:
import faker
import random
import os
from datetime import timedelta, date

fake_co = faker.Faker('es_CO')
fake_generic = faker.Faker() # For more diverse names

output_file = "04_insert_users.sql"

# Data Scaling: Original target ~2.5M, scaled by 1/100
num_users_original_target = 2500000
num_users = num_users_original_target // 100  # Approx 25,000

print(f"Generating {num_users} scaled user records...")

genders_list = ['M', 'F', 'O']
all_records_strings = []
user_id_counter = 0

# To ensure unique emails and id_numbers with Faker when generating many records
# It's better to track them or use fake.unique within the loop.
# For this scale, direct unique calls should be mostly fine.

for i in range(num_users):
    user_id_counter += 1
    user_id_val = user_id_counter # Python controla el ID

    if random.random() < 0.85:
        first_name_val = fake_co.first_name().replace("'", "''")
        last_name_val = fake_co.last_name().replace("'", "''")
    else:
        first_name_val = fake_generic.first_name().replace("'", "''")
        last_name_val = fake_generic.last_name().replace("'", "''")
        
    contact_number_val = fake_co.phone_number()
    email_val = fake_co.unique.email().replace("'", "''")
    gender_val = random.choice(genders_list)
    
    birth_date_obj = fake_co.date_of_birth(minimum_age=16, maximum_age=85)
    date_of_birth_val = birth_date_obj.strftime('%Y-%m-%d')
    
    street_type = random.choice(["Calle", "Carrera", "Avenida", "Transversal", "Diagonal"])
    street_number = random.randint(1, 200)
    part1 = random.randint(1, 150)
    part1_letter = random.choice(["", "A", "B", "C", "Bis"]) if random.random() > 0.5 else ""
    part2 = random.randint(1, 99)
    address_detail_num = f"{part1}{part1_letter} # {part2}-{random.randint(1,50)}"
    residential_address_val = f"{street_type} {street_number} {address_detail_num}, {fake_co.city().replace("'", "''")}".replace("'", "''")
    
    id_number_val = str(fake_co.unique.random_number(digits=10, fix_len=True))
    city_of_birth_val = fake_co.city().replace("'", "''")
    
    registration_date_obj = fake_co.date_between(start_date='-10y', end_date='today')
    registration_date_val = registration_date_obj.strftime('%Y-%m-%d')
    
    all_records_strings.append(
        f"({user_id_val}, '{first_name_val}', '{last_name_val}', '{contact_number_val}', '{email_val}', "
        f"'{gender_val}', '{date_of_birth_val}', '{residential_address_val}', '{id_number_val}', "
        f"'{city_of_birth_val}', '{registration_date_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO users (user_id, first_name, last_name, contact_number, email, gender, date_of_birth, residential_address, id_number, city_of_birth, registration_date) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000 # For larger datasets, batching INSERTs is good practice
        
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            # Logic for batching INSERT statements for performance if num_users was very large
            # For 25k, a single VALUES clause is fine, but this handles future scaling.
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO users (user_id, first_name, last_name, contact_number, email, gender, date_of_birth, residential_address, id_number, city_of_birth, registration_date) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for users generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 25000 scaled user records...
SQL script for users generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/04_insert_users.sql


In [10]:
# Nombre del archivo: generate_cards.py
import faker
import random
import os
from datetime import timedelta, date, datetime

fake_co = faker.Faker('es_CO')

output_file = "05_insert_cards.sql"

# IDs máximos de scripts anteriores
# De 04_insert_users.sql (asumimos 25,000 usuarios generados)
max_user_id = 2500000 // 100

# Data Scaling: Original target ~2.3-2.4M cards, with ~2M active. Scaled by 1/100.
num_total_cards_original_target = 2400000
min_active_cards_original_target = 2000000

num_total_cards = num_total_cards_original_target // 100 # Approx 24,000
min_active_cards_target = min_active_cards_original_target // 100 # Approx 20,000

print(f"Generating {num_total_cards} scaled card records (aiming for ~{min_active_cards_target} active)...")

card_statuses = ['active', 'inactive', 'blocked', 'lost']
all_records_strings = []
card_id_counter = 0
active_cards_count = 0
# Keep track of users who got a card to try and give each user at least one "active" card if possible
assigned_users_for_active_cards = set()


for i in range(num_total_cards):
    card_id_counter += 1
    card_id_val = card_id_counter # Python controla el ID

    card_number_val = str(fake_co.unique.random_number(digits=16, fix_len=True))
    
    user_id_val = None
    # Attempt to assign cards more evenly, ensuring most "active" card quota goes to unique users first
    if active_cards_count < min_active_cards_target and len(assigned_users_for_active_cards) < max_user_id :
        # Try to pick a user who hasn't received an "active" card yet
        potential_user_ids = list(set(range(1, max_user_id + 1)) - assigned_users_for_active_cards)
        if potential_user_ids:
            user_id_val = random.choice(potential_user_ids)
            # assigned_users_for_active_cards.add(user_id_val) # Add when confirmed active
        else: # All users got one attempt for active card, assign randomly
            user_id_val = random.randint(1, max_user_id)
    else: # Assign randomly if active card target met or all users have one attempt
        user_id_val = random.randint(1, max_user_id)


    acquisition_date_obj = fake_co.date_between(start_date='-8y', end_date='today')
    acquisition_date_val = acquisition_date_obj.strftime('%Y-%m-%d')
    
    status_val = 'inactive'
    if active_cards_count < min_active_cards_target:
        status_val = random.choices(card_statuses, weights=[0.90, 0.05, 0.03, 0.02], k=1)[0] # Higher chance of active
    else:
        status_val = random.choices(card_statuses, weights=[0.70, 0.15, 0.10, 0.05], k=1)[0] # Normal distribution
    
    if status_val == 'active':
        active_cards_count += 1
        if user_id_val: assigned_users_for_active_cards.add(user_id_val)

    balance_val = 0.0
    if status_val == 'active' and random.random() < 0.8:
        balance_val = round(random.uniform(1000, 50000) / 50) * 50
        
    last_used_date_val_str = "NULL"
    if status_val == 'active' and random.random() < 0.9:
        # Ensure last_used_date is after acquisition_date
        try:
            last_used_datetime_obj = fake_co.date_time_between_dates(datetime_start=datetime.combine(acquisition_date_obj, datetime.min.time()), datetime_end=datetime.now())
            last_used_date_val_str = f"'{last_used_datetime_obj.strftime('%Y-%m-%d %H:%M:%S')}'"
        except: # Fallback if acquisition_date is today
            last_used_date_val_str = f"'{datetime.combine(acquisition_date_obj, fake_co.time_object()).strftime('%Y-%m-%d %H:%M:%S')}'"

    update_date_obj = fake_co.date_between_dates(date_start=acquisition_date_obj, date_end=date.today())
    update_date_val = update_date_obj.strftime('%Y-%m-%d')

    all_records_strings.append(
        f"({card_id_val}, '{card_number_val}', {user_id_val}, '{acquisition_date_val}', '{status_val}', "
        f"{balance_val}, {last_used_date_val_str}, '{update_date_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO cards (card_id, card_number, user_id, acquisition_date, status, balance, last_used_date, update_date) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO cards (card_id, card_number, user_id, acquisition_date, status, balance, last_used_date, update_date) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for cards generated successfully: {os.path.abspath(output_file)}")
    print(f"Actual active cards generated: {active_cards_count}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 24000 scaled card records (aiming for ~20000 active)...
SQL script for cards generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/05_insert_cards.sql
Actual active cards generated: 21268


In [18]:
# Nombre del archivo: generate_recharge_points.py
import faker
import random
import os

fake_co = faker.Faker('es_CO')

output_file = "06_insert_recharge_points.sql"

# IDs máximos de scripts anteriores
max_location_id = 13 # De 01_insert_locations.sql

# --- NÚMERO FIJO DE PUNTOS DE RECARGA ---
# Usaremos un número fijo para que el script de recargas sepa el máximo ID exacto.
# Este número está dentro del rango random.randint(4800, 4900) que tenías.
FIXED_NUM_RECHARGE_POINTS = 4850
num_recharge_points = FIXED_NUM_RECHARGE_POINTS

print(f"Generating {num_recharge_points} recharge point records...")

recharge_point_operators = ["PuntoRed", "SuRed", "MoviiRed", "PagaTodo", "Station Kiosk", "Online Platform"]
all_records_strings = []
recharge_point_id_counter = 0
online_platform_created = False # Para asegurar solo una plataforma online principal

for i in range(num_recharge_points):
    recharge_point_id_counter += 1
    recharge_point_id_val = recharge_point_id_counter

    operator_val = random.choice(recharge_point_operators)
    
    name_content = ""
    address_sql_formatted = "NULL"
    latitude_sql_formatted = "NULL"
    longitude_sql_formatted = "NULL"
    location_id_sql_formatted = "NULL"

    is_online_platform_scenario = False
    if operator_val == "Online Platform":
        if not online_platform_created:
            name_content = "Plataforma de Recarga Online Principal"
            online_platform_created = True
            is_online_platform_scenario = True
        else:
            # Si ya se creó la online, forzar otro tipo de operador para este punto
            operator_val = random.choice([op for op in recharge_point_operators if op != "Online Platform"])
            # Y continuar como si fuera un punto físico (más abajo)

    if not is_online_platform_scenario: # Para todos los puntos físicos
        point_type = random.choice(["Tienda", "Papelería", "Droguería", "Miscelánea", "Kiosko Estación"])
        # Generar nombre y dirección base
        base_name_part1 = fake_co.company().split(' ')[0].replace(',', '').replace("'", "''")
        base_name_part2 = fake_co.street_name().split(' ')[-1].replace("'", "''")
        name_content = f"{point_type} {base_name_part1}-{base_name_part2} {random.randint(1,100)}"

        street_t = random.choice(["Calle", "Carrera", "Avenida", "Transversal", "Diagonal"])
        address_raw_content = f"{street_t} {random.randint(1,200)} # {random.randint(1,99)}-{random.randint(1,99)}"
        # Escapar comillas internas y añadir comillas externas para SQL
        address_sql_formatted = f"'{address_raw_content.replace("'", "''")}'"
        
        latitude_sql_formatted = str(round(random.uniform(4.40, 4.80), 6))
        longitude_sql_formatted = str(round(random.uniform(-74.20, -74.00), 6))
        if max_location_id > 0:
            location_id_sql_formatted = str(random.randint(1, max_location_id))

    # Escapar comillas simples en el nombre final
    name_val_sql_escaped = name_content.replace("'", "''")
    
    all_records_strings.append(
        f"({recharge_point_id_val}, '{name_val_sql_escaped}', {address_sql_formatted}, "
        f"{latitude_sql_formatted}, {longitude_sql_formatted}, "
        f"{location_id_sql_formatted}, '{operator_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO recharge_points (recharge_point_id, name, address, latitude, longitude, location_id, operator) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO recharge_points (recharge_point_id, name, address, latitude, longitude, location_id, operator) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for recharge_points (fixed {num_recharge_points} records) generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 4850 recharge point records...
SQL script for recharge_points (fixed 4850 records) generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/06_insert_recharge_points.sql


In [12]:
# Nombre del archivo: generate_fares.py
import os
from datetime import date

output_file = "07_insert_fares.sql"

# Basado en tarifas discutidas y el script 11_insert_fares.sql anterior
fare_data_definitions = [
    {
        "fare_id": 1,
        "fare_type": "STANDARD_SITP",
        "value": 2950.00,
        "start_date": date(2024, 1, 15).strftime('%Y-%m-%d'),
        "end_date": "NULL",
        "description": "Tarifa estándar del componente Troncal y Zonal del SITP."
    },
    {
        "fare_id": 2,
        "fare_type": "TRANSFER_0_COST",
        "value": 0.00,
        "start_date": date(2024, 1, 15).strftime('%Y-%m-%d'),
        "end_date": "NULL",
        "description": "Transbordo sin costo adicional (dentro de la ventana de tiempo y condiciones)."
    },
    {
        "fare_id": 3,
        "fare_type": "TRANSFER_200_COST",
        "value": 200.00,
        "start_date": date(2024, 1, 15).strftime('%Y-%m-%d'),
        "end_date": "NULL",
        "description": "Transbordo con costo de $200 COP (dentro de la ventana de tiempo y condiciones)."
    },
    {
        "fare_id": 4,
        "fare_type": "STANDARD_CABLE",
        "value": 2950.00,
        "start_date": date(2024, 1, 15).strftime('%Y-%m-%d'),
        "end_date": "NULL",
        "description": "Tarifa estándar para TransMiCable."
    }
]

print(f"Generating {len(fare_data_definitions)} fare records...")
all_records_strings = []

for fare_def in fare_data_definitions:
    fare_id_val = fare_def["fare_id"] # Python controla el ID
    fare_type_val = fare_def["fare_type"].replace("'", "''")
    value_val = fare_def["value"]
    start_date_val = fare_def["start_date"]
    end_date_val_str = f"'{fare_def['end_date']}'" if fare_def["end_date"] != "NULL" else "NULL"
    description_val = fare_def["description"].replace("'", "''")
    
    all_records_strings.append(
        f"({fare_id_val}, '{fare_type_val}', {value_val}, '{start_date_val}', {end_date_val_str}, '{description_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO fares (fare_id, fare_type, value, start_date, end_date, description) OVERRIDING SYSTEM VALUE VALUES\n")
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for fares generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 4 fare records...
SQL script for fares generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/07_insert_fares.sql


In [19]:
# Nombre del archivo: generate_recharges.py
import faker
import random
import os
from datetime import datetime, timedelta
import uuid

fake_co = faker.Faker('es_CO')

output_file = "08_insert_recharges.sql"

# IDs máximos de scripts anteriores
max_card_id = 2400000 // 100 # De 05_insert_cards.sql (24,000 tarjetas)
assumed_active_card_threshold_id = 2000000 // 100

# --- USAR EL MISMO NÚMERO FIJO QUE EN generate_recharge_points.py ---
FIXED_NUM_RECHARGE_POINTS = 4850
max_recharge_point_id = FIXED_NUM_RECHARGE_POINTS # Asegura que no se exceda el ID máximo real

num_recharges = random.randint(max_card_id * 2, max_card_id * 4) # Ej: entre 48k y 96k recargas

print(f"Generating {num_recharges} recharge records (max_recharge_point_id set to {max_recharge_point_id})...")

recharge_amounts_cop = [2000, 2200, 2950, 5000, 5900, 10000, 11800, 15000, 20000, 23600, 30000, 50000, 100000]
recharge_amounts_weights = [10, 5, 10, 20, 5, 25, 10, 5, 20, 5, 3, 10, 2]

all_records_strings = []
recharge_id_counter = 0

for i in range(num_recharges):
    recharge_id_counter += 1
    recharge_id_val = recharge_id_counter

    if random.random() < 0.85 and assumed_active_card_threshold_id > 0:
        card_id_val = random.randint(1, assumed_active_card_threshold_id)
    else:
        card_id_val = random.randint(1, max_card_id)

    # recharge_point_id_val ahora usará el límite correcto
    recharge_point_id_val = random.randint(1, max_recharge_point_id) 
    amount_val = random.choices(recharge_amounts_cop, weights=recharge_amounts_weights, k=1)[0]

    # Fechas de recarga contextualizadas alrededor de Junio 2024
    reference_date_for_recharge = datetime(2024, 6, 15)
    # Genera recargas en los 3 años anteriores a Junio 2024
    days_back = random.randint(1, 3 * 365) 
    hour_val = random.randint(6, 22) # Horas de recarga más comunes
    minute_val = random.randint(0, 59)
    second_val = random.randint(0, 59)
    
    try:
        recharge_datetime_obj = reference_date_for_recharge - timedelta(days=days_back)
        recharge_datetime_obj = recharge_datetime_obj.replace(hour=hour_val, minute=minute_val, second=second_val)
    except ValueError: # Manejo de fechas inválidas como 29 de febrero en año no bisiesto
        recharge_datetime_obj = reference_date_for_recharge - timedelta(days=days_back + 1) # Ajustar al día anterior
        recharge_datetime_obj = recharge_datetime_obj.replace(hour=hour_val, minute=minute_val, second=second_val)


    recharge_timestamp_val = recharge_datetime_obj.strftime('%Y-%m-%d %H:%M:%S')
    transaction_id_val = str(uuid.uuid4())

    all_records_strings.append(
        f"({recharge_id_val}, {card_id_val}, {recharge_point_id_val}, {amount_val}, '{recharge_timestamp_val}', '{transaction_id_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO recharges (recharge_id, card_id, recharge_point_id, amount, recharge_timestamp, transaction_id) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO recharges (recharge_id, card_id, recharge_point_id, amount, recharge_timestamp, transaction_id) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for recharges generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating 67646 recharge records (max_recharge_point_id set to 4850)...
SQL script for recharges generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/08_insert_recharges.sql


In [15]:
# Nombre del archivo: generate_stations.py
import faker
import random
import os

fake_co = faker.Faker('es_CO')

output_file = "09_insert_stations.sql"

# IDs máximos de scripts anteriores
max_location_id = 13 # De 01_insert_locations.sql

# --- Station Counts from PDF ---
NUM_PORTALS = 9
NUM_CABLE_STATIONS_PDF = 4
TOTAL_STATIONS_INC_CABLE_PDF = 142
NUM_TRONCAL_STATIONS_OTHER_PDF = TOTAL_STATIONS_INC_CABLE_PDF - NUM_PORTALS - NUM_CABLE_STATIONS_PDF
NUM_ZONAL_PARADEROS_PDF = 7623

NUM_STATIONS_WITH_CYCLE_PARKING_PDF = 27
TOTAL_CYCLE_PARKING_SPOTS_PDF = 7351

print(f"Generating records for {NUM_PORTALS} portals, {NUM_CABLE_STATIONS_PDF} cable, {NUM_TRONCAL_STATIONS_OTHER_PDF} other troncal stations, and {NUM_ZONAL_PARADEROS_PDF} zonal paraderos...")

portal_names_list = [
    "Portal Américas", "Portal del Norte", "Portal Suba", "Portal Calle 80",
    "Portal del Sur", "Portal Eldorado", "Portal Tunal", "Portal 20 de Julio", "Portal Usme"
]
if len(portal_names_list) > NUM_PORTALS: portal_names_list = portal_names_list[:NUM_PORTALS]
while len(portal_names_list) < NUM_PORTALS: portal_names_list.append(f"Portal Adicional {len(portal_names_list)+1}")

cable_station_names_list = ["Portal Tunal - Cable", "Juan Pablo II - Cable", "Manitas - Cable", "Mirador del Paraíso - Cable"]
if len(cable_station_names_list) > NUM_CABLE_STATIONS_PDF: cable_station_names_list = cable_station_names_list[:NUM_CABLE_STATIONS_PDF]
while len(cable_station_names_list) < NUM_CABLE_STATIONS_PDF: cable_station_names_list.append(f"Estación Cable Adicional {len(cable_station_names_list)+1}")

real_troncal_station_names = [
    "SAN MATEO - C.C. UNISUR", "Calle 100 - Marketmedios", "Banderas", "Avenida Jiménez (Centro)",
    "Toberín - Foundever", "Calle 76 - San Felipe", "TERREROS", "Calle 57 - Tecnoparque", "Alcalá - C.C. Futuro 140", "Calle 45 - American School",
    "León XIII", "Despensa", "Bosa Estación", "Universidades - CityU", "Museo Nacional", "CAD", "Paloquemao", "Ricaurte", "Sabana", "Profamilia", "Marly", "Flores",
    "Pepe Sierra", "Calle 127", "Mazurén", "Virrey", "Héroes", "Restrepo", "Santa Lucía", "Country Sur"
]
random.shuffle(real_troncal_station_names)

station_id_counter = 0
all_station_objects = [] # Lista de diccionarios
station_codes_generated = set()
stations_eligible_for_cycle_parking = []

def generate_station_code_unique(s_type, counter, zone_prefix=None):
    global station_codes_generated
    code_attempts = 0
    while code_attempts < 200:
        prefix_num_str = str(counter % 1000).zfill(3)
        if s_type == "PORTAL": code = f"P{str(counter).zfill(2)}"
        elif s_type == "CABLE": code = f"TC{str(counter).zfill(2)}"
        elif s_type.startswith("TRONCAL"):
            line_letter = chr(65 + random.randint(0, 12))
            code = f"{line_letter}{str(counter % 100).zfill(2)}"
        elif s_type == "ZONAL_PARADERO":
            p1 = zone_prefix if zone_prefix else str(random.randint(100,799))
            p2 = chr(65 + random.randint(0,25))
            p3 = str(counter % 100).zfill(2)
            code = f"{p1}{p2}{p3}"
        else: code = f"UNK{prefix_num_str}"
        
        if code not in station_codes_generated:
            station_codes_generated.add(code)
            return code
        counter += random.randint(1,10)
        code_attempts +=1
    return f"ERR{s_type}{random.randint(1000,9999)}"

# 1. Portals
for i in range(NUM_PORTALS):
    station_id_counter += 1
    name_val = portal_names_list[i].replace("'", "''")
    code_val = generate_station_code_unique("PORTAL", i + 1)
    stations_eligible_for_cycle_parking.append({"id": station_id_counter, "type": "PORTAL"})
    all_station_objects.append({
        "station_id": station_id_counter, "name": name_val, "station_code": code_val,
        "station_type": "PORTAL", "address": f"{name_val}, Bogotá D.C.".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else None,
        "latitude": round(random.uniform(4.45, 4.75), 6),
        "longitude": round(random.uniform(-74.18, -74.02), 6),
        "has_cycle_parking": False, "cycle_parking_spots": 0, "is_active": True
    })

# 2. Cable Stations
cable_station_code_counter = 0
for i in range(NUM_CABLE_STATIONS_PDF):
    station_id_counter += 1
    cable_station_code_counter +=1
    name_val = cable_station_names_list[i].replace("'", "''")
    code_val = generate_station_code_unique("CABLE", cable_station_code_counter)
    if random.random() < 0.25 : stations_eligible_for_cycle_parking.append({"id": station_id_counter, "type": "CABLE"})
    all_station_objects.append({
        "station_id": station_id_counter, "name": name_val, "station_code": code_val,
        "station_type": "CABLE", "address": f"{name_val}, Ciudad Bolívar".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else None,
        "latitude": round(random.uniform(4.50, 4.60), 6),
        "longitude": round(random.uniform(-74.17, -74.12), 6),
        "has_cycle_parking": False, "cycle_parking_spots": 0, "is_active": True
    })

# 3. Other Troncal Stations
troncal_station_code_counter = 0
for i in range(NUM_TRONCAL_STATIONS_OTHER_PDF):
    station_id_counter += 1
    troncal_station_code_counter +=1
    if i < len(real_troncal_station_names):
        name_val = real_troncal_station_names[i].replace("'", "''")
    else:
        name_val = f"Estación Troncal {fake_co.street_name().replace("'", "''")}"
    
    station_type_val = random.choice(["TRONCAL_SIMPLE", "TRONCAL_INTERMEDIA", "TRONCAL_CABECERA"])
    code_val = generate_station_code_unique(station_type_val, troncal_station_code_counter)
    if random.random() < 0.15 : stations_eligible_for_cycle_parking.append({"id": station_id_counter, "type": station_type_val})
    all_station_objects.append({
        "station_id": station_id_counter, "name": name_val, "station_code": code_val,
        "station_type": station_type_val, "address": f"Estación {name_val}, Bogotá D.C.".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else None,
        "latitude": round(random.uniform(4.55, 4.70), 6),
        "longitude": round(random.uniform(-74.12, -74.05), 6),
        "has_cycle_parking": False, "cycle_parking_spots": 0, "is_active": True
    })

# 4. Zonal Paraderos
paradero_code_internal_counter = 0
paradero_zone_prefixes = [str(random.randint(100, 799)) for _ in range(max_location_id if max_location_id > 0 else 1)]
for i in range(NUM_ZONAL_PARADEROS_PDF):
    station_id_counter += 1
    paradero_code_internal_counter +=1
    loc_id = random.randint(1, max_location_id) if max_location_id > 0 else 1
    name_val = f"Paradero {fake_co.street_name().replace("'", "''")} con {random.choice(['Kr.', 'Cl.'])} {random.randint(10,150)}"
    code_val = generate_station_code_unique("ZONAL_PARADERO", paradero_code_internal_counter, zone_prefix=paradero_zone_prefixes[loc_id-1])
    all_station_objects.append({
        "station_id": station_id_counter, "name": name_val, "station_code": code_val,
        "station_type": "ZONAL_PARADERO", "address": f"{name_val}, Bogotá D.C.".replace("'", "''"),
        "location_id": loc_id,
        "latitude": round(random.uniform(4.40, 4.80), 6),
        "longitude": round(random.uniform(-74.20, -74.00), 6),
        "has_cycle_parking": False, "cycle_parking_spots": 0, "is_active": True
    })

# --- Distribute Cycle Parking Spots ---
stations_to_receive_parking = random.sample(
    stations_eligible_for_cycle_parking, 
    min(NUM_STATIONS_WITH_CYCLE_PARKING_PDF, len(stations_eligible_for_cycle_parking))
)

if stations_to_receive_parking:
    temp_spots_assignment = {} # Store {station_id: spots}
    for station_info in stations_to_receive_parking:
        base_spots = 0
        if station_info["type"] == "PORTAL": base_spots = random.randint(150, 600)
        elif station_info["type"] == "TRONCAL_INTERMEDIA" or station_info["type"] == "TRONCAL_CABECERA": base_spots = random.randint(50, 200)
        else: base_spots = random.randint(20, 100) # CABLE or TRONCAL_SIMPLE
        temp_spots_assignment[station_info["id"]] = max(10, base_spots)

    current_assigned_total_spots = sum(temp_spots_assignment.values())
    
    if current_assigned_total_spots > 0:
        adjustment_factor = TOTAL_CYCLE_PARKING_SPOTS_PDF / current_assigned_total_spots
        final_spots_assignment = {}
        normalized_total_spots = 0
        
        ids_in_temp = list(temp_spots_assignment.keys())
        for s_id_pk in ids_in_temp:
            assigned = int(temp_spots_assignment[s_id_pk] * adjustment_factor)
            final_spots_assignment[s_id_pk] = assigned
            normalized_total_spots += assigned
        
        # Distribute remainder due to int conversion
        spots_remainder = TOTAL_CYCLE_PARKING_SPOTS_PDF - normalized_total_spots
        idx = 0
        while spots_remainder > 0 and idx < len(ids_in_temp):
            final_spots_assignment[ids_in_temp[idx]] += 1
            spots_remainder -= 1
            idx = (idx + 1) % len(ids_in_temp) # Cycle through eligible stations
        
        # Update the main list of objects
        for station_obj in all_station_objects:
            if station_obj["station_id"] in final_spots_assignment:
                station_obj["cycle_parking_spots"] = final_spots_assignment[station_obj["station_id"]]
                if station_obj["cycle_parking_spots"] > 0:
                    station_obj["has_cycle_parking"] = True
                else: # Ensure consistency if spots ended up as 0 after adjustment
                    station_obj["has_cycle_parking"] = False


# --- Write to SQL file ---
all_records_strings_for_sql = []
for record_obj in all_station_objects:
    # Handle NULL for location_id properly
    loc_id_sql = str(record_obj['location_id']) if record_obj['location_id'] is not None else "NULL"
    
    # Python booleans True/False are directly converted to TRUE/FALSE in f-string for SQL
    all_records_strings_for_sql.append(
        f"({record_obj['station_id']}, '{record_obj['name']}', '{record_obj['station_code']}', "
        f"'{record_obj['station_type']}', '{record_obj['address']}', {loc_id_sql}, "
        f"{record_obj['latitude']}, {record_obj['longitude']}, {record_obj['has_cycle_parking']}, "
        f"{record_obj['cycle_parking_spots']}, {record_obj['is_active']})"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO stations (station_id, name, station_code, station_type, address, location_id, latitude, longitude, has_cycle_parking, cycle_parking_spots, is_active) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings_for_sql):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings_for_sql) - 1:
                file.write(";\n")
                file.write("INSERT INTO stations (station_id, name, station_code, station_type, address, location_id, latitude, longitude, has_cycle_parking, cycle_parking_spots, is_active) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings_for_sql) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for stations generated successfully: {os.path.abspath(output_file)}")
    print(f"Total station records generated: {station_id_counter}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating records for 9 portals, 4 cable, 129 other troncal stations, and 7623 zonal paraderos...
SQL script for stations generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/09_insert_stations.sql
Total station records generated: 7765


In [20]:
# Nombre del archivo: generate_vehicles.py
import faker
import random
import os

fake_co = faker.Faker('es_CO')
output_file = "10_insert_vehicles.sql"

# IDs máximos de scripts anteriores
max_concessionaire_id = 27 # De 02_insert_concessionaires.sql
max_depot_id = 58          # De 03_insert_depots.sql

print("Generating vehicle records...")

vehicle_specs_list = [
    {"type": "ALIMENTADOR_50", "capacity": 50, "count": 86, "component": "ALIMENTACION"},
    {"type": "ALIMENTADOR_80", "capacity": 80, "count": 862, "component": "ALIMENTACION"},
    {"type": "ARTICULADO", "capacity": 160, "count": 602, "component": "TRONCAL"},
    {"type": "BIARTICULADO", "capacity": 250, "count": 1317, "component": "TRONCAL"},
    {"type": "PADRON_DUAL", "capacity": 80, "count": 272, "component": "TRONCAL"}, # O podría ser DUAL si se maneja como un componente separado
    {"type": "BUS_19", "capacity": 19, "count": 5, "component": "ZONAL"},
    {"type": "BUS_40", "capacity": 40, "count": 611, "component": "ZONAL"},
    {"type": "BUS_50", "capacity": 50, "count": 3511, "component": "ZONAL"},
    {"type": "BUS_80", "capacity": 80, "count": 3297, "component": "ZONAL"},
    # Podríamos añadir CABLE_CABIN si es necesario, el PDF suma 10563 sin cabinas de cable explícitamente en esa tabla.
]
total_vehicles_to_generate = sum(spec["count"] for spec in vehicle_specs_list)

# Distribución de tecnologías basada en el PDF (página 10)
tech_distribution_template = {
    "ELECTRICO": 1486, "GNV": 2144, "HIBRIDO": 348,
    "DIESEL_EURO_VI": 2382, "DIESEL_EURO_V": 4162
}
# Ajustar si la suma no da el total_vehicles_to_generate (10522 vs 10563)
tech_sum = sum(tech_distribution_template.values())
if total_vehicles_to_generate > tech_sum:
    tech_distribution_template["DIESEL_EURO_V"] += (total_vehicles_to_generate - tech_sum)
elif tech_sum > total_vehicles_to_generate: # Improbable con los datos del PDF, pero por si acaso
    # Reducir proporcionalmente o de la categoría más grande
    diff = tech_sum - total_vehicles_to_generate
    tech_distribution_template["DIESEL_EURO_V"] -= diff # Asumiendo que Diesel Euro V puede absorber la diferencia

technologies_list_flat = []
for tech, count in tech_distribution_template.items():
    technologies_list_flat.extend([tech] * count)
random.shuffle(technologies_list_flat)

# Distribución de años de modelo (aproximada y simplificada del PDF)
model_year_dist_template = {
    2023: 336, 2022: 1007, 2021: 1799, 2020: 1624, 2019: 313, 2017: 63, 2016: 160,
    2015: 1264, 2014: 333, 2013: 335, 2012: 292, 2011: 142, 2010: 234, 2009: 53, 2008: 15
}
model_years_list_flat = []
for year, count in model_year_dist_template.items():
    model_years_list_flat.extend([year] * count)

remaining_vehicles_for_year = total_vehicles_to_generate - len(model_years_list_flat)
if remaining_vehicles_for_year > 0:
    # Distribuir los restantes en un rango plausible, ej. 2010-2018 con más peso en años intermedios
    additional_years = random.choices(
        population=list(range(2010, 2019)), 
        weights=[1,1,2,2,3,3,2,2,1], # Pesos para un pico alrededor de 2014-2016
        k=remaining_vehicles_for_year
    )
    model_years_list_flat.extend(additional_years)
random.shuffle(model_years_list_flat)


all_records_strings = []
vehicle_id_counter = 0

# Asignación simplificada de concesionarios por tipo de componente de vehículo
# Debería coincidir con cómo se definieron en 02_insert_concessionaires.py
concessionaires_by_type_from_script6 = {
    "TRONCAL": [c["id"] for c in [
        {"id": 1, "troncal": True}, {"id": 2, "troncal": True}, {"id": 3, "troncal": True},
        {"id": 4, "troncal": True}, {"id": 5, "troncal": True}, {"id": 6, "troncal": True},
        {"id": 7, "troncal": True}, {"id": 8, "troncal": True}, {"id": 9, "troncal": True},
        {"id": 26, "troncal": True}
    ] if c["troncal"]],
    "ZONAL_UCE": [c["id"] for c in [
        {"id": 4, "zonal_uce": True}, {"id": 8, "zonal_uce": True}, {"id": 9, "zonal_uce": True},
        {"id": 11, "zonal_uce": True}, {"id": 12, "zonal_uce": True}, {"id": 14, "zonal_uce": True},
        {"id": 15, "zonal_uce": True}, {"id": 16, "zonal_uce": True}, {"id": 17, "zonal_uce": True},
        {"id": 18, "zonal_uce": True}, {"id": 19, "zonal_uce": True}, {"id": 20, "zonal_uce": True},
        {"id": 22, "zonal_uce": True}, {"id": 23, "zonal_uce": True}, {"id": 25, "zonal_uce": True},
        {"id": 27, "zonal_uce": True}
    ] if c["zonal_uce"]],
    "ALIMENTACION": [c["id"] for c in [
        {"id": 8, "zonal_alimentacion": True}, {"id": 9, "zonal_alimentacion": True},
        {"id": 10, "zonal_alimentacion": True}, {"id": 11, "zonal_alimentacion": True},
        {"id": 12, "zonal_alimentacion": True}, {"id": 13, "zonal_alimentacion": True},
        {"id": 21, "zonal_alimentacion": True}, {"id": 25, "zonal_alimentacion": True}
    ] if c["zonal_alimentacion"]]
}
# Asegurar que las listas no estén vacías
if not concessionaires_by_type_from_script6["TRONCAL"]: concessionaires_by_type_from_script6["TRONCAL"] = [1]
if not concessionaires_by_type_from_script6["ZONAL_UCE"]: concessionaires_by_type_from_script6["ZONAL_UCE"] = [14]
if not concessionaires_by_type_from_script6["ALIMENTACION"]: concessionaires_by_type_from_script6["ALIMENTACION"] = [10]


for spec in vehicle_specs_list:
    for _ in range(spec["count"]):
        vehicle_id_counter += 1
        vehicle_id_val = vehicle_id_counter

        license_plate_val = fake_co.unique.license_plate()
        vehicle_type_val = spec["type"]
        capacity_val = spec["capacity"]
        
        tech_val = technologies_list_flat.pop() if technologies_list_flat else "DIESEL_EURO_V"
        model_year_val = model_years_list_flat.pop() if model_years_list_flat else random.randint(2010, 2018)

        con_id_val = None
        if spec["component"] == "TRONCAL" and concessionaires_by_type_from_script6["TRONCAL"]:
            con_id_val = random.choice(concessionaires_by_type_from_script6["TRONCAL"])
        elif spec["component"] == "ZONAL" and concessionaires_by_type_from_script6["ZONAL_UCE"]:
            con_id_val = random.choice(concessionaires_by_type_from_script6["ZONAL_UCE"])
        elif spec["component"] == "ALIMENTACION" and concessionaires_by_type_from_script6["ALIMENTACION"]:
            con_id_val = random.choice(concessionaires_by_type_from_script6["ALIMENTACION"])
        else: # Fallback
            con_id_val = random.randint(1, max_concessionaire_id)
            
        status_val = "active" if random.random() < 0.95 else random.choice(["maintenance", "inactive"])
        current_depot_id_val_str = str(random.randint(1, max_depot_id)) if max_depot_id > 0 else "NULL"

        all_records_strings.append(
            f"({vehicle_id_val}, '{license_plate_val}', '{vehicle_type_val}', {capacity_val}, '{tech_val}', "
            f"{model_year_val}, {con_id_val}, '{status_val}', {current_depot_id_val_str})"
        )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO vehicles (vehicle_id, license_plate, vehicle_type, capacity, technology, model_year, concessionaire_id, status, current_depot_id) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO vehicles (vehicle_id, license_plate, vehicle_type, capacity, technology, model_year, concessionaire_id, status, current_depot_id) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for vehicles generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating vehicle records...
SQL script for vehicles generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/10_insert_vehicles.sql


In [21]:
# Nombre del archivo: generate_drivers.py
import faker
import random
import os
from datetime import timedelta, date

fake_co = faker.Faker('es_CO')
output_file = "11_insert_drivers.sql"

max_concessionaire_id = 27 # De 02_insert_concessionaires.sql

print("Generating driver records...")

driver_counts_by_component = {
    "TRONCAL": 5003,
    "ZONAL_UCE": 16544,
    "ALIMENTACION": 2899
}
total_drivers_to_generate = sum(driver_counts_by_component.values())

# Misma lógica simplificada para asignar concesionarios que en generate_vehicles
concessionaires_by_type_from_script6 = {
    "TRONCAL": [c["id"] for c in [
        {"id": 1, "troncal": True}, {"id": 2, "troncal": True}, {"id": 3, "troncal": True},
        {"id": 4, "troncal": True}, {"id": 5, "troncal": True}, {"id": 6, "troncal": True},
        {"id": 7, "troncal": True}, {"id": 8, "troncal": True}, {"id": 9, "troncal": True},
        {"id": 26, "troncal": True}
    ] if c["troncal"]],
    "ZONAL_UCE": [c["id"] for c in [
        {"id": 4, "zonal_uce": True}, {"id": 8, "zonal_uce": True}, {"id": 9, "zonal_uce": True},
        {"id": 11, "zonal_uce": True}, {"id": 12, "zonal_uce": True}, {"id": 14, "zonal_uce": True},
        {"id": 15, "zonal_uce": True}, {"id": 16, "zonal_uce": True}, {"id": 17, "zonal_uce": True},
        {"id": 18, "zonal_uce": True}, {"id": 19, "zonal_uce": True}, {"id": 20, "zonal_uce": True},
        {"id": 22, "zonal_uce": True}, {"id": 23, "zonal_uce": True}, {"id": 25, "zonal_uce": True},
        {"id": 27, "zonal_uce": True}
    ] if c["zonal_uce"]],
    "ALIMENTACION": [c["id"] for c in [
        {"id": 8, "zonal_alimentacion": True}, {"id": 9, "zonal_alimentacion": True},
        {"id": 10, "zonal_alimentacion": True}, {"id": 11, "zonal_alimentacion": True},
        {"id": 12, "zonal_alimentacion": True}, {"id": 13, "zonal_alimentacion": True},
        {"id": 21, "zonal_alimentacion": True}, {"id": 25, "zonal_alimentacion": True}
    ] if c["zonal_alimentacion"]]
}
if not concessionaires_by_type_from_script6["TRONCAL"]: concessionaires_by_type_from_script6["TRONCAL"] = [1]
if not concessionaires_by_type_from_script6["ZONAL_UCE"]: concessionaires_by_type_from_script6["ZONAL_UCE"] = [14]
if not concessionaires_by_type_from_script6["ALIMENTACION"]: concessionaires_by_type_from_script6["ALIMENTACION"] = [10]


driver_component_pool = []
for component_type, count in driver_counts_by_component.items():
    driver_component_pool.extend([component_type] * count)
random.shuffle(driver_component_pool)

all_records_strings = []
driver_id_counter = 0

for i in range(total_drivers_to_generate):
    driver_id_counter += 1
    driver_id_val = driver_id_counter

    # Usar un formato diferente para employee_id para asegurar unicidad más fácil
    employee_id_val = f"E{str(fake_co.unique.random_number(digits=8, fix_len=True))}"
    first_name_val = fake_co.first_name().replace("'", "''")
    last_name_val = fake_co.last_name().replace("'", "''")
    
    assigned_component = driver_component_pool[i]
    con_id_val = None
    if assigned_component == "TRONCAL" and concessionaires_by_type_from_script6["TRONCAL"]:
        con_id_val = random.choice(concessionaires_by_type_from_script6["TRONCAL"])
    elif assigned_component == "ZONAL_UCE" and concessionaires_by_type_from_script6["ZONAL_UCE"]:
        con_id_val = random.choice(concessionaires_by_type_from_script6["ZONAL_UCE"])
    elif assigned_component == "ALIMENTACION" and concessionaires_by_type_from_script6["ALIMENTACION"]:
        con_id_val = random.choice(concessionaires_by_type_from_script6["ALIMENTACION"])
    else: # Fallback
        con_id_val = random.randint(1, max_concessionaire_id)

    hire_date_obj = fake_co.date_between(start_date='-15y', end_date='-30d') # Contratado al menos hace 30 días
    hire_date_val = hire_date_obj.strftime('%Y-%m-%d')
    
    license_number_val = f"{chr(random.randint(65,90))}{chr(random.randint(65,90))}{str(fake_co.unique.random_number(digits=6, fix_len=True))}"
    
    license_expiry_date_obj = fake_co.date_between(start_date='today', end_date='+5y')
    license_expiry_date_val = license_expiry_date_obj.strftime('%Y-%m-%d')
    
    status_val = "active" if random.random() < 0.92 else random.choice(["on_leave", "inactive"])

    all_records_strings.append(
        f"({driver_id_val}, '{employee_id_val}', '{first_name_val}', '{last_name_val}', {con_id_val}, "
        f"'{hire_date_val}', '{license_number_val}', '{license_expiry_date_val}', '{status_val}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO drivers (driver_id, employee_id, first_name, last_name, concessionaire_id, hire_date, license_number, license_expiry_date, status) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO drivers (driver_id, employee_id, first_name, last_name, concessionaire_id, hire_date, license_number, license_expiry_date, status) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for drivers generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating driver records...
SQL script for drivers generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/11_insert_drivers.sql


In [22]:
# Nombre del archivo: generate_routes.py
import faker
import random
import os

fake_co = faker.Faker('es_CO')
output_file = "12_insert_routes.sql"

# IDs máximos/rangos de scripts anteriores
max_concessionaire_id = 27 # De 02_insert_concessionaires.sql
# De 09_insert_stations.sql
NUM_PORTALS_GEN = 9
NUM_CABLE_STATIONS_GEN = 4
TOTAL_STATIONS_INC_CABLE_GEN = 142
NUM_TRONCAL_STATIONS_OTHER_GEN = TOTAL_STATIONS_INC_CABLE_GEN - NUM_PORTALS_GEN - NUM_CABLE_STATIONS_GEN
NUM_ZONAL_PARADEROS_GEN = 7623 # Este es el conteo usado en generate_stations

PORTAL_IDS_LIST = list(range(1, NUM_PORTALS_GEN + 1))
CABLE_STATION_IDS_LIST = list(range(NUM_PORTALS_GEN + 1, NUM_PORTALS_GEN + NUM_CABLE_STATIONS_GEN + 1))
TRONCAL_STATIONS_OTHER_IDS_LIST = list(range(NUM_PORTALS_GEN + NUM_CABLE_STATIONS_GEN + 1, TOTAL_STATIONS_INC_CABLE_GEN + 1))
ZONAL_PARADERO_IDS_LIST = list(range(TOTAL_STATIONS_INC_CABLE_GEN + 1, TOTAL_STATIONS_INC_CABLE_GEN + NUM_ZONAL_PARADEROS_GEN + 1))
ALL_TRONCAL_TYPE_STATIONS_LIST = PORTAL_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST


# Concessionaire assignment (misma lógica simplificada que en vehicles/drivers)
concessionaires_by_type_from_script6 = {
    "TRONCAL": [c["id"] for c in [{"id": 1, "troncal": True}, {"id": 2, "troncal": True}, {"id": 3, "troncal": True}, {"id": 4, "troncal": True}, {"id": 5, "troncal": True}, {"id": 6, "troncal": True}, {"id": 7, "troncal": True}, {"id": 8, "troncal": True}, {"id": 9, "troncal": True}, {"id": 26, "troncal": True}] if c["troncal"]],
    "ZONAL_UCE": [c["id"] for c in [{"id": 4, "zonal_uce": True}, {"id": 8, "zonal_uce": True}, {"id": 9, "zonal_uce": True},{"id": 11, "zonal_uce": True}, {"id": 12, "zonal_uce": True}, {"id": 14, "zonal_uce": True},{"id": 15, "zonal_uce": True}, {"id": 16, "zonal_uce": True}, {"id": 17, "zonal_uce": True},{"id": 18, "zonal_uce": True}, {"id": 19, "zonal_uce": True}, {"id": 20, "zonal_uce": True},{"id": 22, "zonal_uce": True}, {"id": 23, "zonal_uce": True}, {"id": 25, "zonal_uce": True},{"id": 27, "zonal_uce": True}] if c["zonal_uce"]],
    "ALIMENTACION": [c["id"] for c in [{"id": 8, "zonal_alimentacion": True}, {"id": 9, "zonal_alimentacion": True},{"id": 10, "zonal_alimentacion": True}, {"id": 11, "zonal_alimentacion": True},{"id": 12, "zonal_alimentacion": True}, {"id": 13, "zonal_alimentacion": True},{"id": 21, "zonal_alimentacion": True}, {"id": 25, "zonal_alimentacion": True}] if c["zonal_alimentacion"]],
    "CABLE": [c["id"] for c in [{"id": 24, "cable": True}] if c["cable"]]
}
if not concessionaires_by_type_from_script6["TRONCAL"]: concessionaires_by_type_from_script6["TRONCAL"] = [1]
if not concessionaires_by_type_from_script6["ZONAL_UCE"]: concessionaires_by_type_from_script6["ZONAL_UCE"] = [14]
if not concessionaires_by_type_from_script6["ALIMENTACION"]: concessionaires_by_type_from_script6["ALIMENTACION"] = [10]
if not concessionaires_by_type_from_script6["CABLE"]: concessionaires_by_type_from_script6["CABLE"] = [24]


# Route counts from PDF
route_counts = {
    "TRONCAL": 99, "DUAL": 5, "ZONAL_UCE": 347, "ALIMENTADORA": 106, "CABLE": 1
}
dual_route_codes_pdf = ["DM81", "MK86", "ML82", "MC84", "M83"]
zonal_route_codes_pdf_examples = ["T11", "T13", "BH907", "330", "T25", "CG147", "94", "SE14", "614", "SE6"]


print("Generating route records...")
all_records_strings = []
route_id_counter = 0
generated_route_codes_set = set()

def generate_unique_route_code(r_type, counter, predefined_list=None):
    global generated_route_codes_set
    attempts = 0
    max_attempts = 100
    
    if predefined_list and counter < len(predefined_list):
        code = predefined_list[counter]
        if code not in generated_route_codes_set:
            generated_route_codes_set.add(code)
            return code
        # If predefined code already used (should not happen if list is unique), fall through to generate
        
    while attempts < max_attempts:
        if r_type == "TRONCAL":
            p = random.choice(["B","C","D","E","F","G","H","J","K","L","M"])
            n = random.randint(1,99)
            code = f"{p}{n}"
        elif r_type == "DUAL": # Will be stored as TRONCAL type in DB as per current schema
            p = random.choice(["DM","MD","ML","CM","MF"])
            n = random.randint(80,99)
            code = f"{p}{n}"
        elif r_type == "ZONAL_UCE":
            if random.random() < 0.5: code = str(random.randint(10, 999))
            else: code = f"{random.choice(['A','B','C','E','H','K','P','S','T','Z'])}{random.randint(10,999)}"
            if random.random() < 0.2: code = f"SITP{code}"
        elif r_type == "ALIMENTADORA":
            code = f"{random.randint(1,16)}-{random.randint(1,12)}"
        elif r_type == "CABLE":
            code = f"TC{counter+1}"
        else:
            code = f"R{str(counter).zfill(4)}"

        if code not in generated_route_codes_set:
            generated_route_codes_set.add(code)
            return code
        counter +=1 # Try different counter for generation
        attempts +=1
    return f"ERR_CODE_{r_type}_{random.randint(1000,9999)}"


route_definitions_list = [
    {"type_pdf": "TRONCAL", "type_db": "TRONCAL", "count": route_counts["TRONCAL"], "con_ids": concessionaires_by_type_from_script6["TRONCAL"], "orig_pool": ALL_TRONCAL_TYPE_STATIONS_LIST, "dest_pool": ALL_TRONCAL_TYPE_STATIONS_LIST, "predefined_codes": None},
    {"type_pdf": "DUAL", "type_db": "TRONCAL", "count": route_counts["DUAL"], "con_ids": concessionaires_by_type_from_script6["TRONCAL"], "orig_pool": PORTAL_IDS_LIST, "dest_pool": PORTAL_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST, "predefined_codes": dual_route_codes_pdf}, # Dual routes map to TRONCAL type in DB
    {"type_pdf": "ZONAL_UCE", "type_db": "ZONAL_UCE", "count": route_counts["ZONAL_UCE"], "con_ids": concessionaires_by_type_from_script6["ZONAL_UCE"], "orig_pool": ZONAL_PARADERO_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST, "dest_pool": ZONAL_PARADERO_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST, "predefined_codes": zonal_route_codes_pdf_examples},
    {"type_pdf": "ALIMENTADORA", "type_db": "ALIMENTADORA", "count": route_counts["ALIMENTADORA"], "con_ids": concessionaires_by_type_from_script6["ALIMENTACION"], "orig_pool": PORTAL_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST, "dest_pool": ZONAL_PARADERO_IDS_LIST, "predefined_codes": None},
    {"type_pdf": "CABLE", "type_db": "CABLE", "count": route_counts["CABLE"], "con_ids": concessionaires_by_type_from_script6["CABLE"], "orig_pool": [CABLE_STATION_IDS_LIST[0]], "dest_pool": [CABLE_STATION_IDS_LIST[-1]], "predefined_codes": None}
]

route_type_counter = {} # To use with predefined codes

for definition in route_definitions_list:
    pdf_type = definition["type_pdf"]
    db_type = definition["type_db"]
    if pdf_type not in route_type_counter:
        route_type_counter[pdf_type] = 0

    for _ in range(definition["count"]):
        route_id_counter += 1
        route_id_val = route_id_counter
        
        code_val = generate_unique_route_code(pdf_type, route_type_counter[pdf_type], definition["predefined_codes"])
        route_type_counter[pdf_type] += 1
        
        origin_id_val, dest_id_val = None, None
        if len(definition["orig_pool"]) > 0 and len(definition["dest_pool"]) > 0 :
            origin_id_val = random.choice(definition["orig_pool"])
            # Ensure destination is different from origin
            temp_dest_pool = [sid for sid in definition["dest_pool"] if sid != origin_id_val]
            if not temp_dest_pool: temp_dest_pool = definition["dest_pool"] # Fallback if all are same as origin
            dest_id_val = random.choice(temp_dest_pool) if temp_dest_pool else origin_id_val # Further fallback
        else: # Should not happen with current data
            origin_id_val = random.randint(1,10)
            dest_id_val = random.randint(1,10)


        # Simplified name, can be improved with actual station names if performance allows
        route_name_val = f"{code_val}: Origen Est.{origin_id_val} - Destino Est.{dest_id_val}".replace("'", "''")
        concessionaire_id_val = random.choice(definition["con_ids"]) if definition["con_ids"] else random.randint(1,max_concessionaire_id)
        is_active_val = True if random.random() < 0.9 else False

        all_records_strings.append(
            f"({route_id_val}, '{code_val}', '{route_name_val}', '{db_type}', "
            f"{origin_id_val}, {dest_id_val}, {concessionaire_id_val}, {is_active_val})"
        )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO routes (route_id, route_code, route_name, route_type, origin_station_id, destination_station_id, concessionaire_id, is_active) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO routes (route_id, route_code, route_name, route_type, origin_station_id, destination_station_id, concessionaire_id, is_active) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for routes generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating route records...
SQL script for routes generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/12_insert_routes.sql


In [23]:
# Nombre del archivo: generate_intermediate_stations.py
import random
import os

output_file = "13_insert_intermediate_stations.sql"

# IDs máximos/rangos de scripts anteriores
# From 12_insert_routes.sql
NUM_TRONCAL_ROUTES_GEN = 99
NUM_DUAL_ROUTES_GEN = 5
NUM_ZONAL_UCE_ROUTES_GEN = 347
NUM_ALIMENTADORA_ROUTES_GEN = 106
NUM_CABLE_ROUTES_GEN = 1
total_routes_generated = NUM_TRONCAL_ROUTES_GEN + NUM_DUAL_ROUTES_GEN + NUM_ZONAL_UCE_ROUTES_GEN + NUM_ALIMENTADORA_ROUTES_GEN + NUM_CABLE_ROUTES_GEN

# From 09_insert_stations.sql
NUM_PORTALS_GEN = 9
NUM_CABLE_STATIONS_GEN = 4
TOTAL_STATIONS_INC_CABLE_GEN = 142
NUM_TRONCAL_STATIONS_OTHER_GEN = TOTAL_STATIONS_INC_CABLE_GEN - NUM_PORTALS_GEN - NUM_CABLE_STATIONS_GEN
NUM_ZONAL_PARADEROS_GEN = 7623

PORTAL_IDS_LIST = list(range(1, NUM_PORTALS_GEN + 1))
CABLE_STATION_IDS_LIST = list(range(NUM_PORTALS_GEN + 1, NUM_PORTALS_GEN + NUM_CABLE_STATIONS_GEN + 1)) # 10, 11, 12, 13
TRONCAL_STATIONS_OTHER_IDS_LIST = list(range(NUM_PORTALS_GEN + NUM_CABLE_STATIONS_GEN + 1, TOTAL_STATIONS_INC_CABLE_GEN + 1))
ZONAL_PARADERO_IDS_LIST = list(range(TOTAL_STATIONS_INC_CABLE_GEN + 1, TOTAL_STATIONS_INC_CABLE_GEN + NUM_ZONAL_PARADEROS_GEN + 1))
ALL_TRONCAL_TYPE_STATIONS_LIST = PORTAL_IDS_LIST + TRONCAL_STATIONS_OTHER_IDS_LIST

# Route ID ranges and types (consistent with generate_routes.py)
route_id_start = 1
route_info_map = {} # Store type, origin, dest for each route_id

# Troncal Routes
for r_id in range(route_id_start, route_id_start + NUM_TRONCAL_ROUTES_GEN):
    route_info_map[r_id] = {"type": "TRONCAL", "stop_pool": ALL_TRONCAL_TYPE_STATIONS_LIST, "min_stops": 3, "max_stops": 15}
route_id_start += NUM_TRONCAL_ROUTES_GEN

# Dual Routes
for r_id in range(route_id_start, route_id_start + NUM_DUAL_ROUTES_GEN):
    route_info_map[r_id] = {"type": "DUAL", "stop_pool": ALL_TRONCAL_TYPE_STATIONS_LIST + random.sample(ZONAL_PARADERO_IDS_LIST, k=min(len(ZONAL_PARADERO_IDS_LIST), 200)), "min_stops": 8, "max_stops": 25} # Can use zonal
route_id_start += NUM_DUAL_ROUTES_GEN

# Zonal UCE Routes
for r_id in range(route_id_start, route_id_start + NUM_ZONAL_UCE_ROUTES_GEN):
    route_info_map[r_id] = {"type": "ZONAL_UCE", "stop_pool": ZONAL_PARADERO_IDS_LIST + random.sample(TRONCAL_STATIONS_OTHER_IDS_LIST, k=min(len(TRONCAL_STATIONS_OTHER_IDS_LIST),50)), "min_stops": 10, "max_stops": 40}
route_id_start += NUM_ZONAL_UCE_ROUTES_GEN

# Alimentadora Routes
for r_id in range(route_id_start, route_id_start + NUM_ALIMENTADORA_ROUTES_GEN):
    route_info_map[r_id] = {"type": "ALIMENTADORA", "stop_pool": ZONAL_PARADERO_IDS_LIST, "min_stops": 5, "max_stops": 20}
route_id_start += NUM_ALIMENTADORA_ROUTES_GEN

# Cable Routes
for r_id in range(route_id_start, route_id_start + NUM_CABLE_ROUTES_GEN):
    # Cable route has specific intermediate stations: CABLE_STATION_IDS_LIST are [Origin, Inter1, Inter2, Dest] for 4 stations
    # So intermediate are CABLE_STATION_IDS_LIST[1] and CABLE_STATION_IDS_LIST[2] if origin/dest are [0] and [-1]
    fixed_stops = CABLE_STATION_IDS_LIST[1:-1] if len(CABLE_STATION_IDS_LIST) > 2 else []
    route_info_map[r_id] = {"type": "CABLE", "stop_pool": fixed_stops, "min_stops": len(fixed_stops), "max_stops": len(fixed_stops), "is_fixed": True}


print(f"Generating intermediate station records for {total_routes_generated} routes...")
all_records_strings = []
intermediate_station_id_counter = 0

for route_id_val in range(1, total_routes_generated + 1):
    route_data = route_info_map.get(route_id_val)
    if not route_data:
        print(f"Warning: No route data found for route_id {route_id_val}")
        continue

    num_stops = random.randint(route_data["min_stops"], route_data["max_stops"])
    
    potential_stops_pool = route_data["stop_pool"]
    if not potential_stops_pool: # Skip if no pool (e.g. Cable route with only 2 stations)
        if route_data.get("is_fixed") and not potential_stops_pool : # Expected if fixed stops is empty
            pass
        else:
            print(f"Warning: Empty stop_pool for route_id {route_id_val} of type {route_data['type']}")
        continue
        
    selected_stops_for_route = []
    if route_data.get("is_fixed"):
        selected_stops_for_route = potential_stops_pool # Use the pre-defined fixed stops
    elif len(potential_stops_pool) > 0:
        num_to_sample = min(num_stops, len(potential_stops_pool))
        selected_stops_for_route = random.sample(potential_stops_pool, k=num_to_sample)
    
    # Ensure origin and destination are not in the intermediate stops (conceptually)
    # The route definition in `routes` table has origin and destination.
    # Here we are only defining the sequence of stations *between* them.

    for seq_order, station_id_val in enumerate(selected_stops_for_route):
        intermediate_station_id_counter += 1
        intermediate_station_id_val = intermediate_station_id_counter # Python controla el ID
        
        all_records_strings.append(
            f"({intermediate_station_id_val}, {route_id_val}, {station_id_val}, {seq_order + 1})"
        )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO intermediate_stations (intermediate_station_id, route_id, station_id, sequence_order) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO intermediate_stations (intermediate_station_id, route_id, station_id, sequence_order) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for intermediate_stations generated successfully: {os.path.abspath(output_file)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating intermediate station records for 558 routes...
SQL script for intermediate_stations generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/13_insert_intermediate_stations.sql


In [24]:
# Nombre del archivo: generate_trips.py
import faker
import random
import os
from datetime import datetime, timedelta, date, time
import uuid

fake_co = faker.Faker('es_CO')
output_file = "14_insert_trips.sql"

# --- Configuration ---
SIMULATION_START_DATE = date(2024, 6, 3) # Lunes
SIMULATION_END_DATE = date(2024, 6, 9)   # Domingo
SIMULATION_DAYS = (SIMULATION_END_DATE - SIMULATION_START_DATE).days + 1

# --- ID Ranges & Counts (from previous scripts - CRUCIAL for consistency) ---
# Cards (Script 05)
MAX_CARD_ID = 2400000 // 100
ASSUMED_ACTIVE_CARD_THRESHOLD_ID = 2000000 // 100

# Stations (Script 09)
MAX_STATION_ID_FROM_SCRIPT_09 = 7765 # Ajusta si el contador final en script 09 fue diferente
PORTAL_IDS_RANGE_TRIPS = (1, 9)
CABLE_STATION_IDS_RANGE_TRIPS = (10, 13)
TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS = (14, 142)
ZONAL_PARADERO_IDS_RANGE_TRIPS = (143, MAX_STATION_ID_FROM_SCRIPT_09)
ALL_TRONCAL_STATIONS_IDS_TRIPS = list(range(PORTAL_IDS_RANGE_TRIPS[0], PORTAL_IDS_RANGE_TRIPS[1] + 1)) + \
                                 list(range(TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS[0], TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS[1] + 1))
ALL_CABLE_STATIONS_IDS_TRIPS = list(range(CABLE_STATION_IDS_RANGE_TRIPS[0], CABLE_STATION_IDS_RANGE_TRIPS[1] + 1))
ALL_ZONAL_PARADERO_IDS_TRIPS = list(range(ZONAL_PARADERO_IDS_RANGE_TRIPS[0], ZONAL_PARADERO_IDS_RANGE_TRIPS[1] + 1))
ALL_STATION_IDS_TRIPS = list(range(1, MAX_STATION_ID_FROM_SCRIPT_09 + 1))


# Routes (Script 12) & Intermediate Stations (Script 13)
# Route counts from script 12
NUM_TRONCAL_ROUTES_GEN_TRIPS = 99
NUM_DUAL_ROUTES_GEN_TRIPS = 5
NUM_ZONAL_UCE_ROUTES_GEN_TRIPS = 347
NUM_ALIMENTADORA_ROUTES_GEN_TRIPS = 106
NUM_CABLE_ROUTES_GEN_TRIPS = 1
MAX_ROUTE_ID_TRIPS = NUM_TRONCAL_ROUTES_GEN_TRIPS + NUM_DUAL_ROUTES_GEN_TRIPS + NUM_ZONAL_UCE_ROUTES_GEN_TRIPS + NUM_ALIMENTADORA_ROUTES_GEN_TRIPS + NUM_CABLE_ROUTES_GEN_TRIPS

# Route ID ranges (conceptual, based on order of generation in script 12)
current_route_id_start_trips = 1
TRONCAL_ROUTE_IDS_RANGE_TRIPS = (current_route_id_start_trips, current_route_id_start_trips + NUM_TRONCAL_ROUTES_GEN_TRIPS -1)
current_route_id_start_trips += NUM_TRONCAL_ROUTES_GEN_TRIPS
DUAL_ROUTE_IDS_RANGE_TRIPS = (current_route_id_start_trips, current_route_id_start_trips + NUM_DUAL_ROUTES_GEN_TRIPS -1)
current_route_id_start_trips += NUM_DUAL_ROUTES_GEN_TRIPS
ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS = (current_route_id_start_trips, current_route_id_start_trips + NUM_ZONAL_UCE_ROUTES_GEN_TRIPS -1)
current_route_id_start_trips += NUM_ZONAL_UCE_ROUTES_GEN_TRIPS
ALIMENTADORA_ROUTE_IDS_RANGE_TRIPS = (current_route_id_start_trips, current_route_id_start_trips + NUM_ALIMENTADORA_ROUTES_GEN_TRIPS -1)
current_route_id_start_trips += NUM_ALIMENTADORA_ROUTES_GEN_TRIPS
CABLE_ROUTE_IDS_RANGE_TRIPS = (current_route_id_start_trips, current_route_id_start_trips + NUM_CABLE_ROUTES_GEN_TRIPS -1)

# Vehicles (Script 10)
MAX_VEHICLE_ID_TRIPS = 10563
# Drivers (Script 11)
MAX_DRIVER_ID_TRIPS = 24446
# Fares (Script 07)
FARE_ID_STANDARD_SITP_TRIPS = 1
FARE_ID_TRANSFER_0_TRIPS = 2
FARE_ID_TRANSFER_200_TRIPS = 3
FARE_ID_STANDARD_CABLE_TRIPS = 4

# --- Daily Trip Volume (Scaled by 1/100) ---
TRIPS_PER_WEEKDAY_SCALED = random.randint(3200000, 3800000) // 100
TRIPS_PER_SATURDAY_SCALED = random.randint(2200000, 2800000) // 100
TRIPS_PER_SUNDAY_OR_HOLIDAY_SCALED = random.randint(1300000, 1800000) // 100

PROB_TRANSFER_TRIPS = 0.18
TRANSFER_TYPE_DIST_TRIPS = {"ZONAL_TO_ZONAL": 0.49, "ZONAL_TO_TRONCAL_CABLE": 0.29, "TRONCAL_CABLE_TO_ZONAL": 0.22}
TRANSFER_WINDOW_MINUTES_TRIPS = 110

# --- Popularity Biasing (Simplified) ---
POPULAR_ZONAL_ROUTE_IDS_TRIPS = list(range(ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0], min(ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[1]+1, ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0] + 20))) # First 20 Zonal
POPULAR_TRONCAL_ROUTE_IDS_TRIPS = list(range(TRONCAL_ROUTE_IDS_RANGE_TRIPS[0], min(TRONCAL_ROUTE_IDS_RANGE_TRIPS[1]+1, TRONCAL_ROUTE_IDS_RANGE_TRIPS[0] + 20))) # First 20 Troncal
POPULAR_PORTAL_IDS_TRIPS = list(range(PORTAL_IDS_RANGE_TRIPS[0], PORTAL_IDS_RANGE_TRIPS[1] + 1))
POPULAR_TRONCAL_STATION_IDS_TRIPS = list(range(TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS[0], min(TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS[1]+1, TRONCAL_STATION_IDS_OTHER_RANGE_TRIPS[0] + 30)))


# --- Mocked Route Structures (VERY SIMPLIFIED - for standalone generation) ---
# This is a placeholder. In a real scenario with DB access, you'd query intermediate_stations.
# Here, we simulate that a route has a list of plausible stops.
# We need to pre-load/define this structure based on how 13_insert_intermediate_stations.sql was generated.
# For this script, we'll generate a *conceptual* path on the fly.
# This is a MAJOR simplification point.
route_conceptual_paths = {} # {route_id: [station1, station2, ...]}

def get_conceptual_path_for_route(route_id, route_type_actual):
    if route_id in route_conceptual_paths:
        return route_conceptual_paths[route_id]

    path = []
    num_stops_on_path = 0
    station_pool = []

    if route_type_actual == "TRONCAL" or route_type_actual == "DUAL":
        station_pool = ALL_TRONCAL_STATIONS_IDS_TRIPS
        num_stops_on_path = random.randint(5, 20)
    elif route_type_actual == "ZONAL_UCE":
        station_pool = ALL_ZONAL_PARADERO_IDS_TRIPS + random.sample(ALL_TRONCAL_STATIONS_IDS_TRIPS, k=min(len(ALL_TRONCAL_STATIONS_IDS_TRIPS), 5))
        num_stops_on_path = random.randint(10, 40)
    elif route_type_actual == "ALIMENTADORA":
        # Origin from Troncal, rest zonal
        path.append(random.choice(ALL_TRONCAL_STATIONS_IDS_TRIPS))
        station_pool = ALL_ZONAL_PARADERO_IDS_TRIPS
        num_stops_on_path = random.randint(5, 20) -1 # -1 because origin is already added
    elif route_type_actual == "CABLE":
        route_conceptual_paths[route_id] = ALL_CABLE_STATIONS_IDS_TRIPS # Assumes they are in order
        return ALL_CABLE_STATIONS_IDS_TRIPS
    
    if not station_pool: # Fallback
        route_conceptual_paths[route_id] = []
        return []

    # Add origin if not alimentadora/cable
    if route_type_actual != "ALIMENTADORA" and len(station_pool) > 0 :
        path.append(random.choice(station_pool))

    # Add intermediate and destination
    available_for_path = [s for s in station_pool if s not in path]
    num_to_sample = min(num_stops_on_path, len(available_for_path))
    if num_to_sample > 0:
        path.extend(random.sample(available_for_path, k=num_to_sample))
    
    if not path and station_pool : # If path is still empty, add at least one station
        path.append(random.choice(station_pool))

    route_conceptual_paths[route_id] = path
    return path


def get_route_type_from_id_trips(route_id): # Renamed to avoid conflict if running in same context
    if TRONCAL_ROUTE_IDS_RANGE_TRIPS[0] <= route_id <= TRONCAL_ROUTE_IDS_RANGE_TRIPS[1]: return "TRONCAL"
    if DUAL_ROUTE_IDS_RANGE_TRIPS[0] <= route_id <= DUAL_ROUTE_IDS_RANGE_TRIPS[1]: return "DUAL"
    if ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0] <= route_id <= ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[1]: return "ZONAL_UCE"
    if ALIMENTADORA_ROUTE_IDS_RANGE_TRIPS[0] <= route_id <= ALIMENTADORA_ROUTE_IDS_RANGE_TRIPS[1]: return "ALIMENTADORA"
    if CABLE_ROUTE_IDS_RANGE_TRIPS[0] <= route_id <= CABLE_ROUTE_IDS_RANGE_TRIPS[1]: return "CABLE"
    return "UNKNOWN" # Should not happen

def get_boarding_time_trips(current_date, day_type_idx):
    time_slots_weights = [
        # (h_start, m_start, h_end, m_end, [prob_weekday, prob_saturday, prob_sunday])
        (0, 0, 3, 59,  [0.02, 0.03, 0.02]),   # Night Owl
        (4, 0, 5, 29,  [0.10, 0.08, 0.05]),   # Early Morning
        (5, 30, 8, 59, [0.30, 0.20, 0.10]),  # Morning Peak
        (9, 0, 16, 29, [0.28, 0.35, 0.43]), # Mid-Day / Off-Peak
        (16, 30, 19, 59,[0.25, 0.25, 0.30]),# Afternoon Peak
        (20, 0, 23, 59,[0.05, 0.09, 0.10]), # Evening
    ]
    slots, weights = zip(*[(s[:4], s[4][day_type_idx]) for s in time_slots_weights])
    chosen_slot = random.choices(slots, weights=weights, k=1)[0]
    
    hour = random.randint(chosen_slot[0], chosen_slot[2])
    minute = random.randint(0, 59)
    if hour == chosen_slot[2]: minute = random.randint(0, chosen_slot[3])
    if hour == chosen_slot[0] and chosen_slot[1] > 0 : minute = random.randint(chosen_slot[1], 59)
    
    return datetime.combine(current_date, time(hour, minute, random.randint(0,59)))

def get_travel_time_seconds_trips(num_stops, route_type):
    time_per_stop = 120
    if route_type == "TRONCAL" or route_type == "DUAL": time_per_stop = random.randint(100, 200)
    elif route_type == "ZONAL_UCE" or route_type == "ALIMENTADORA": time_per_stop = random.randint(70, 160)
    elif route_type == "CABLE": time_per_stop = random.randint(180, 300)
    return num_stops * time_per_stop + random.randint(-30, 90)


# --- Main Generation Loop ---
trip_id_counter = 0
all_trip_records_strings = []
total_trips_generated_for_log = 0

print(f"Generating trips from {SIMULATION_START_DATE} to {SIMULATION_END_DATE}...")

for day_offset in range(SIMULATION_DAYS):
    current_processing_date = SIMULATION_START_DATE + timedelta(days=day_offset)
    weekday_idx = current_processing_date.weekday() # Monday=0, Sunday=6
    
    daily_trip_target = 0
    day_type_for_time = 0 # 0:Weekday, 1:Sat, 2:Sun
    if weekday_idx < 5: daily_trip_target, day_type_for_time = TRIPS_PER_WEEKDAY_SCALED, 0
    elif weekday_idx == 5: daily_trip_target, day_type_for_time = TRIPS_PER_SATURDAY_SCALED, 1
    else: daily_trip_target, day_type_for_time = TRIPS_PER_SUNDAY_OR_HOLIDAY_SCALED, 2
    
    print(f"Generating ~{daily_trip_target} trip legs for {current_processing_date.strftime('%Y-%m-%d')}...")
    
    journeys_initiated_today = 0 # Count distinct journeys, not legs
    
    # Loop until enough journey-initiating trips are made for the day's target
    while journeys_initiated_today < daily_trip_target:
        is_first_leg = True
        current_transfer_group_id_val = str(uuid.uuid4())
        prev_disembark_time = None
        prev_route_type_actual = None
        legs_this_journey = 0

        while legs_this_journey < 3: # Max 2 transfers (3 legs)
            legs_this_journey += 1
            trip_id_counter += 1
            trip_id_val = trip_id_counter

            card_id_val = random.randint(1, ASSUMED_ACTIVE_CARD_THRESHOLD_ID) if random.random() < 0.8 else random.randint(1, MAX_CARD_ID)
            
            route_id_val = None
            current_route_type_actual = None
            fare_id_val = FARE_ID_STANDARD_SITP_TRIPS

            if is_first_leg:
                # Bias route selection for first leg
                rand_route_choice = random.random()
                if rand_route_choice < 0.1 and POPULAR_TRONCAL_ROUTE_IDS_TRIPS : route_id_val = random.choice(POPULAR_TRONCAL_ROUTE_IDS_TRIPS)
                elif rand_route_choice < 0.2 and POPULAR_ZONAL_ROUTE_IDS_TRIPS : route_id_val = random.choice(POPULAR_ZONAL_ROUTE_IDS_TRIPS)
                else: route_id_val = random.randint(1, MAX_ROUTE_ID_TRIPS) # General random choice
                current_route_type_actual = get_route_type_from_id_trips(route_id_val)
                fare_id_val = FARE_ID_STANDARD_CABLE_TRIPS if current_route_type_actual == "CABLE" else FARE_ID_STANDARD_SITP_TRIPS
            else: # Transfer leg
                # Determine new route type based on transfer logic
                # This is simplified, assumes previous_route_type_actual is set
                if prev_route_type_actual in ["ZONAL_UCE", "ALIMENTADORA"]:
                    if random.random() < TRANSFER_TYPE_DIST_TRIPS["ZONAL_TO_ZONAL"]:
                        route_id_val = random.randint(ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0], ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[1])
                    else: # Zonal to Troncal/Cable
                        route_id_val = random.choice(list(range(TRONCAL_ROUTE_IDS_RANGE_TRIPS[0],TRONCAL_ROUTE_IDS_RANGE_TRIPS[1]+1)) + list(range(DUAL_ROUTE_IDS_RANGE_TRIPS[0],DUAL_ROUTE_IDS_RANGE_TRIPS[1]+1)) + list(range(CABLE_ROUTE_IDS_RANGE_TRIPS[0],CABLE_ROUTE_IDS_RANGE_TRIPS[1]+1)))
                elif prev_route_type_actual in ["TRONCAL", "DUAL", "CABLE"]: # Troncal/Cable to Zonal
                    route_id_val = random.randint(ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0], ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[1])
                else: # Fallback
                    route_id_val = random.randint(ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[0], ZONAL_UCE_ROUTE_IDS_RANGE_TRIPS[1])
                
                current_route_type_actual = get_route_type_from_id_trips(route_id_val)
                fare_id_val = FARE_ID_TRANSFER_0_TRIPS if random.random() < 0.7 else FARE_ID_TRANSFER_200_TRIPS # Weighted transfer cost

            vehicle_id_val = random.randint(1, MAX_VEHICLE_ID_TRIPS)
            driver_id_val = random.randint(1, MAX_DRIVER_ID_TRIPS)
            
            conceptual_path = get_conceptual_path_for_route(route_id_val, current_route_type_actual)
            if not conceptual_path or len(conceptual_path) < 1: # Need at least one station for origin/destination
                 if is_first_leg: journeys_initiated_today +=1 # Count as an attempt
                 break # End this journey if path is invalid

            boarding_station_id_val, disembarking_station_id_val = None, None
            boarding_time_obj, disembarking_time_obj = None, None
            num_stops_on_leg = 0

            if is_first_leg:
                boarding_time_obj = get_boarding_time_trips(current_processing_date, day_type_for_time)
                # Board anywhere except the very last station (if path > 1)
                max_boarding_idx = len(conceptual_path) - 2 if len(conceptual_path) > 1 else 0
                boarding_idx = random.randint(0, max_boarding_idx)
                boarding_station_id_val = conceptual_path[boarding_idx]
            else: # Transfer leg
                boarding_time_obj = prev_disembark_time + timedelta(minutes=random.randint(5, TRANSFER_WINDOW_MINUTES_TRIPS - 10))
                boarding_idx = 0 # Start at the conceptual origin of the new route for transfer
                boarding_station_id_val = conceptual_path[boarding_idx]

            # Ensure boarding time is within the current processing day (mostly)
            if boarding_time_obj.date() != current_processing_date:
                if boarding_time_obj > datetime.combine(current_processing_date + timedelta(days=1), time(4,0,0)) or \
                   boarding_time_obj < datetime.combine(current_processing_date, time(0,0,0)):
                    if is_first_leg: journeys_initiated_today +=1
                    break # Invalid time, break journey

            # Disembark at a station after boarding_idx
            if boarding_idx < len(conceptual_path) - 1:
                disembarking_idx = random.randint(boarding_idx + 1, len(conceptual_path) -1)
                disembarking_station_id_val = conceptual_path[disembarking_idx]
                num_stops_on_leg = disembarking_idx - boarding_idx
            else: # Boarded at the last conceptual stop, so disembark there (short/null trip for this leg)
                disembarking_station_id_val = boarding_station_id_val
                num_stops_on_leg = 0
            
            travel_secs = get_travel_time_seconds_trips(num_stops_on_leg, current_route_type_actual)
            disembarking_time_obj = boarding_time_obj + timedelta(seconds=max(60, travel_secs)) # Min 1 min travel time

            # Cap disembark time to avoid spilling too far into next day excessively
            if disembarking_time_obj.date() > current_processing_date and disembarking_time_obj.time() > time(5,0,0):
                disembarking_time_obj = datetime.combine(current_processing_date, time(23,58,0)) + timedelta(seconds=random.randint(0,119))
                if disembarking_time_obj <= boarding_time_obj : disembarking_time_obj = boarding_time_obj + timedelta(minutes=5)


            boarding_time_sql = boarding_time_obj.strftime('%Y-%m-%d %H:%M:%S')
            disembarking_time_sql = disembarking_time_obj.strftime('%Y-%m-%d %H:%M:%S')
            is_transfer_sql = 'TRUE' if not is_first_leg else 'FALSE'

            all_trip_records_strings.append(
                f"({trip_id_val}, {card_id_val}, {vehicle_id_val}, {route_id_val}, {driver_id_val}, "
                f"{boarding_station_id_val}, {disembarking_station_id_val}, '{boarding_time_sql}', '{disembarking_time_sql}', "
                f"{fare_id_val}, {is_transfer_sql}, '{current_transfer_group_id_val}')"
            )
            
            if is_first_leg:
                journeys_initiated_today += 1 # Count this as one main journey initiated for the day's target
            
            prev_disembark_time = disembarking_time_obj
            prev_route_type_actual = current_route_type_actual
            
            if random.random() < PROB_TRANSFER_TRIPS:
                is_first_leg = False # Next leg will be a transfer
            else:
                break # No more transfers for this journey
    
    total_trips_generated_for_log += journeys_initiated_today # Using initiated journeys for daily log
    print(f"Completed day {current_processing_date.strftime('%Y-%m-%d')}. Approx {journeys_initiated_today} journeys initiated. Total trips (legs) so far: {trip_id_counter}")


# --- Write to SQL file ---
try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO trips (trip_id, card_id, vehicle_id, route_id, driver_id, boarding_station_id, disembarking_station_id, boarding_time, disembarking_time, fare_id, is_transfer, transfer_group_id) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_trip_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_trip_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO trips (trip_id, card_id, vehicle_id, route_id, driver_id, boarding_station_id, disembarking_station_id, boarding_time, disembarking_time, fare_id, is_transfer, transfer_group_id) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_trip_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for trips generated successfully: {os.path.abspath(output_file)}")
    print(f"Total trip leg records generated: {len(all_trip_records_strings)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating trips from 2024-06-03 to 2024-06-09...
Generating ~33639 trip legs for 2024-06-03...
Completed day 2024-06-03. Approx 33639 journeys initiated. Total trips (legs) so far: 40838
Generating ~33639 trip legs for 2024-06-04...
Completed day 2024-06-04. Approx 33639 journeys initiated. Total trips (legs) so far: 81518
Generating ~33639 trip legs for 2024-06-05...
Completed day 2024-06-05. Approx 33639 journeys initiated. Total trips (legs) so far: 122267
Generating ~33639 trip legs for 2024-06-06...
Completed day 2024-06-06. Approx 33639 journeys initiated. Total trips (legs) so far: 163142
Generating ~33639 trip legs for 2024-06-07...
Completed day 2024-06-07. Approx 33639 journeys initiated. Total trips (legs) so far: 203872
Generating ~25941 trip legs for 2024-06-08...
Completed day 2024-06-08. Approx 25941 journeys initiated. Total trips (legs) so far: 235322
Generating ~15915 trip legs for 2024-06-09...
Completed day 2024-06-09. Approx 15915 journeys initiated. Total trips (

In [25]:
# Nombre del archivo: generate_realtime_arrivals.py
import faker
import random
import os
from datetime import datetime, timedelta

fake_co = faker.Faker('es_CO')
output_file = "15_insert_realtime_arrivals.sql"

# --- Simulation Time (consistent with previous snapshots) ---
SIMULATION_CURRENT_DATETIME_SNAPSHOT = datetime(2024, 6, 4, 7, 30, 0) # Martes, Junio 4, 2024, 07:30 AM

# --- ID Ranges (from previous scripts) ---
# Stations (Script 09)
MAX_STATION_ID_SNAPSHOT = 7765
PORTAL_IDS_SNAPSHOT = list(range(1, 9 + 1))
TRONCAL_STATIONS_OTHER_SNAPSHOT = list(range(14, 142 + 1))
STATIONS_FOR_ARRIVALS_SNAPSHOT = PORTAL_IDS_SNAPSHOT + random.sample(TRONCAL_STATIONS_OTHER_SNAPSHOT, k=min(30, len(TRONCAL_STATIONS_OTHER_SNAPSHOT)))

# Routes (Script 12)
TRONCAL_ROUTE_IDS_START_SNAPSHOT = 1 
NUM_TRONCAL_ROUTES_GEN_SNAPSHOT = 99
DUAL_ROUTE_IDS_START_SNAPSHOT = TRONCAL_ROUTE_IDS_START_SNAPSHOT + NUM_TRONCAL_ROUTES_GEN_SNAPSHOT
NUM_DUAL_ROUTES_GEN_SNAPSHOT = 5
RELEVANT_ROUTE_IDS_FOR_ARRIVALS_SNAPSHOT = list(range(TRONCAL_ROUTE_IDS_START_SNAPSHOT, TRONCAL_ROUTE_IDS_START_SNAPSHOT + NUM_TRONCAL_ROUTES_GEN_SNAPSHOT)) + \
                                          list(range(DUAL_ROUTE_IDS_START_SNAPSHOT, DUAL_ROUTE_IDS_START_SNAPSHOT + NUM_DUAL_ROUTES_GEN_SNAPSHOT))

# Vehicles (Script 10)
MAX_VEHICLE_ID_SNAPSHOT = 10563

# --- Number of Records ---
NUM_ARRIVALS_TO_GENERATE_TOTAL = random.randint(400, 700) # Total predictions in this snapshot

print(f"Generating ~{NUM_ARRIVALS_TO_GENERATE_TOTAL} realtime arrival predictions around {SIMULATION_CURRENT_DATETIME_SNAPSHOT.strftime('%Y-%m-%d %H:%M:%S')}...")

all_records_strings = []
arrival_id_counter = 0

# Distribute arrivals somewhat evenly among selected stations
arrivals_per_station_target = NUM_ARRIVALS_TO_GENERATE_TOTAL // len(STATIONS_FOR_ARRIVALS_SNAPSHOT)
if arrivals_per_station_target == 0: arrivals_per_station_target = 1


for station_id_val in STATIONS_FOR_ARRIVALS_SNAPSHOT:
    # For each station, show arrivals for a few routes
    num_routes_displaying = random.randint(2, 5)
    
    for _ in range(num_routes_displaying):
        if not RELEVANT_ROUTE_IDS_FOR_ARRIVALS_SNAPSHOT: continue
        route_id_val = random.choice(RELEVANT_ROUTE_IDS_FOR_ARRIVALS_SNAPSHOT)
        
        # For this route at this station, generate 1 or 2 upcoming predictions
        num_predictions_this_route = random.randint(1, 2)
        last_arrival_offset_minutes = 0

        for _ in range(num_predictions_this_route):
            if arrival_id_counter >= NUM_ARRIVALS_TO_GENERATE_TOTAL: # Stop if total reached
                break
            
            arrival_id_counter += 1
            arrival_id_val = arrival_id_counter

            vehicle_id_val = random.randint(1, MAX_VEHICLE_ID_SNAPSHOT)

            # Stagger arrival times for the same route
            current_offset = last_arrival_offset_minutes + random.randint(3 if last_arrival_offset_minutes == 0 else 7, 
                                                                         12 if last_arrival_offset_minutes == 0 else 20)
            estimated_arrival_time_obj = SIMULATION_CURRENT_DATETIME_SNAPSHOT + timedelta(minutes=current_offset)
            estimated_arrival_time_sql = estimated_arrival_time_obj.strftime('%Y-%m-%d %H:%M:%S')
            last_arrival_offset_minutes = current_offset

            status_val = "EXPECTED"
            actual_arrival_time_sql = "NULL"
            
            rand_status_chance = random.random()
            if rand_status_chance < 0.05 and current_offset <= 5: # Just arrived
                status_val = "ARRIVED"
                actual_arrival_obj = estimated_arrival_time_obj - timedelta(seconds=random.randint(0,90))
                actual_arrival_time_sql = f"'{actual_arrival_obj.strftime('%Y-%m-%d %H:%M:%S')}'"
            elif rand_status_chance < 0.10 and current_offset <= 10: # Delayed
                status_val = "DELAYED"
                # Update estimated_arrival_time for delayed buses
                estimated_arrival_time_obj += timedelta(minutes=random.randint(2,6))
                estimated_arrival_time_sql = estimated_arrival_time_obj.strftime('%Y-%m-%d %H:%M:%S')

            # Prediction created a few moments before the snapshot time
            created_at_sql = (SIMULATION_CURRENT_DATETIME_SNAPSHOT - timedelta(seconds=random.randint(10, 180))).strftime('%Y-%m-%d %H:%M:%S')

            all_records_strings.append(
                f"({arrival_id_val}, {station_id_val}, {route_id_val}, {vehicle_id_val}, "
                f"'{estimated_arrival_time_sql}', {actual_arrival_time_sql}, '{status_val}', '{created_at_sql}')"
            )
        if arrival_id_counter >= NUM_ARRIVALS_TO_GENERATE_TOTAL: break
    if arrival_id_counter >= NUM_ARRIVALS_TO_GENERATE_TOTAL: break


try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO realtime_arrivals (arrival_id, station_id, route_id, vehicle_id, estimated_arrival_time, actual_arrival_time, status, created_at) OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO realtime_arrivals (arrival_id, station_id, route_id, vehicle_id, estimated_arrival_time, actual_arrival_time, status, created_at) OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for realtime_arrivals generated successfully: {os.path.abspath(output_file)}")
    print(f"Total realtime_arrival records generated: {len(all_records_strings)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating ~416 realtime arrival predictions around 2024-06-04 07:30:00...
SQL script for realtime_arrivals generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/15_insert_realtime_arrivals.sql
Total realtime_arrival records generated: 210


In [26]:
# Nombre del archivo: generate_route_current_location.py
import faker
import random
import os
from datetime import datetime, timedelta

fake_co = faker.Faker('es_CO')
output_file = "16_insert_route_current_location.sql"

# --- Simulation Time (consistent with realtime_arrivals) ---
SIMULATION_CURRENT_DATETIME_SNAPSHOT = datetime(2024, 6, 4, 7, 30, 0) # Martes, Junio 4, 2024, 07:30 AM

# --- ID Ranges (from previous scripts) ---
# Routes (Script 12)
MAX_ROUTE_ID_SNAPSHOT_LOC = 558 # Total routes generated in script 12
ALL_ROUTE_IDS_SNAPSHOT_LOC = list(range(1, MAX_ROUTE_ID_SNAPSHOT_LOC + 1))

# Vehicles (Script 10)
MAX_VEHICLE_ID_SNAPSHOT_LOC = 10563

# --- Number of Records ---
NUM_VEHICLES_TO_TRACK_SNAPSHOT = random.randint(1000, 1800)

print(f"Generating ~{NUM_VEHICLES_TO_TRACK_SNAPSHOT} route current location records around {SIMULATION_CURRENT_DATETIME_SNAPSHOT.strftime('%Y-%m-%d %H:%M:%S')}...")

all_records_strings = []
location_update_id_counter = 0
# Keep track of vehicles already assigned a location in this snapshot to promote uniqueness
vehicles_with_location_snapshot = set()


for _ in range(NUM_VEHICLES_TO_TRACK_SNAPSHOT):
    location_update_id_counter += 1
    location_update_id_val = location_update_id_counter

    route_id_val = random.choice(ALL_ROUTE_IDS_SNAPSHOT_LOC)
    
    vehicle_id_val = None
    for attempt in range(10): # Try to get a unique vehicle
        temp_vid = random.randint(1, MAX_VEHICLE_ID_SNAPSHOT_LOC)
        if temp_vid not in vehicles_with_location_snapshot:
            vehicle_id_val = temp_vid
            vehicles_with_location_snapshot.add(temp_vid)
            break
    if vehicle_id_val is None: # Fallback if all attempts fail
        vehicle_id_val = random.randint(1, MAX_VEHICLE_ID_SNAPSHOT_LOC)
        
    # Plausible Lat/Long for Bogotá (general area, not specific to route path for this snapshot)
    latitude_val = round(random.uniform(4.40, 4.80), 6)
    longitude_val = round(random.uniform(-74.20, -74.00), 6)
    
    speed_val = round(random.uniform(5, 55), 2) # km/h, assuming moving
    if random.random() < 0.15: # 15% chance vehicle is currently stopped
        speed_val = 0.0
        
    timestamp_obj = SIMULATION_CURRENT_DATETIME_SNAPSHOT - timedelta(seconds=random.randint(5, 90)) # Location reported recently
    timestamp_sql = timestamp_obj.strftime('%Y-%m-%d %H:%M:%S')

    all_records_strings.append(
        f"({location_update_id_val}, {route_id_val}, {vehicle_id_val}, {latitude_val}, {longitude_val}, {speed_val}, '{timestamp_sql}')"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO route_current_location (location_update_id, route_id, vehicle_id, latitude, longitude, speed, \"timestamp\") OVERRIDING SYSTEM VALUE VALUES\n")
        batch_size = 1000
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if (i + 1) % batch_size == 0 and i < len(all_records_strings) - 1:
                file.write(";\n")
                file.write("INSERT INTO route_current_location (location_update_id, route_id, vehicle_id, latitude, longitude, speed, \"timestamp\") OVERRIDING SYSTEM VALUE VALUES\n")
            elif i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for route_current_location generated successfully: {os.path.abspath(output_file)}")
    print(f"Total route_current_location records generated: {len(all_records_strings)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating ~1567 route current location records around 2024-06-04 07:30:00...
SQL script for route_current_location generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/16_insert_route_current_location.sql
Total route_current_location records generated: 1567


In [27]:
# Nombre del archivo: generate_alerts.py
import random
import os
from datetime import datetime, timedelta

output_file = "17_insert_alerts.sql"

# --- ID Ranges (from previous scripts) ---
# Stations (Script 09)
MAX_STATION_ID_ALERTS = 7765
PORTAL_IDS_ALERTS = list(range(1, 9 + 1))
TRONCAL_STATIONS_OTHER_ALERTS = list(range(14, 142 + 1))
RELEVANT_STATION_IDS_FOR_ALERTS_PY = PORTAL_IDS_ALERTS + TRONCAL_STATIONS_OTHER_ALERTS # Keep variable name unique to script

# Routes (Script 12)
MAX_ROUTE_ID_ALERTS = 558 # Total routes from script 12
RELEVANT_ROUTE_IDS_FOR_ALERTS_PY = list(range(1, MAX_ROUTE_ID_ALERTS + 1))

print("Generating alert records contextualized around June 2024...")

alert_templates_list = [
    {"type": "STATION_ISSUE", "severity": "CRITICAL", "msg_template": "Estación {} presenta cierre temporal por novedad ajena a la operación. Rutas realizan desvíos.", "d_hr_min": 2, "d_hr_max": 5, "needs_station": True},
    {"type": "STATION_ISSUE", "severity": "WARNING", "msg_template": "Alta afluencia de usuarios en estación {}. Considere mayor tiempo de espera.", "d_hr_min": 1, "d_hr_max": 3, "needs_station": True},
    {"type": "STATION_ISSUE", "severity": "INFO", "msg_template": "Mantenimiento de ascensor en estación {}. Personal en sitio disponible.", "d_hr_min": 24, "d_hr_max": 72, "needs_station": True},
    {"type": "ROUTE_DELAY", "severity": "WARNING", "msg_template": "Ruta {} con demoras de hasta 20 min por alta congestión vehicular en sector Chapinero.", "d_hr_min": 1, "d_hr_max": 3, "needs_route": True},
    {"type": "ROUTE_INFO", "severity": "INFO", "msg_template": "Ruta {} operará con desvío por ciclovía nocturna el Jueves.", "d_hr_min": 6, "d_hr_max": 6, "needs_route": True, "future_event": True, "event_day_offset": 3}, # Assuming current day is Monday for planning
    {"type": "SYSTEM_WIDE", "severity": "INFO", "msg_template": "Recuerde personalizar y recargar su tarjeta Tullave para acceder a beneficios.", "d_hr_min": 24*10, "d_hr_max": 24*20, "is_general": True},
    {"type": "SERVICE_INFO", "severity": "INFO", "msg_template": "Servicio de cicloparqueaderos disponible. Consulte cupos en nuestra app.", "d_hr_min": 24*5, "d_hr_max": 24*15, "is_general": True},
    {"type": "SYSTEM_WIDE", "severity": "WARNING", "msg_template": "Se prevén lluvias para la tarde. Planee su viaje.", "d_hr_min": 4, "d_hr_max": 8, "is_general": True}
]

NUM_ALERTS_TO_GENERATE_PY = 30
all_records_strings = []
alert_id_counter = 0

# Contexto temporal para las alertas (alrededor de Junio 2024)
alert_context_start_date = datetime(2024, 5, 25)
alert_context_end_date = datetime(2024, 6, 15)

for i in range(NUM_ALERTS_TO_GENERATE_PY):
    alert_id_counter += 1
    alert_id_val = alert_id_counter
    template = random.choice(alert_templates_list)

    station_id_sql = "NULL"
    route_id_sql = "NULL"
    message_text = template["msg_template"]

    if template.get("needs_station") and RELEVANT_STATION_IDS_FOR_ALERTS_PY:
        s_id = random.choice(RELEVANT_STATION_IDS_FOR_ALERTS_PY)
        station_id_sql = str(s_id)
        message_text = template["msg_template"].format(f"Est.ID {s_id}")
    elif template.get("needs_route") and RELEVANT_ROUTE_IDS_FOR_ALERTS_PY:
        r_id = random.choice(RELEVANT_ROUTE_IDS_FOR_ALERTS_PY)
        route_id_sql = str(r_id)
        message_text = template["msg_template"].format(f"Ruta ID {r_id}")
    
    message_sql = message_text.replace("'", "''")
    severity_sql = template["severity"]
    alert_type_sql = template["type"]

    duration_h = random.randint(template["d_hr_min"], template["d_hr_max"])
    
    if template.get("future_event"):
        # Starts a few days from a random point in our context window
        base_day_for_future = alert_context_start_date + timedelta(days=random.randint(0,10))
        start_ts_obj = base_day_for_future + timedelta(days=template.get("event_day_offset", 1), hours=random.randint(6,10))
    else:
        # Starts at a random time within our context window
        start_ts_obj = alert_context_start_date + timedelta(
            days=random.randint(0, (alert_context_end_date - alert_context_start_date).days),
            hours=random.randint(0,23), minutes=random.randint(0,59)
        )
    start_ts_sql = start_ts_obj.strftime('%Y-%m-%d %H:%M:%S')

    end_ts_sql = "NULL"
    if random.random() < 0.7: # 70% of alerts have an end time
        end_ts_obj = start_ts_obj + timedelta(hours=duration_h)
        end_ts_sql = f"'{end_ts_obj.strftime('%Y-%m-%d %H:%M:%S')}'"
    
    all_records_strings.append(
        f"({alert_id_val}, '{message_sql}', '{severity_sql}', '{alert_type_sql}', "
        f"'{start_ts_sql}', {end_ts_sql}, {station_id_sql}, {route_id_sql})"
    )

try:
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("INSERT INTO alerts (alert_id, message, severity, alert_type, start_timestamp, end_timestamp, station_id, route_id) OVERRIDING SYSTEM VALUE VALUES\n")
        for i, record_string in enumerate(all_records_strings):
            file.write(record_string)
            if i < len(all_records_strings) - 1:
                file.write(",\n")
            else:
                file.write(";\n")
    print(f"SQL script for alerts generated successfully: {os.path.abspath(output_file)}")
    print(f"Total alert records generated: {len(all_records_strings)}")
except IOError as e:
    print(f"Error writing to file {output_file}: {e}")

Generating alert records contextualized around June 2024...
SQL script for alerts generated successfully: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/17_insert_alerts.sql
Total alert records generated: 30
