# Data Generation and Insertion for Transport Database

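This notebook replaces the old SQL insert scripts with Python code that generates synthetic data for the transport database, using the new English schema. Each cell writes batched `INSERT` statements to a numbered `.sql` file under `generated_sql_scripts/`; those files must then be executed against the database, in order, as a separate step. It uses Faker and pandas for data generation and manipulation.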

In [1]:
import faker
import random
from datetime import datetime, timedelta
import os

# Seed both random sources so that Restart Kernel -> Run All regenerates the
# same SQL scripts. Set SEED to None to get different data on every run.
SEED = 42
if SEED is not None:
    random.seed(SEED)       # stdlib RNG: record counts, choices, addresses
    faker.Faker.seed(SEED)  # RNG shared by all Faker instances

# Colombian-Spanish Faker (keeps proper nouns in Spanish): primary source for
# names, cities, phone numbers and e-mail addresses.
fake_co = faker.Faker('es_CO')
# Default-locale Faker: used for a minority of users to get more
# international-looking names. Other locales can be added the same way,
# e.g. fake_en = faker.Faker('en_US').
fake_generic = faker.Faker()


def generate_bogota_address():
    """Generate a plausible Bogotá-style street address.

    The address has the shape
    ``<street type> <street number> <cross number>[letter] # <n>-<m>[ complement]``,
    for example ``Carrera 45 12B # 34-7 Apartamento 301``.

    * The letter suffix ("A", "B", "C" or "Bis") is only considered for about
      half of the addresses, and may still come out empty.
    * The complement (interior / apartment / office) is only considered for
      about 30% of the addresses, and may still come out empty.

    Returns:
        str: The generated address. It contains no quote characters, but
        callers still escape it before embedding it in SQL.
    """
    street_type = random.choice(["Calle", "Carrera", "Avenida", "Transversal", "Diagonal"])
    street_number = random.randint(1, 200)

    # Cross-street number with an optional letter suffix.
    cross_number = random.randint(1, 150)
    cross_letter = random.choice(["", "A", "B", "C", "Bis"]) if random.random() > 0.5 else ""

    # House "plate": two numbers separated by a dash.
    plate_first = random.randint(1, 99)
    plate_second = random.randint(1, 50)

    # Optional complement. Every non-empty option starts with a space so it can
    # be appended directly to the address.
    complement = ""
    if random.random() > 0.7:
        complement = random.choice([
            "",
            f" Interior {random.randint(1, 10)}",
            f" Apartamento {random.randint(100, 1000)}",
            f" Oficina {random.randint(10, 50)}",
        ])

    address_detail = f"{cross_number}{cross_letter} # {plate_first}-{plate_second}"

    # The complement used to be computed and then dropped; it is now part of
    # the returned address.
    return f"{street_type} {street_number} {address_detail}{complement}"


# --- Configuration ---
# All generated SQL scripts are written to this folder (relative to the
# notebook's working directory). The numeric prefixes give the order in which
# the scripts are meant to be loaded.
output_folder = "generated_sql_scripts"
users_output_file = os.path.join(output_folder, "1_insert_users.sql")
cards_output_file = os.path.join(output_folder, "2_insert_cards.sql")

# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

# Number of records - drawn from a range so that repeated runs differ slightly.
# Roughly 25,000 users.
# NOTE(review): the previous comment said "Target ~2.5M users"; the range below
# is 100x smaller - confirm the scale-down is intentional.
num_users = random.randint(24500, 25500)

# Roughly 23,000-24,000 cards in total, of which at least 20,000 should end up
# active. This is fewer cards than users, so some users receive no card.
# NOTE(review): the previous comment said "Target ~2M active cards" - same
# 100x mismatch as above.
num_total_cards = random.randint(23000, 24000)
min_active_cards_target = 20000


# --- Lists for Data Generation ---
genders_list = ['M', 'F', 'O'] # O for Other

# --- Generate Users ---
print(f"Generating {num_users} users...")
user_ids_generated = [] # IDs written to the users script; the card step assigns cards to them

# Statement header, repeated at the start of every batch.
users_insert_header = "INSERT INTO users (user_id, first_name, last_name, contact_number, email, gender, date_of_birth, residential_address, id_number, city_of_birth, registration_date) VALUES\n"

with open(users_output_file, 'w', encoding='utf-8') as file:
    file.write(users_insert_header)

    batch_size = 1000 # rows per INSERT statement; a new statement starts after each full batch
    for i in range(num_users):
        user_id = i + 1 # sequential IDs starting at 1
        user_ids_generated.append(user_id)

        # About 85% of users get Colombian-style names; the rest come from the
        # default-locale Faker for a wider mix.
        name_faker = fake_co if random.random() < 0.85 else fake_generic
        first_name = name_faker.first_name().replace("'", "''")
        last_name = name_faker.last_name().replace("'", "''")

        phone = fake_co.phone_number() # Colombian format
        email = fake_co.unique.email() # unique across this run
        gender = random.choice(genders_list)
        birth_date = fake_co.date_of_birth(minimum_age=16, maximum_age=85).strftime('%Y-%m-%d')
        address = generate_bogota_address().replace("'", "''")
        # Unique 10-digit national ID number (cédula)
        national_id = str(fake_co.unique.random_number(digits=10, fix_len=True))
        birth_city = fake_co.city().replace("'", "''")
        registered_on = fake_co.date_between(start_date='-10y', end_date='today').strftime('%Y-%m-%d')

        # Every column except user_id is emitted as a single-quoted SQL literal.
        quoted_columns = ", ".join(
            f"'{value}'"
            for value in (
                first_name, last_name, phone, email, gender,
                birth_date, address, national_id, birth_city, registered_on,
            )
        )
        file.write(f"({user_id}, {quoted_columns})")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_users - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(users_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for users generated: {os.path.abspath(users_output_file)}")


# --- Generate Cards ---
print(f"Generating {num_total_cards} cards (aiming for at least {min_active_cards_target} active)...")
active_cards_count = 0
card_statuses = ['active', 'inactive', 'blocked', 'lost']

# Rows per INSERT statement. Defined here as well so this step does not depend
# on the users loop having left `batch_size` behind.
batch_size = 1000
cards_insert_header = "INSERT INTO cards (card_id, card_number, user_id, acquisition_date, status, balance, last_used_date, update_date) VALUES\n"

# Users that do not own a card yet, in random order. Popping from a
# pre-shuffled list picks a random card-less user in O(1); the previous code
# rebuilt the whole list of card-less users for every single card.
users_without_card = list(user_ids_generated)
random.shuffle(users_without_card)

with open(cards_output_file, 'w', encoding='utf-8') as file:
    file.write(cards_insert_header)

    assigned_users_for_cards = set() # users that received a card through the prioritised path

    for i in range(num_total_cards):
        card_id = i + 1 # sequential IDs starting at 1

        # Unique 16-digit card number
        card_number_val = str(fake_co.unique.random_number(digits=16, fix_len=True))

        if not user_ids_generated:
            # No users at all (should not happen with num_users > 0)
            user_id_val = "NULL"
        elif users_without_card and active_cards_count < min_active_cards_target:
            # Until the active-card target is met, give cards to users who
            # have none yet.
            user_id_val = users_without_card.pop()
            assigned_users_for_cards.add(user_id_val)
        else:
            # Target met, or every user already has a card: assign at random,
            # so some users end up with several cards.
            user_id_val = random.choice(user_ids_generated)

        # NOTE(review): the acquisition date is drawn independently of the
        # owner's registration date, so a card can predate its user's
        # registration.
        acquisition_date_obj = fake_co.date_between(start_date='-8y', end_date='today')
        acquisition_date_val = acquisition_date_obj.strftime('%Y-%m-%d')

        if active_cards_count < min_active_cards_target:
            # Strong bias towards 'active' until the target is met
            status_weights = [0.9, 0.05, 0.03, 0.02]
        else:
            # Regular distribution afterwards
            status_weights = [0.7, 0.15, 0.1, 0.05]
        status_val = random.choices(card_statuses, weights=status_weights, k=1)[0]

        if status_val == 'active':
            active_cards_count += 1

        balance_val = 0.0
        if status_val == 'active' and random.random() < 0.8: # 80% of active cards hold a balance
            balance_val = round(random.uniform(1000, 50000) / 50) * 50 # multiples of 50 COP

        last_used_date_val = "NULL"
        if status_val == 'active' and random.random() < 0.9: # 90% of active cards have been used
            last_used_datetime_obj = fake_co.date_time_between(start_date=acquisition_date_obj, end_date='now', tzinfo=None)
            last_used_date_val = f"'{last_used_datetime_obj.strftime('%Y-%m-%d %H:%M:%S')}'"

        update_date_obj = fake_co.date_between(start_date=acquisition_date_obj, end_date='today')
        update_date_val = update_date_obj.strftime('%Y-%m-%d')

        file.write(f"({card_id}, '{card_number_val}', {user_id_val}, '{acquisition_date_val}', '{status_val}', {balance_val}, {last_used_date_val}, '{update_date_val}')")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_total_cards - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(cards_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for cards generated: {os.path.abspath(cards_output_file)}")
print(f"Total active cards generated: {active_cards_count}")

Generating 25166 users...
SQL script for users generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/1_insert_users.sql
Generating 23202 cards (aiming for at least 20000 active)...
SQL script for cards generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/2_insert_cards.sql
Total active cards generated: 20643


In [2]:
import faker
import random
from datetime import datetime
import os
import unicodedata

# Initialize Faker for Colombian Spanish (re-created so this cell can run on its own)
fake_co = faker.Faker('es_CO')

# --- Configuration ---
output_folder = "generated_sql_scripts" # Same folder as the users/cards scripts
locations_output_file = os.path.join(output_folder, "3_insert_locations.sql")
recharge_points_output_file = os.path.join(output_folder, "4_insert_recharge_points.sql")

# Ensure output directory exists (a no-op if an earlier cell already created it)
os.makedirs(output_folder, exist_ok=True)

# Number of records
# The "[cite: N]" markers refer to passages of the source document the figures were taken from.
num_locations = 13 # Based on "13 Zonas de Operación" [cite: 28]
# Based on "4,864 puntos de recarga externos" [cite: 28]; the count is drawn from a range around that figure
num_recharge_points = random.randint(4800, 4900)

# Function to generate a more plausible Bogota-style address (can be shared across scripts)
def generate_bogota_address_simple():
    """Build a simplified Bogotá-style address, e.g. ``Calle 120 45 # 12-30``.

    Unlike the fuller address generator, this variant has no letter suffix and
    no interior/apartment complement.

    Returns:
        str: ``<street type> <street number> <cross number> # <n>-<m>``.
    """
    street_kinds = ("Calle", "Carrera", "Avenida", "Transversal", "Diagonal")
    kind = random.choice(street_kinds)
    # Drawn in order: street number, cross number, then the two plate numbers.
    main_no, cross_no, plate_a, plate_b = (
        random.randint(1, upper) for upper in (200, 150, 99, 50)
    )
    return "{} {} {} # {}-{}".format(kind, main_no, cross_no, plate_a, plate_b)

def slugify(text):
    """Turn ``text`` into a simple ASCII slug.

    Accents are stripped (NFKD decomposition, non-ASCII dropped), the result is
    lower-cased, spaces become underscores, and periods and commas are removed.
    Other punctuation is left untouched.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    # One pass: space -> underscore, '.' and ',' deleted.
    replacements = str.maketrans({' ': '_', '.': None, ',': None})
    return ascii_only.lower().translate(replacements)


# --- Generate Locations (Zonas de Operación) ---
print(f"Generating {num_locations} locations (operational zones)...")

# Zone names. The source document ([cite: 25]) mentions zones such as Tunal,
# Sur, Américas, Calle 80, Norte, Suba, Usme, Engativá, San Cristóbal, Usaquén,
# Fontibón, Kennedy and Ciudad Bolívar; the list below uses 13 well-known
# Bogotá localities instead.
# NOTE(review): confirm that localities (rather than the operational zone names
# above) are the intended values.
location_names_base = [
    "Usaquén", "Chapinero", "Santa Fe", "San Cristóbal", "Usme", "Tunjuelito",
    "Bosa", "Kennedy", "Fontibón", "Engativá", "Suba", "Barrios Unidos", "Teusaquillo"
    # "Puente Aranda", "Los Mártires", "Antonio Nariño", "Ciudad Bolívar", "Sumapaz" # Other localities
]
# Pad with generic names or down-sample so that exactly num_locations names are available.
if len(location_names_base) < num_locations:
    location_names_base.extend([f"Zona Operativa {i+1}" for i in range(num_locations - len(location_names_base))])
elif len(location_names_base) > num_locations:
    location_names_base = random.sample(location_names_base, num_locations)


location_ids_generated = [] # consumed by the recharge-point step below
locations_insert_header = "INSERT INTO locations (location_id, name, description) VALUES\n"

with open(locations_output_file, 'w', encoding='utf-8') as file:
    file.write(locations_insert_header)
    batch_size = 1000 # rows per INSERT statement

    for i in range(num_locations):
        location_id = i + 1 # sequential IDs starting at 1
        location_ids_generated.append(location_id)

        raw_name = location_names_base[i]
        name_val = raw_name.replace("'", "''")
        # Build the description from the raw name and escape it once. Building
        # it from the already-escaped name would double every apostrophe twice.
        description_val = f"Zona de operación {raw_name} en Bogotá.".replace("'", "''")

        file.write(f"({location_id}, '{name_val}', '{description_val}')")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_locations - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(locations_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for locations generated: {os.path.abspath(locations_output_file)}")


# --- Generate Recharge Points ---
print(f"Generating {num_recharge_points} recharge points...")

recharge_point_operators = ["PuntoRed", "SuRed", "MoviiRed", "PagaTodo", "Station Kiosk", "Online Platform"]
# Operators a physical point may be assigned. "Online Platform" is reserved for
# the online entries created below, so a shop is never labelled as online.
physical_point_operators = [op for op in recharge_point_operators if op != "Online Platform"]
recharge_points_insert_header = "INSERT INTO recharge_points (recharge_point_id, name, address, latitude, longitude, location_id, operator) VALUES\n"

with open(recharge_points_output_file, 'w', encoding='utf-8') as file:
    file.write(recharge_points_insert_header)
    batch_size = 1000 # rows per INSERT statement

    for i in range(num_recharge_points):
        recharge_point_id = i + 1 # sequential IDs starting at 1

        # Plausible name: "<business type> <first word of a company name> <last word of a street name>"
        point_type = random.choice(["Tienda", "Papelería", "Droguería", "Miscelánea", "Kiosko Estación", "Plataforma Web"])
        base_name_for_point = fake_co.company().split(' ')[0].replace(',', '') + " " + fake_co.street_name().split(' ')[-1]
        name_val = f"{point_type} {base_name_for_point}".replace("'", "''")

        if point_type == "Plataforma Web":
            # Online platform: fixed name, no physical address or coordinates.
            # NOTE(review): roughly one point in six takes this branch, so many
            # rows share this exact name; the recharge step instead assumes a
            # single online point - confirm which is intended.
            name_val = "Plataforma de Recarga Online TransMilenio"
            address_val = "NULL"
            latitude_val = "NULL"
            longitude_val = "NULL"
            operator_val = "Online Platform"
        else:
            # Escape first, then wrap in quotes. Reusing the enclosing quote
            # character inside an f-string expression is only valid from
            # Python 3.12 on, so the call is kept outside the f-string.
            street_address = generate_bogota_address_simple().replace("'", "''")
            address_val = f"'{street_address}'"
            # Approximate bounding box of Bogotá: lat 4.4..4.8, lon -74.2..-74.0
            latitude_val = round(random.uniform(4.400000, 4.800000), 6)
            longitude_val = round(random.uniform(-74.200000, -74.000000), 6)
            operator_val = random.choice(physical_point_operators)

        location_id_val = random.choice(location_ids_generated) if location_ids_generated else "NULL"

        file.write(f"({recharge_point_id}, '{name_val}', {address_val}, {latitude_val}, {longitude_val}, {location_id_val}, '{operator_val}')")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_recharge_points - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(recharge_points_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for recharge points generated: {os.path.abspath(recharge_points_output_file)}")

Generating 13 locations (operational zones)...
SQL script for locations generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/3_insert_locations.sql
Generating 4834 recharge points...
SQL script for recharge points generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/4_insert_recharge_points.sql


In [4]:
import faker
import random
from datetime import datetime, timedelta
import os
import uuid

# Initialize Faker for Colombian Spanish (re-created so this cell can run on its own)
fake_co = faker.Faker('es_CO')

# --- Configuration ---
output_folder = "generated_sql_scripts" # Same folder as the earlier scripts
recharges_output_file = os.path.join(output_folder, "5_insert_recharges.sql")

# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

# Highest IDs that exist in the cards / recharge_points scripts.
# The real counts are drawn at random by the earlier cells (23000-24000 cards,
# 4800-4900 recharge points). Hard-coding the upper ends of those ranges makes
# recharges reference rows that were never generated, so use the actual counts
# when the earlier cells have run in this kernel. Otherwise fall back to the
# LOWER end of each range, because those IDs exist whatever count was drawn.
max_card_id = globals().get("num_total_cards", 23000)
# Cards with an ID up to this value are treated as "active" and get most of the
# recharges. This is an approximation: card status is assigned at random and is
# not tied to the card ID.
assumed_active_card_threshold_id = min(20000, max_card_id)

max_recharge_point_id = globals().get("num_recharge_points", 4800)
# The generation loop treats the last recharge point ID as the "Online
# Platform" placeholder; every other ID is treated as a physical point.

# Number of recharge records (about 65k-80k).
# NOTE(review): the previous comment mentioned "7 million recharges" as the
# real-world estimate; the range below is about 100x smaller.
num_recharges = random.randint(65000, 80000)

# Common recharge amounts in COP
recharge_amounts_cop = [
    3000, 3200, 3950, 7000, 10000, 11800, 15000, 20000, 23600, 30000, 70000, 50000, 100000
]
# Relative weights, positionally paired with recharge_amounts_cop.
# NOTE(review): the original comment said 10000 and 20000 are very common, but
# as paired here they get weight 5 while 7000, 11800 and 23600 get 20-25.
# Confirm the intended pairing before relying on this distribution.
recharge_amounts_weights = [
    10, 5, 10, 20, 5, 25, 10, 5, 20, 5, 3, 10, 2
]


# --- Generate Recharges ---
print(f"Generating {num_recharges} recharge records...")

recharges_insert_header = "INSERT INTO recharges (recharge_id, card_id, recharge_point_id, amount, recharge_timestamp, transaction_id) VALUES\n"
# Single reference "now" for every simulated acquisition date in this run.
generation_time = datetime.now()

with open(recharges_output_file, 'w', encoding='utf-8') as file:
    file.write(recharges_insert_header)

    batch_size = 1000 # rows per INSERT statement

    for i in range(num_recharges):
        recharge_id = i + 1 # sequential IDs starting at 1

        # Card: 85% of recharges go to the presumed-active ID range.
        if random.random() < 0.85 and assumed_active_card_threshold_id > 0:
            card_id_val = random.randint(1, assumed_active_card_threshold_id)
        else:
            card_id_val = random.randint(1, max_card_id)

        # Recharge point: 5% go to the "Online Platform", assumed to be the
        # last ID (a placeholder convention); the rest go to any other point.
        if random.random() < 0.05:
            recharge_point_id_val = max_recharge_point_id
        else:
            recharge_point_id_val = random.randint(1, max_recharge_point_id - 1 if max_recharge_point_id > 1 else 1)

        amount_val = random.choices(recharge_amounts_cop, weights=recharge_amounts_weights, k=1)[0]

        # A recharge must not predate the card. Real acquisition dates are not
        # available here, so one is simulated: lower card IDs are treated as
        # older (window of up to 8 years), the highest IDs as newest (1 year).
        # The previous formula had this inverted and gave the lowest IDs the
        # shortest window.
        # NOTE(review): the cards script draws acquisition dates independently
        # of the card ID, so a recharge can still predate the stored date.
        card_age_fraction = 1 - (card_id_val - 1) / max_card_id
        years_ago_for_card = min(8, max(1, int(8 * card_age_fraction)))

        # The offset is always 30..2920 days, so the start date is strictly in
        # the past. The former bare `except:` fallback and "at least 30 days
        # old" check could never trigger and only hid real errors.
        simulated_acquisition_date = generation_time - timedelta(days=random.randint(30, years_ago_for_card * 365))
        recharge_datetime_obj = fake_co.date_time_between(start_date=simulated_acquisition_date, end_date='now', tzinfo=None)
        recharge_timestamp_val = recharge_datetime_obj.strftime('%Y-%m-%d %H:%M:%S')

        transaction_id_val = str(uuid.uuid4())

        file.write(f"({recharge_id}, {card_id_val}, {recharge_point_id_val}, {amount_val}, '{recharge_timestamp_val}', '{transaction_id_val}')")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_recharges - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(recharges_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for recharges generated: {os.path.abspath(recharges_output_file)}")

Generating 71783 recharge records...
SQL script for recharges generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/5_insert_recharges.sql


In [5]:
import faker
import random
import os
import unicodedata

# Initialize Faker for Colombian Spanish (re-created so this cell can run on its own)
fake_co = faker.Faker('es_CO')

# --- Configuration ---
# Scripts 6 and 7 go to the same folder as the earlier generated scripts.
output_folder = "generated_sql_scripts"
concessionaires_output_file = os.path.join(output_folder, "6_insert_concessionaires.sql")
depots_output_file = os.path.join(output_folder, "7_insert_depots.sql")

# Create the folder if an earlier cell has not already done so
os.makedirs(output_folder, exist_ok=True)

# Highest location_id written to 3_insert_locations.sql. Hard-coded: the
# locations cell uses a fixed num_locations of 13, so this must be kept in sync by hand.
max_location_id = 13

# Helper function (if not already defined in a shared utility)
def generate_bogota_address_simple():
    """Return a simplified Bogotá-style address such as ``Carrera 15 93 # 40-12``.

    Same helper as the one defined for the recharge points; it is repeated here
    so that this cell can run on its own.
    """
    street_kind = random.choice(["Calle", "Carrera", "Avenida", "Transversal", "Diagonal"])
    # Drawn in order: street number, cross number, then the two plate numbers.
    numbers = [random.randint(1, upper) for upper in (200, 150, 99, 50)]
    return "%s %d %d # %d-%d" % (street_kind, *numbers)

# --- Generate Concessionaires ---
print("Generating concessionaire records...")

# Concessionaires and the service types each one operates, consolidated from
# the source document (table on page 7 [cite: 25], summary [cite: 24]).
# The document states that the system has 27 operating concessionaires. The
# table yields 23 distinct names; a cable operator and three generic placeholder
# companies (last three entries) are added to reach 27.
concessionaire_data = [
    {"name": "Bogotá Móvil Operación Sur BMO SUR S.A.S", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "Connexion Móvil S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "Capitalbus S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "SI18 Calle 80 S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # Marked for both Troncal and UCE in the table
    {"name": "SI18 Norte S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "SI18 Suba S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False}, # Marked for Troncal
    {"name": "Somos Bogotá Usme S.A.S.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "Gmovil S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"name": "Consorcio Express S.A.S.", "troncal": True, "zonal_uce": True, "zonal_alimentacion": True, "cable": False}, # Operates in multiple zones/types
    {"name": "Este Es Mi Bus S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"name": "ETIB S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"name": "Masivo Capital S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False}, # Operates in multiple zones/types
    {"name": "Organización Suma S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"name": "E-Somos Fontibón S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group
    {"name": "Mueve Fontibón S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group
    {"name": "ZMO Fontibón III S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group
    {"name": "ZMO Fontibón V S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group (name typo "Fontíbón" fixed)
    {"name": "Emasivo 10 S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group (Suba Centro UF 10)
    {"name": "Emasivo 16 S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # From Gran Americas Fontibon I group (Suba Centro UF 16)
    {"name": "Operadora Distrital de Transporte La Rolita", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
    {"name": "E-Somos Alimentación S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": True, "cable": False},
    {"name": "Gran Américas Usme S.A.S", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # UCE assumed from context (not specified in the table)
    {"name": "Mueve Usme S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False}, # UCE assumed
    # Explicit cable operator
    {"name": "Cable Movil de Bogota S.A.S.", "troncal": False, "zonal_uce": False, "zonal_alimentacion": False, "cable": True},
    # Generic placeholders added to reach the documented total of 27
    {"name": "Transportes Urbanos Integrados S.A.S.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": True, "cable": False},
    {"name": "Movilidad Estratégica del Oriente S.A.", "troncal": True, "zonal_uce": False, "zonal_alimentacion": False, "cable": False},
    {"name": "Conexión Capital S.P.A.", "troncal": False, "zonal_uce": True, "zonal_alimentacion": False, "cable": False},
]
num_concessionaires_to_generate = len(concessionaire_data)
concessionaire_ids_generated = [] # consumed by the depot step below

concessionaires_insert_header = "INSERT INTO concessionaires (concessionaire_id, name, operates_troncal, operates_zonal_uce, operates_zonal_alimentacion, operates_cable) VALUES\n"

with open(concessionaires_output_file, 'w', encoding='utf-8') as file:
    file.write(concessionaires_insert_header)
    batch_size = 1000 # rows per INSERT statement

    for i, data in enumerate(concessionaire_data):
        concessionaire_id = i + 1 # sequential IDs starting at 1
        concessionaire_ids_generated.append(concessionaire_id)

        name_val = data["name"].replace("'", "''")
        # Write explicit SQL boolean literals rather than relying on Python's
        # repr of True/False happening to be accepted by the database.
        troncal_val, zonal_uce_val, zonal_alim_val, cable_val = (
            "TRUE" if data[key] else "FALSE"
            for key in ("troncal", "zonal_uce", "zonal_alimentacion", "cable")
        )

        file.write(f"({concessionaire_id}, '{name_val}', {troncal_val}, {zonal_uce_val}, {zonal_alim_val}, {cable_val})")

        # Row terminator: ';' closes the last row, ';' plus a fresh header
        # closes a full batch, ',' continues the current statement.
        if i == num_concessionaires_to_generate - 1:
            file.write(";\n")
        elif (i + 1) % batch_size == 0:
            file.write(";\n")
            file.write(concessionaires_insert_header)
        else:
            file.write(",\n")

print(f"SQL script for concessionaires generated: {os.path.abspath(concessionaires_output_file)}")

# --- Generate Depots ---
print("Generating depot records...")

# One entry per depot type: how many depots to create and the name prefix to use.
depot_specs = [
    {"type": "TALLER", "count": 13, "name_prefix": "Patio Taller Principal"},
    {"type": "TRANSITORIO", "count": 32, "name_prefix": "Patio Transitorio"},
    {"type": "ELECTRICO", "count": 9, "name_prefix": "ElectroPatio"},
    {"type": "BAJAS_EMISIONES", "count": 4, "name_prefix": "Patio Eco"}
]
total_depots_to_generate = sum(spec["count"] for spec in depot_specs)
depot_id_counter = 0

# Vehicle-capacity range per depot type; types not listed use the general range.
default_capacity_range = (50, 300)
capacity_range_by_type = {
    "TALLER": (150, 500),
    "ELECTRICO": (80, 250),
}

depots_insert_header = "INSERT INTO depots (depot_id, name, address, depot_type, capacity_vehicles, location_id, concessionaire_id) VALUES\n"

with open(depots_output_file, 'w', encoding='utf-8') as file:
    file.write(depots_insert_header)
    batch_size = 1000 # rows per INSERT statement
    first_entry_in_batch = True

    for spec in depot_specs:
        for i in range(spec["count"]):
            depot_id_counter += 1
            depot_id = depot_id_counter # IDs run sequentially across all depot types

            # Name: "<prefix> <Faker city suffix> <index within this depot type>"
            zone_name_part = fake_co.city_suffix()
            name_val = f"{spec['name_prefix']} {zone_name_part} {i+1}".replace("'", "''")

            address_val = generate_bogota_address_simple().replace("'", "''")
            depot_type_val = spec["type"]
            capacity_low, capacity_high = capacity_range_by_type.get(depot_type_val, default_capacity_range)
            capacity_vehicles_val = random.randint(capacity_low, capacity_high)

            location_id_val = random.randint(1, max_location_id) if max_location_id > 0 else "NULL"

            # About 70% of depots are tied to a random concessionaire; the rest are left NULL.
            concessionaire_id_val = random.choice(concessionaire_ids_generated) if concessionaire_ids_generated and random.random() > 0.3 else "NULL"

            # Rows inside one statement are comma-separated.
            if not first_entry_in_batch:
                file.write(",\n")
            else:
                first_entry_in_batch = False

            file.write(f"({depot_id}, '{name_val}', '{address_val}', '{depot_type_val}', {capacity_vehicles_val}, {location_id_val}, {concessionaire_id_val})")

            # A full batch with more rows still to come: close the statement and start a new one.
            if depot_id_counter % batch_size == 0 and depot_id_counter < total_depots_to_generate:
                file.write(";\n")
                file.write(depots_insert_header)
                first_entry_in_batch = True

    if depot_id_counter > 0:
        # Always terminate the last statement. The previous check skipped the
        # semicolon when the total was an exact multiple of batch_size.
        file.write(";\n")
    else:
        # Nothing generated: leave an empty file rather than a dangling INSERT header.
        file.seek(0)
        file.truncate()


print(f"SQL script for depots generated: {os.path.abspath(depots_output_file)}")

Generating concessionaire records...
SQL script for concessionaires generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/6_insert_concessionaires.sql
Generating depot records...
SQL script for depots generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/7_insert_depots.sql


In [6]:
import faker
import random
import os
import unicodedata

# --- Configuration ---
# Folder (relative to the notebook's working directory) that collects the
# generated SQL scripts; it is created on demand.
output_folder = "generated_sql_scripts"
os.makedirs(output_folder, exist_ok=True)
stations_output_file = os.path.join(output_folder, "8_insert_stations.sql")

# Upper bound for the location_id values assigned to stations. The figure assumes
# that 3_insert_locations.sql created 13 locations -- keep both in sync.
max_location_id = 13

# Faker generator using the Colombian Spanish locale.
fake_co = faker.Faker('es_CO')

# --- Network size figures taken from the source PDF ---
NUM_PORTALS = 9
NUM_CABLE_STATIONS = 4
# Trunk-network total; this figure already counts the portals and the cable stations.
TOTAL_STATIONS_INC_CABLE = 142
# Stations left once portals and cable stations are removed (142 - 9 - 4 = 129).
NUM_TRONCAL_STATIONS_OTHER = TOTAL_STATIONS_INC_CABLE - (NUM_PORTALS + NUM_CABLE_STATIONS)
NUM_ZONAL_PARADEROS = 7623

# Cycle-parking figures from the same PDF.
NUM_STATIONS_WITH_CYCLE_PARKING = 27
TOTAL_CYCLE_PARKING_SPOTS = 7351

# Portal names as listed in the PDF (pages 16-17).
portal_names_pdf = [
    "Portal Américas",
    "Cabecera Autopista Norte",
    "Portal Suba",
    "Cabecera Calle 80",
    "Portal Sur - JFK Cooperativa Financiera",
    "Portal Eldorado",
    "Portal Tunal",
    "Portal 20 de Julio",
    "Cabecera Usme",
]
# Force the list to hold exactly NUM_PORTALS names: drop any surplus, then pad a
# short list with placeholders "Portal Principal A", "Portal Principal B", ...
portal_names_pdf = portal_names_pdf[:NUM_PORTALS]
portal_names_pdf += [
    f"Portal Principal {chr(65 + pad_index)}"
    for pad_index in range(NUM_PORTALS - len(portal_names_pdf))
]


# Troncal station names quoted from the PDF (includes the Soacha stations).
troncal_station_names_pdf = [
    "SAN MATEO - C.C. UNISUR",
    "Calle 100 - Marketmedios",
    "Banderas",
    "Avenida Jiménez Centro",
    "Toberín - Foundever",
    "Calle 76 - San Felipe",
    "TERREROS",
    "Calle 57 - Tecnoparque Sena",
    "Alcalá - Colegio Virgen del Pilar",
    "Calle 45 - American School Way",
    "León XIII",
    "Despensa",
    "Bosa Estación",
]
# Generic name pools used to pad the troncal list up to the required count.
generic_troncal_street_names = ["Calle %d" % street_no for street_no in range(10, 200, 5)]
generic_troncal_av_names = [
    "Avenida " + avenue
    for avenue in ("Chile", "Caracas", "NQS Central", "Ciudad de Cali", "Boyacá", "El Dorado")
]
generic_troncal_landmarks = [
    "Universidades - CityU",
    "Museo Nacional",
    "CAD",
    "Paloquemao",
    "Ricaurte",
    "Sabana",
    "Profamilia",
    "Marly",
    "Flores",
]

# TransMiCable station names, one per cable station.
cable_station_names = [
    "Portal Tunal - Cable",
    "Juan Pablo II - Cable",
    "Manitas - Cable",
    "Mirador del Paraíso - Cable",
]

# Helper function
def generate_bogota_address_simple():
    """Return a random Bogotá-style street address.

    The result has the form ``"<street type> <number> <number> # <number>-<number>"``,
    e.g. ``"Carrera 45 12 # 30-7"``. Every component is drawn from the
    module-level ``random`` generator.
    """
    street_kinds = ["Calle", "Carrera", "Avenida", "Transversal", "Diagonal"]
    # The draws are made in this fixed order so a seeded generator yields the
    # same address as before.
    kind = random.choice(street_kinds)
    main_no = random.randint(1, 200)
    cross_no = random.randint(1, 150)
    plate_major = random.randint(1, 99)
    plate_minor = random.randint(1, 50)
    return " ".join([kind, str(main_no), f"{cross_no} # {plate_major}-{plate_minor}"])

def generate_station_code(station_type, counter, zone_prefix=None):
    """Build a short station code for the given station type.

    Args:
        station_type: "PORTAL", "CABLE", "ZONAL_PARADERO", or any string that
            starts with "TRONCAL".
        counter: Integer sequence number embedded in the code.
        zone_prefix: Optional leading part of a ZONAL_PARADERO code. When it is
            omitted (or empty) a random three-digit number is used instead.

    Returns:
        A string shaped by ``station_type``:
          * PORTAL          -> "P" + counter zero-padded to at least 2 digits
          * CABLE           -> "TC" + counter zero-padded to at least 2 digits
          * TRONCAL*        -> random letter A-K + counter padded to at least 2 digits
          * ZONAL_PARADERO  -> prefix + random letter A-Z + (counter % 100) as 2 digits
          * anything else   -> "UNK" + counter zero-padded to at least 4 digits

    The TRONCAL and ZONAL_PARADERO forms contain random parts and the paradero
    form only keeps ``counter % 100``, so codes are NOT guaranteed to be unique;
    callers have to check for collisions themselves.
    """
    if station_type == "PORTAL":
        return f"P{str(counter).zfill(2)}"
    if station_type == "CABLE":
        return f"TC{str(counter).zfill(2)}"
    if station_type.startswith("TRONCAL"):
        # A letter prefix stands for a conceptual trunk line (A-K). This is a
        # simplification; real codes are more complex.
        line_letter = chr(65 + random.randint(0, 10))
        return f"{line_letter}{str(counter).zfill(2)}"
    if station_type == "ZONAL_PARADERO":
        # Mimics codes such as "206A03": zone prefix, one letter, two digits.
        prefix = zone_prefix if zone_prefix else str(random.randint(100, 999))
        letter = chr(65 + random.randint(0, 25))
        # Modulo 100 keeps the numeric tail at exactly two digits.
        return f"{prefix}{letter}{str(counter % 100).zfill(2)}"
    return f"UNK{str(counter).zfill(4)}"

# --- Generate Stations ---
print("Generating station records...")
station_id_counter = 0  # Last station_id handed out; shared by every station category below
all_station_records = []  # One dict per station, appended category by category
station_codes_generated = set() # To ensure uniqueness

# Stations chosen to offer cycle parking, stored as {"id": station_id, "type": station_type}.
# Portals and some cable/troncal stations are added here; spot counts are assigned later.
station_ids_for_cycle_parking = []

# --- 1. Portals ---
portal_records = []
for i in range(NUM_PORTALS):
    station_id_counter += 1
    raw_name = portal_names_pdf[i]
    # Escape single quotes exactly once for use inside a SQL string literal.
    name = raw_name.replace("'", "''")

    # A portal code depends only on the index, so regenerating it can never
    # resolve a clash; report the problem instead of looping forever.
    code = generate_station_code("PORTAL", i + 1)
    if code in station_codes_generated:
        raise ValueError(f"Duplicate portal station code generated: {code}")
    station_codes_generated.add(code)

    # Portals are major hubs: every portal gets cycle parking in this model.
    has_parking = True
    station_ids_for_cycle_parking.append({"id": station_id_counter, "type": "PORTAL"})

    record = {
        "station_id": station_id_counter,
        "name": name,
        "station_code": code,
        "station_type": "PORTAL",
        # Built from the unescaped name so that quotes are not escaped twice.
        "address": f"Portal {raw_name}".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else "NULL",
        "latitude": round(random.uniform(4.45, 4.75), 6), # Spread out portals
        "longitude": round(random.uniform(-74.18, -74.02), 6),
        "has_cycle_parking": has_parking,
        "cycle_parking_spots": 0, # Will be assigned later
        "is_active": True
    }
    portal_records.append(record)
all_station_records.extend(portal_records)

# --- 2. Cable Stations ---
cable_records = []
for i in range(NUM_CABLE_STATIONS):
    station_id_counter += 1
    raw_name = cable_station_names[i]
    # Escape single quotes exactly once for use inside a SQL string literal.
    name = raw_name.replace("'", "''")

    # A cable code depends only on the index, so regenerating it can never
    # resolve a clash; report the problem instead of looping forever.
    code = generate_station_code("CABLE", i + 1)
    if code in station_codes_generated:
        raise ValueError(f"Duplicate cable station code generated: {code}")
    station_codes_generated.add(code)

    # Roughly half of the cable stations get cycle parking, but only while the
    # overall quota of stations with parking has not been reached.
    has_parking = random.random() < 0.5
    if has_parking and len(station_ids_for_cycle_parking) < NUM_STATIONS_WITH_CYCLE_PARKING:
        station_ids_for_cycle_parking.append({"id": station_id_counter, "type": "CABLE"})
    else:
        has_parking = False

    record = {
        "station_id": station_id_counter,
        "name": name,
        "station_code": code,
        "station_type": "CABLE",
        # Built from the unescaped name so that quotes are not escaped twice.
        "address": f"Estación Cable {raw_name}".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else "NULL",
        "latitude": round(random.uniform(4.50, 4.60), 6), # Narrow band used for the cable stations
        "longitude": round(random.uniform(-74.15, -74.10), 6),
        "has_cycle_parking": has_parking,
        "cycle_parking_spots": 0, # Will be assigned later
        "is_active": True
    }
    cable_records.append(record)
all_station_records.extend(cable_records)

# --- 3. Other Troncal Stations ---
troncal_other_records = []
# Name pool: the PDF names plus generic street/avenue/landmark names, each generic
# name tagged with a letter (A-E) and its index so the generic entries stay distinct.
available_troncal_names = troncal_station_names_pdf + \
                          [f"{name_base} - {chr(65+i%5)}{i}" for i, name_base in enumerate(generic_troncal_street_names * 2 + generic_troncal_av_names + generic_troncal_landmarks)]
random.shuffle(available_troncal_names)

troncal_code_counter = 0
for i in range(NUM_TRONCAL_STATIONS_OTHER):
    station_id_counter += 1
    troncal_code_counter += 1
    # Cycle through the pool; names repeat once the pool is exhausted.
    raw_name = available_troncal_names[i % len(available_troncal_names)]
    # Escape single quotes exactly once for use inside a SQL string literal.
    name = raw_name.replace("'", "''")

    station_type = random.choice(["TRONCAL_SIMPLE", "TRONCAL_INTERMEDIA", "TRONCAL_CABECERA"])

    # Retry until the code is unused; bumping the counter changes the numeric
    # part, and the line letter is re-drawn on every attempt.
    while True:
        code = generate_station_code(station_type, troncal_code_counter)
        if code not in station_codes_generated:
            station_codes_generated.add(code)
            break
        troncal_code_counter += 1

    # Non-portal stations get cycle parking with a lower chance (30%), and only
    # while the overall quota of stations with parking has not been reached.
    has_parking = False
    if len(station_ids_for_cycle_parking) < NUM_STATIONS_WITH_CYCLE_PARKING and random.random() < 0.3:
        station_ids_for_cycle_parking.append({"id": station_id_counter, "type": station_type})
        has_parking = True

    record = {
        "station_id": station_id_counter,
        "name": name,
        "station_code": code,
        "station_type": station_type,
        # Built from the unescaped name so that quotes are not escaped twice.
        "address": f"Estación {raw_name}".replace("'", "''"),
        "location_id": random.randint(1, max_location_id) if max_location_id > 0 else "NULL",
        "latitude": round(random.uniform(4.55, 4.70), 6), # Troncal stations along corridors
        "longitude": round(random.uniform(-74.12, -74.05), 6),
        "has_cycle_parking": has_parking,
        "cycle_parking_spots": 0, # Will be assigned later
        "is_active": True
    }
    troncal_other_records.append(record)
all_station_records.extend(troncal_other_records)

# Share TOTAL_CYCLE_PARKING_SPOTS among the stations selected for cycle parking.
if station_ids_for_cycle_parking:
    spots_per_station_avg, remaining_spots = divmod(
        TOTAL_CYCLE_PARKING_SPOTS, len(station_ids_for_cycle_parking)
    )

    # Random multiplier range applied to the average, by station type.
    # Portals get more; cable and simple troncal stations use the default range.
    spot_weight_ranges = {
        "PORTAL": (1.2, 2.0),
        "TRONCAL_INTERMEDIA": (0.8, 1.2),
        "TRONCAL_CABECERA": (0.8, 1.2),
    }
    default_weight_range = (0.5, 0.8)

    # Step 1: provisional allocation, never fewer than 10 spots per station.
    temp_spots_assignment = {}
    for item in station_ids_for_cycle_parking:
        weight_low, weight_high = spot_weight_ranges.get(item["type"], default_weight_range)
        base_spots = int(spots_per_station_avg * random.uniform(weight_low, weight_high))
        temp_spots_assignment[item["id"]] = max(10, base_spots)

    # Step 2: rescale so the allocations add up to TOTAL_CYCLE_PARKING_SPOTS.
    current_assigned_total = sum(temp_spots_assignment.values())
    if current_assigned_total > 0:
        factor = TOTAL_CYCLE_PARKING_SPOTS / current_assigned_total
        final_spots_assignment = {
            station_id_pk: int(spots * factor)
            for station_id_pk, spots in temp_spots_assignment.items()
        }
        normalized_total = sum(final_spots_assignment.values())

        # Truncation loses a few spots; hand them out one by one, in insertion
        # order, until the difference is used up.
        diff = TOTAL_CYCLE_PARKING_SPOTS - normalized_total
        for station_id_pk in final_spots_assignment:
            if diff == 0:
                break
            final_spots_assignment[station_id_pk] += 1
            diff -= 1

        # Step 3: copy the result onto the records and keep the boolean flag
        # consistent with the spot count.
        for station_record in all_station_records:
            record_station_id = station_record["station_id"]
            if record_station_id in final_spots_assignment:
                station_record["cycle_parking_spots"] = final_spots_assignment[record_station_id]

            spots_now = station_record["cycle_parking_spots"]
            if spots_now > 0:
                station_record["has_cycle_parking"] = True
            elif spots_now == 0 and station_record["has_cycle_parking"]:
                # Flagged for parking but received no spots: clear the flag.
                station_record["has_cycle_parking"] = False


# --- 4. Zonal Paraderos ---
zonal_paradero_records = []
paradero_code_main_counter = 0
# One random three-digit code prefix per location (a single prefix when there are no locations).
paradero_zone_prefixes = [str(random.randint(100, 999)) for _ in range(max_location_id if max_location_id > 0 else 1)]

for i in range(NUM_ZONAL_PARADEROS):
    station_id_counter += 1
    paradero_code_main_counter += 1

    loc_id = random.randint(1, max_location_id) if max_location_id > 0 else 1
    # Use a street name as the paradero name for simplicity.
    raw_name = f"Paradero {fake_co.street_name()} - {fake_co.street_suffix()}"
    # Escape single quotes exactly once for use inside a SQL string literal.
    name = raw_name.replace("'", "''")

    # Retry until the code is unused; bumping the counter changes the numeric
    # tail, and the letter is re-drawn on every attempt.
    while True:
        code = generate_station_code("ZONAL_PARADERO", paradero_code_main_counter, zone_prefix=paradero_zone_prefixes[loc_id-1])
        if code not in station_codes_generated:
            station_codes_generated.add(code)
            break
        paradero_code_main_counter += 1

    record = {
        "station_id": station_id_counter,
        "name": name,
        "station_code": code,
        "station_type": "ZONAL_PARADERO",
        # Simplified address, built from the unescaped name so that quotes
        # are not escaped twice.
        "address": f"{raw_name}, Bogotá".replace("'", "''"),
        "location_id": loc_id,
        "latitude": round(random.uniform(4.40, 4.80), 6), # Paraderos are widespread
        "longitude": round(random.uniform(-74.20, -74.00), 6),
        "has_cycle_parking": False, # No cycle parking at paraderos in this model
        "cycle_parking_spots": 0,
        "is_active": True
    }
    zonal_paradero_records.append(record)
all_station_records.extend(zonal_paradero_records)


# --- Write to SQL file ---
# Records are written as multi-row INSERT statements of at most `batch_size`
# rows each. With no records the file is left empty rather than containing an
# INSERT header without any VALUES rows (which would be invalid SQL).
stations_insert_header = (
    "INSERT INTO stations (station_id, name, station_code, station_type, address, location_id, "
    "latitude, longitude, has_cycle_parking, cycle_parking_spots, is_active) VALUES\n"
)
batch_size = 1000
total_records = len(all_station_records)

with open(stations_output_file, 'w', encoding='utf-8') as file:
    for i, record in enumerate(all_station_records):
        # Every batch starts with its own INSERT header.
        if i % batch_size == 0:
            file.write(stations_insert_header)

        # location_id is either an integer or the "NULL" placeholder string;
        # anything that is not an int is written as SQL NULL.
        loc_id_val = record['location_id'] if isinstance(record['location_id'], int) else "NULL"

        file.write(f"({record['station_id']}, '{record['name']}', '{record['station_code']}', '{record['station_type']}', '{record['address']}', "
                   f"{loc_id_val}, {record['latitude']}, {record['longitude']}, {record['has_cycle_parking']}, "
                   f"{record['cycle_parking_spots']}, {record['is_active']})")

        # A statement ends after a full batch or after the very last record.
        ends_statement = (i + 1) % batch_size == 0 or i == total_records - 1
        file.write(";\n" if ends_statement else ",\n")

print(f"SQL script for stations generated: {os.path.abspath(stations_output_file)}")
print(f"Total stations generated: {station_id_counter}")

Generating station records...
SQL script for stations generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/8_insert_stations.sql
Total stations generated: 7765


In [7]:
import faker
import random
import os
from datetime import datetime, timedelta

# --- Configuration ---
# Folder (relative to the notebook's working directory) that collects the
# generated SQL scripts; it is created on demand.
output_folder = "generated_sql_scripts"
os.makedirs(output_folder, exist_ok=True)
vehicles_drivers_output_file = os.path.join(output_folder, "9_insert_vehicles_and_drivers.sql")

# Highest ids available for foreign keys. Both figures are assumptions about
# what the earlier scripts produced -- keep them in sync with those scripts.
max_concessionaire_id = 27  # 6_insert_concessionaires.sql (27 concessionaires assumed)
max_depot_id = 58           # 7_insert_depots.sql (58 depots assumed)

# Faker generator using the Colombian Spanish locale.
fake_co = faker.Faker('es_CO')


# Vehicle specifications from the PDF (page 10), one row per vehicle type.
# Columns: type, passenger capacity, number of vehicles, operating component.
_FLEET_TABLE = (
    ("ALIMENTADOR_50", 50, 86, "ALIMENTACION"),
    ("ALIMENTADOR_80", 80, 862, "ALIMENTACION"),
    ("ARTICULADO", 160, 602, "TRONCAL"),     # typical capacity
    ("BIARTICULADO", 250, 1317, "TRONCAL"),  # typical capacity
    ("PADRON_DUAL", 80, 272, "TRONCAL"),     # operates on troncal/zonal; treated as troncal here
    ("BUS_19", 19, 5, "ZONAL"),
    ("BUS_40", 40, 611, "ZONAL"),
    ("BUS_50", 50, 3511, "ZONAL"),
    ("BUS_80", 80, 3297, "ZONAL"),
)
vehicle_specs = [
    {"type": spec_type, "capacity": spec_capacity, "count": spec_count, "component": spec_component}
    for spec_type, spec_capacity, spec_count, spec_component in _FLEET_TABLE
]
total_vehicles_from_spec = sum(spec["count"] for spec in vehicle_specs)  # 10563

# Vehicles per propulsion technology.
technology_counts = dict(
    ELECTRICO=1486,
    GNV=2144,
    HIBRIDO=348,
    DIESEL_EURO_VI=2382,
    DIESEL_EURO_V=4162,
)
total_tech_vehicles = sum(technology_counts.values())  # 10522
# The technology figures fall 41 short of the fleet total (10563 - 10522); the
# remainder is attributed to Diesel Euro V, the largest diesel category.
technology_counts["DIESEL_EURO_V"] = technology_counts["DIESEL_EURO_V"] + (
    total_vehicles_from_spec - total_tech_vehicles
)


# Model year distribution from the PDF (page 10, "FLOTA POR MODELO") - approximate.
# There is no entry for 2018.
model_year_pdf_counts = {
    2023: 336,
    2022: 1007,
    2021: 1799,
    2020: 1624,
    2019: 313,
    2017: 63,
    2016: 160,
    2015: 1264,
    2014: 333,
    2013: 335,
    2012: 292,
    2011: 142,
    2010: 234,
    2009: 53,
    2008: 15,
}
total_vehicles_in_year_dist = sum(model_year_pdf_counts.values())  # 7970, fewer than the fleet total

# Driver counts per component, from the PDF (page 9).
driver_counts_component = {
    "TRONCAL": 5003,
    "ZONAL_UCE": 16544,    # Zonal UCE component
    "ALIMENTACION": 2899,  # Zonal Alimentacion (feeder) component
}
total_drivers_to_generate = sum(driver_counts_component.values())  # 24446

# Simplified, hand-written mapping from operation type to concessionaire ids
# (ids run from 1 to max_concessionaire_id = 27). It is a placeholder: the real
# mapping should come from the concessionaire data produced by script 6, e.g.
# from its operates_* flags.
# NOTE(review): the cable id here (25) disagrees with the earlier remark that
# id 27 is the cable-focused concessionaire -- confirm against script 6.
concessionaires_by_type = {
    "TRONCAL": list(range(1, 11)),        # first 10 ids
    "ZONAL_UCE": list(range(11, 25)),     # next 14 ids
    "ALIMENTACION": list(range(18, 28)),  # partly overlaps ZONAL_UCE
    "CABLE": [25],                        # a single id assumed for cable
}

# --- Generate Vehicles ---
print(f"Generating {total_vehicles_from_spec} vehicle records...")
all_vehicle_records = []
vehicle_id_counter = 0
# Zero-initialised tallies keyed by technology and by model year.
assigned_tech_counts = dict.fromkeys(technology_counts, 0)
assigned_year_counts = dict.fromkeys(model_year_pdf_counts, 0)

# Pool of model years: each year appears as many times as the PDF reports.
model_years_list = [
    model_year
    for model_year, vehicles_in_year in model_year_pdf_counts.items()
    for _ in range(vehicles_in_year)
]

# The PDF distribution covers fewer vehicles than the fleet total; the rest get
# a model year drawn uniformly from 2010-2018.
remaining_vehicles_for_year_assignment = total_vehicles_from_spec - len(model_years_list)
if remaining_vehicles_for_year_assignment > 0:
    model_years_list += random.choices(range(2010, 2019), k=remaining_vehicles_for_year_assignment)
random.shuffle(model_years_list)

# Pool of technologies: each label appears as many times as its count.
technologies_list = [
    technology
    for technology, vehicles_with_tech in technology_counts.items()
    for _ in range(vehicles_with_tech)
]
random.shuffle(technologies_list)

# Emit one SQL value tuple per vehicle, walking the fleet specification type by type.
for spec in vehicle_specs:
    component = spec["component"]
    for _ in range(spec["count"]):
        vehicle_id_counter += 1

        license_plate = fake_co.unique.license_plate()
        vehicle_type_val, capacity_val = spec["type"], spec["capacity"]

        # Technology: take one from the shuffled pool, or fall back to Diesel Euro V.
        tech_val = technologies_list.pop() if technologies_list else "DIESEL_EURO_V"

        # Model year: the random fallback is drawn unconditionally so the sequence
        # of random draws stays the same whether or not the pool is empty.
        fallback_year = random.randint(2008, 2023)
        model_year_val = model_years_list.pop() if model_years_list else fallback_year

        # Concessionaire: a core id range chosen by component, widened with a
        # random ~10% sample of all ids so assignments are not strictly partitioned.
        if component == "TRONCAL":  # PADRON_DUAL is listed under TRONCAL as well
            core_ids = list(range(1, 11))
        elif component in ("ZONAL", "ALIMENTACION"):
            core_ids = list(range(11, max_concessionaire_id))
        else:
            core_ids = None

        if core_ids is None:
            con_id_val = None
        else:
            sampled_ids = [cid for cid in range(1, max_concessionaire_id + 1) if random.random() < 0.1]
            con_id_val = random.choice(core_ids + sampled_ids)

        # Fallback for an unknown component or an id beyond the known range.
        if con_id_val is None or con_id_val > max_concessionaire_id:
            con_id_val = random.randint(1, max_concessionaire_id)

        # About 95% of the fleet is active; the rest is in maintenance or inactive.
        if random.random() < 0.95:
            status_val = "active"
        else:
            status_val = random.choice(["maintenance", "inactive"])

        if max_depot_id > 0:
            current_depot_id_val = random.randint(1, max_depot_id)
        else:
            current_depot_id_val = "NULL"

        vehicle_row = (
            f"({vehicle_id_counter}, '{license_plate}', '{vehicle_type_val}', {capacity_val}, '{tech_val}', "
            f"{model_year_val}, {con_id_val}, '{status_val}', {current_depot_id_val})"
        )
        all_vehicle_records.append(vehicle_row)

# --- Generate Drivers ---
print(f"Generating {total_drivers_to_generate} driver records...")
all_driver_records = []
driver_id_counter = 0

# One component label per driver, shuffled so the components are interleaved.
driver_component_assignment_list = []
for component, count in driver_counts_component.items():
    driver_component_assignment_list.extend([component] * count)
random.shuffle(driver_component_assignment_list)

for i in range(total_drivers_to_generate):
    driver_id_counter += 1

    employee_id_val = fake_co.unique.ssn().replace('-', '') # Using SSN format as unique employee ID
    first_name_val = fake_co.first_name().replace("'", "''")
    last_name_val = fake_co.last_name().replace("'", "''")

    # Assign a concessionaire based on the driver's component: a core id range,
    # widened with a random ~10% sample of all ids.
    component_type = driver_component_assignment_list[i]
    con_id_val = None
    if component_type == "TRONCAL":
        con_id_val = random.choice([cid for cid in range(1,11)] + [cid for cid in range(1, max_concessionaire_id + 1) if random.random() < 0.1])
    elif component_type == "ZONAL_UCE" or component_type == "ALIMENTACION":
        con_id_val = random.choice([cid for cid in range(11, max_concessionaire_id)] + [cid for cid in range(1, max_concessionaire_id + 1) if random.random() < 0.1])

    # Fallback for an unknown component or an id beyond the known range.
    if con_id_val is None or con_id_val > max_concessionaire_id:
        con_id_val = random.randint(1, max_concessionaire_id)

    # Hired between 15 years and 30 days ago. The end of the window is written
    # in days because Faker reads a lowercase "m" as minutes (months are "M"),
    # so the former '-1m' did not mean "one month ago".
    hire_date_obj = fake_co.date_between(start_date='-15y', end_date='-30d')
    hire_date_val = hire_date_obj.strftime('%Y-%m-%d')

    # Simple license format: two uppercase letters followed by four digits.
    # NOTE(review): values are not checked for uniqueness; repeats are possible
    # at this volume -- confirm whether the schema requires unique license numbers.
    license_number_val = f"{chr(random.randint(65,90))}{chr(random.randint(65,90))}{random.randint(1000,9999)}"

    license_expiry_date_obj = fake_co.date_between(start_date='today', end_date='+5y')
    license_expiry_date_val = license_expiry_date_obj.strftime('%Y-%m-%d')

    # About 90% of drivers are active; the rest are on leave or inactive.
    status_val = "active" if random.random() < 0.9 else random.choice(["on_leave", "inactive"])

    all_driver_records.append(
        f"({driver_id_counter}, '{employee_id_val}', '{first_name_val}', '{last_name_val}', {con_id_val}, "
        f"'{hire_date_val}', '{license_number_val}', '{license_expiry_date_val}', '{status_val}')"
    )

# --- Write to SQL file ---
def _write_insert_batches(out_file, insert_header, row_strings, rows_per_statement):
    """Write pre-formatted value tuples as multi-row INSERT statements.

    Args:
        out_file: Writable text file object.
        insert_header: The "INSERT INTO ... VALUES\\n" line that opens each statement.
        row_strings: Sequence of "(...)" value tuples, already SQL-formatted.
        rows_per_statement: Maximum number of rows per INSERT statement.

    Rows inside a statement are separated by ",\\n" and each statement ends
    with ";\\n". Nothing is written for an empty sequence, so the output never
    contains an INSERT header without rows.
    """
    last_index = len(row_strings) - 1
    for row_index, row in enumerate(row_strings):
        if row_index % rows_per_statement == 0:
            out_file.write(insert_header)
        out_file.write(row)
        ends_statement = (row_index + 1) % rows_per_statement == 0 or row_index == last_index
        out_file.write(";\n" if ends_statement else ",\n")


with open(vehicles_drivers_output_file, 'w', encoding='utf-8') as file:
    batch_size = 1000

    # Write Vehicles
    _write_insert_batches(
        file,
        "INSERT INTO vehicles (vehicle_id, license_plate, vehicle_type, capacity, technology, model_year, concessionaire_id, status, current_depot_id) VALUES\n",
        all_vehicle_records,
        batch_size,
    )

    file.write("\n\n") # Separator

    # Write Drivers
    _write_insert_batches(
        file,
        "INSERT INTO drivers (driver_id, employee_id, first_name, last_name, concessionaire_id, hire_date, license_number, license_expiry_date, status) VALUES\n",
        all_driver_records,
        batch_size,
    )

print(f"SQL script for vehicles and drivers generated: {os.path.abspath(vehicles_drivers_output_file)}")

Generating 10563 vehicle records...
Generating 24446 driver records...
SQL script for vehicles and drivers generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/9_insert_vehicles_and_drivers.sql


In [8]:
import faker
import random
import os
from datetime import datetime

# --- Configuration ---
# Folder (relative to the notebook's working directory) that collects the
# generated SQL scripts; it is created on demand.
output_folder = "generated_sql_scripts"
os.makedirs(output_folder, exist_ok=True)
routes_sequences_output_file = os.path.join(output_folder, "10_insert_routes_and_sequences.sql")

# Faker generator using the Colombian Spanish locale.
fake_co = faker.Faker('es_CO')

# --- Counts from PDF ---
NUM_TRONCAL_ROUTES = 99
NUM_ZONAL_UCE_ROUTES = 347
NUM_ALIMENTADORA_ROUTES = 106
NUM_CABLE_ROUTES = 1 # Main cable line
DUAL_ROUTE_CODES = ["DM81", "MK86", "ML82", "MC84", "M83"] # Page 18
NUM_DUAL_ROUTES = len(DUAL_ROUTE_CODES)

# Max IDs from previous scripts (essential for FK integrity)
# From 6_insert_concessionaires.sql
max_concessionaire_id = 27

# Station id ranges, following the numbering order of 8_insert_stations.sql:
# portals first, then cable stations, then the remaining troncal stations, then
# the zonal paraderos.
# NOTE: NUM_PORTALS, NUM_CABLE_STATIONS, TOTAL_STATIONS_INC_CABLE and
# NUM_ZONAL_PARADEROS come from the stations cell, which must run before this one.
PORTAL_IDS = list(range(1, NUM_PORTALS + 1)) # 1-9
CABLE_STATION_IDS = list(range(NUM_PORTALS + 1, NUM_PORTALS + NUM_CABLE_STATIONS + 1)) # 10-13
TRONCAL_STATION_IDS_OTHER = list(range(NUM_PORTALS + NUM_CABLE_STATIONS + 1, TOTAL_STATIONS_INC_CABLE + 1)) # 14-142
# The last paradero id is derived from the station counts (142 + 7623 = 7765)
# rather than hard-coded, so it follows any change to those counts.
ZONAL_PARADERO_IDS = list(range(TOTAL_STATIONS_INC_CABLE + 1, TOTAL_STATIONS_INC_CABLE + NUM_ZONAL_PARADEROS + 1)) # 143-7765
ALL_TRONCAL_STATIONS = PORTAL_IDS + TRONCAL_STATION_IDS_OTHER # All stations usable by troncal routes

# Concessionaires and the kinds of service each one operates, restated from
# script 6 so routes can be assigned to a suitable operator.
# Columns: id, name, troncal, zonal_uce, zonal_alimentacion, cable.
_CONCESSIONAIRE_FIELDS = ("id", "name", "troncal", "zonal_uce", "zonal_alimentacion", "cable")
_CONCESSIONAIRE_ROWS = (
    (1, "Bogotá Móvil Operación Sur BMO SUR S.A.S", True, False, False, False),
    (2, "Connexion Móvil S.A.S.", True, False, False, False),
    (3, "Capitalbus S.A.S.", True, False, False, False),
    (4, "SI18 Calle 80 S.A.S.", True, True, False, False),
    (5, "SI18 Norte S.A.S.", True, False, False, False),
    (6, "SI18 Suba S.A.S.", True, False, False, False),
    (7, "Somos Bogotá Usme S.A.S.", True, False, False, False),
    (8, "Gmovil S.A.S.", True, True, True, False),
    (9, "Consorcio Express S.A.S.", True, True, True, False),
    (10, "Este Es Mi Bus S.A.S.", False, False, True, False),
    (11, "ETIB S.A.S.", False, True, True, False),
    (12, "Masivo Capital S.A.S.", False, True, True, False),
    (13, "Organización Suma S.A.S.", False, False, True, False),
    (14, "E-Somos Fontibón S.A.S.", False, True, False, False),
    (15, "Mueve Fontibón S.A.S.", False, True, False, False),
    (16, "ZMO Fontibón III S.A.S.", False, True, False, False),
    (17, "ZMO Fontíbón V S.A.S.", False, True, False, False),
    (18, "Emasivo 10 S.A.S.", False, True, False, False),
    (19, "Emasivo 16 S.A.S.", False, True, False, False),
    (20, "Operadora Distrital de Transporte La Rolita", False, True, False, False),
    (21, "E-Somos Alimentación S.A.S.", False, False, True, False),
    (22, "Gran Américas Usme S.A.S", False, True, False, False),
    (23, "Mueve Usme S.A.S.", False, True, False, False),
    (24, "Cable Movil de Bogota S.A.S.", False, False, False, True),  # Cable operator
    (25, "Transportes Urbanos Integrados S.A.S.", False, True, True, False),
    (26, "Movilidad Estratégica del Oriente S.A.", True, False, False, False),
    (27, "Conexión Capital S.P.A.", False, True, False, False),
)
concessionaire_definitions = [
    dict(zip(_CONCESSIONAIRE_FIELDS, row)) for row in _CONCESSIONAIRE_ROWS
]

# Ids of the concessionaires able to run each kind of service. Each list falls
# back to a single default id so that it is never empty.
concessionaires_troncal_ids = [entry["id"] for entry in concessionaire_definitions if entry["troncal"]] or [1]
concessionaires_zonal_uce_ids = [entry["id"] for entry in concessionaire_definitions if entry["zonal_uce"]] or [14]
concessionaires_alimentacion_ids = [entry["id"] for entry in concessionaire_definitions if entry["zonal_alimentacion"]] or [10]
concessionaires_cable_ids = [entry["id"] for entry in concessionaire_definitions if entry["cable"]] or [24]


# Zonal route codes quoted from the PDF (page 23).
zonal_route_codes_pdf = [
    "T11", "T13", "BH907", "330", "T25",
    "CG147", "94", "SE14", "614", "SE6",
]


# --- Helper Functions ---
def generate_route_code(route_type, counter, existing_codes):
    """Return a route code for ``route_type`` that is not yet in ``existing_codes``.

    The returned code is always added to ``existing_codes`` before returning,
    so successive calls sharing the same set never yield duplicates.

    Args:
        route_type: One of "TRONCAL", "DUAL", "ZONAL_UCE", "ALIMENTADORA" or
            "CABLE". Any other value skips generation and goes straight to
            the fallback code.
        counter: Zero-based index of the route within its type. It indexes the
            pre-defined code lists (DUAL_ROUTE_CODES, zonal_route_codes_pdf)
            and numbers CABLE codes; it is advanced after every collision.
        existing_codes: Mutable set of codes already handed out; updated in place.

    Returns:
        A unique code string. After 100 unsuccessful attempts (or for an
        unrecognised route type) a code of the form ``FAIL<route_type><n>``
        is returned, where ``n`` is bumped until the code is unused.

    Note:
        Reads the module-level names DUAL_ROUTE_CODES, zonal_route_codes_pdf
        and PORTAL_IDS, depending on ``route_type``.
    """
    attempts = 0
    while attempts < 100:  # Max attempts to find a unique code
        code = ""
        if route_type == "TRONCAL":
            # B1, C17, H20, K43, L82, M51, F23, G45, J70, E32
            prefix = random.choice(["B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M"])
            num = random.randint(1, 99)
            code = f"{prefix}{num}"
        elif route_type == "DUAL":  # Will be categorized as TRONCAL type in DB schema
            # Use pre-defined DUAL_ROUTE_CODES first
            if counter < len(DUAL_ROUTE_CODES):
                code = DUAL_ROUTE_CODES[counter]
            else:  # Generate if more dual routes are needed than pre-defined
                prefix = random.choice(["DM", "MD", "ML", "LM", "CM"])
                num = random.randint(80, 99)
                code = f"{prefix}{num}"
        elif route_type == "ZONAL_UCE":
            if counter < len(zonal_route_codes_pdf):
                code = zonal_route_codes_pdf[counter]
            elif random.random() < 0.5:  # Number based
                code = str(random.randint(1, 999))
                if random.random() < 0.3:  # Add a dash sometimes
                    code += f"-{random.randint(1,10)}"
            else:  # Letter + two digits
                prefix = random.choice(["A", "B", "C", "D", "E", "F", "G", "H", "K", "L", "P", "S", "T", "U", "Z"]) + \
                         random.choice(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]) + \
                         random.choice(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
                code = prefix
                if random.random() < 0.2:
                    code = "SITP" + code  # SITP prefix for some
        elif route_type == "ALIMENTADORA":
            # e.g., 1-1, 10-3, B902 (some alimentadores look like zonal/troncal codes too)
            if random.random() < 0.6:
                prefix = str(random.randint(1, 16))  # Based on common portal/zone numbers
                suffix = str(random.randint(1, 10))
                code = f"{prefix}-{suffix}"
            else:
                prefix = random.choice(PORTAL_IDS)  # Use portal ID as part of the code logic
                code = f"A{prefix}{random.randint(0,9)}"
        elif route_type == "CABLE":
            code = f"TC{counter+1}"  # TransMiCable 1
        else:
            # Unknown route type: nothing can be generated, and an empty string
            # must never be handed out as a route code. Use the fallback below.
            break

        if code and code not in existing_codes:
            existing_codes.add(code)
            return code
        counter += 1  # To help vary generation if first try fails
        attempts += 1

    # Fallback: advance the numeric suffix until the code is really unused, and
    # register it so later calls cannot return the same fallback again.
    fallback_code = f"FAIL{route_type}{counter}"
    while fallback_code in existing_codes:
        counter += 1
        fallback_code = f"FAIL{route_type}{counter}"
    existing_codes.add(fallback_code)
    return fallback_code

# --- Generate Routes and Intermediate Stations ---
# Builds two lists of SQL value tuples (as strings): one row per route and one row
# per intermediate stop of each route. Both lists are written to disk by the
# "Write to SQL file" step that follows.
print("Generating routes and intermediate station sequences...")
all_route_records = []
all_intermediate_station_records = []
# Both counters are incremented before use, so generated IDs start at 1.
route_id_counter = 0
intermediate_station_id_counter = 0
# Codes already handed out; shared across all route types so codes are unique overall.
generated_route_codes = set()

# One entry per route family:
#   type        - generation type passed to generate_route_code
#   count       - number of routes to create
#   con_ids     - concessionaire IDs to pick the operator from
#   orig_dest_pool (or orig_dest_pool_orig / orig_dest_pool_dest for ALIMENTADORA)
#               - station IDs eligible as origin/destination
#   inter_pool  - station IDs eligible as intermediate stops
#   min_stops / max_stops - inclusive bounds for the number of intermediate stops
route_definitions = [
    {"type": "TRONCAL", "count": NUM_TRONCAL_ROUTES, "con_ids": concessionaires_troncal_ids, "orig_dest_pool": ALL_TRONCAL_STATIONS, "inter_pool": ALL_TRONCAL_STATIONS, "min_stops": 3, "max_stops": 15},
    {"type": "DUAL", "count": NUM_DUAL_ROUTES, "con_ids": concessionaires_troncal_ids, "orig_dest_pool": PORTAL_IDS + TRONCAL_STATION_IDS_OTHER, "inter_pool": ALL_TRONCAL_STATIONS + ZONAL_PARADERO_IDS, "min_stops": 8, "max_stops": 25}, # Dual routes use both
    {"type": "ZONAL_UCE", "count": NUM_ZONAL_UCE_ROUTES, "con_ids": concessionaires_zonal_uce_ids, "orig_dest_pool": ZONAL_PARADERO_IDS + TRONCAL_STATION_IDS_OTHER, "inter_pool": ZONAL_PARADERO_IDS, "min_stops": 10, "max_stops": 40},
    {"type": "ALIMENTADORA", "count": NUM_ALIMENTADORA_ROUTES, "con_ids": concessionaires_alimentacion_ids, "orig_dest_pool_orig": PORTAL_IDS + TRONCAL_STATION_IDS_OTHER, "orig_dest_pool_dest": ZONAL_PARADERO_IDS, "inter_pool": ZONAL_PARADERO_IDS, "min_stops": 5, "max_stops": 20},
    {"type": "CABLE", "count": NUM_CABLE_ROUTES, "con_ids": concessionaires_cable_ids, "orig_dest_pool": CABLE_STATION_IDS, "inter_pool": CABLE_STATION_IDS, "min_stops": 1, "max_stops": 2} # Cable has fixed intermediate stations
]

for definition in route_definitions:
    # route_type_db is the value stored in the route_type column; definition["type"]
    # keeps driving code generation and station selection below.
    route_type_db = definition["type"]
    if route_type_db == "DUAL": # Schema maps DUAL to TRONCAL type for now
        route_type_db = "TRONCAL"

    for i in range(definition["count"]):
        route_id_counter += 1

        # i is the index within this route type; generate_route_code uses it to pick
        # pre-defined codes and to number CABLE codes.
        route_code_val = generate_route_code(definition["type"], i, generated_route_codes)

        # Select Origin and Destination
        origin_station_id_val = None
        destination_station_id_val = None

        if definition["type"] == "ALIMENTADORA":
            # Origin and destination are drawn from two separate pools.
            origin_station_id_val = random.choice(definition["orig_dest_pool_orig"])
            destination_station_id_val = random.choice(definition["orig_dest_pool_dest"])
        elif definition["type"] == "CABLE":
            # Cable route is fixed: Portal Tunal Cable -> Juan Pablo II -> Manitas -> Mirador del Paraiso
            # CABLE_STATION_IDS are 10, 11, 12, 13
            # NOTE(review): station names and IDs above come from the original author;
            # CABLE_STATION_IDS is defined elsewhere in the notebook - confirm.
            origin_station_id_val = CABLE_STATION_IDS[0] # e.g., Portal Tunal Cable
            destination_station_id_val = CABLE_STATION_IDS[-1] # e.g., Mirador del Paraiso
        else:
            if len(definition["orig_dest_pool"]) >=2:
                # random.sample picks two distinct positions of the pool.
                origin_station_id_val, destination_station_id_val = random.sample(definition["orig_dest_pool"], 2)
            else: # Fallback for small pools
                # With fewer than two candidates origin and destination can coincide.
                origin_station_id_val = random.choice(definition["orig_dest_pool"])
                destination_station_id_val = random.choice(definition["orig_dest_pool"])


        # The name embeds station IDs (not station names); single quotes are doubled
        # so the value is safe inside a SQL string literal.
        route_name_val = f"{route_code_val}: Est. {origin_station_id_val} - Est. {destination_station_id_val}".replace("'", "''")
        concessionaire_id_val = random.choice(definition["con_ids"])
        is_active_val = True if random.random() < 0.95 else False # Most routes active

        # is_active_val is interpolated as the Python literal True/False.
        all_route_records.append(
            f"({route_id_counter}, '{route_code_val}', '{route_name_val}', '{route_type_db}', "
            f"{origin_station_id_val}, {destination_station_id_val}, {concessionaire_id_val}, {is_active_val})"
        )

        # Generate Intermediate Stations for this route
        # The count is drawn for every route, even when the CABLE branch below ignores it.
        num_intermediate_stops = random.randint(definition["min_stops"], definition["max_stops"])

        # Ensure intermediate pool is not empty and has enough unique stations
        current_inter_pool = [s for s in definition["inter_pool"] if s != origin_station_id_val and s != destination_station_id_val]

        # The route row has already been appended above; only its stops are skipped.
        if not current_inter_pool: # If pool is empty after removing origin/dest, skip intermediate
            continue

        # Special handling for CABLE route intermediate stations
        if definition["type"] == "CABLE" and len(CABLE_STATION_IDS) > 2:
            # Fixed intermediate stations for the main cable line
            # Assuming CABLE_STATION_IDS are [Origin, Inter1, Inter2, ..., Dest]
            # For 4 stations: [0]=Origin, [1]=Inter1, [2]=Inter2, [3]=Dest
            # Intermediate are CABLE_STATION_IDS[1] and CABLE_STATION_IDS[2]
            stops_to_add = CABLE_STATION_IDS[1:-1] # Exclude first (origin) and last (destination)
        else:
            # Clamp the requested count to what the pool can supply, then sample
            # without replacement; the sample order becomes the stop sequence.
            if len(current_inter_pool) < num_intermediate_stops:
                num_intermediate_stops = len(current_inter_pool) # Max available
            stops_to_add = random.sample(current_inter_pool, num_intermediate_stops)

        # sequence_order is 1-based and counts intermediate stops only
        # (origin and destination are excluded from stops_to_add).
        for seq_order, station_id_val in enumerate(stops_to_add):
            intermediate_station_id_counter += 1
            all_intermediate_station_records.append(
                f"({intermediate_station_id_counter}, {route_id_counter}, {station_id_val}, {seq_order + 1})"
            )


# --- Write to SQL file ---
def _write_batched_inserts(sql_file, insert_header, records, batch_size):
    """Write ``records`` as multi-row INSERT statements of at most ``batch_size`` rows.

    Args:
        sql_file: Open, writable text file object.
        insert_header: The "INSERT INTO ... VALUES" line, including its trailing newline.
        records: List of pre-formatted SQL value tuples, e.g. "(1, 'B1', ...)".
        batch_size: Maximum number of rows per INSERT statement.

    Nothing is written when ``records`` is empty, so the script never contains
    an INSERT header without rows (which would be invalid SQL).
    """
    for batch_start in range(0, len(records), batch_size):
        batch = records[batch_start:batch_start + batch_size]
        sql_file.write(insert_header)
        sql_file.write(",\n".join(batch))  # rows separated by ",\n"
        sql_file.write(";\n")              # each statement terminated by ";"


with open(routes_sequences_output_file, 'w', encoding='utf-8') as file:
    batch_size = 1000

    # Write Routes
    _write_batched_inserts(
        file,
        "INSERT INTO routes (route_id, route_code, route_name, route_type, origin_station_id, destination_station_id, concessionaire_id, is_active) VALUES\n",
        all_route_records,
        batch_size,
    )

    file.write("\n\n")

    # Write Intermediate Stations (these reference route_id, so they come after the routes)
    _write_batched_inserts(
        file,
        "INSERT INTO intermediate_stations (intermediate_station_id, route_id, station_id, sequence_order) VALUES\n",
        all_intermediate_station_records,
        batch_size,
    )

print(f"SQL script for routes and intermediate stations generated: {os.path.abspath(routes_sequences_output_file)}")

Generating routes and intermediate station sequences...
SQL script for routes and intermediate stations generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/10_insert_routes_and_sequences.sql


In [9]:
import os
from datetime import date

# --- Configuration ---
# The fares script is written into the folder used for the generated SQL scripts;
# create that folder up front if it does not exist yet.
output_folder = "generated_sql_scripts"
os.makedirs(output_folder, exist_ok=True)

fares_output_file = os.path.join(output_folder, "11_insert_fares.sql")

# Define fare data
# Amounts are in COP: the standard SITP fare is 2950 and transfers cost 0 or 200.
# Every fare listed here starts on 2024-01-15 and has no end date (the string "NULL"),
# i.e. all of them are currently active.
fares_start_date = date(2024, 1, 15).isoformat()  # 'YYYY-MM-DD'; assumed start in early 2024

fare_data = [
    dict(
        fare_id=1,
        fare_type="STANDARD_SITP",  # General standard fare for Troncal/Zonal
        value=2950.00,
        start_date=fares_start_date,
        end_date="NULL",  # Currently active
        description="Tarifa estándar del componente Troncal y Zonal del SITP.",
    ),
    dict(
        fare_id=2,
        fare_type="TRANSFER_0_COST",
        value=0.00,
        start_date=fares_start_date,
        end_date="NULL",  # Currently active
        description="Transbordo sin costo adicional (dentro de la ventana de tiempo y condiciones).",
    ),
    dict(
        fare_id=3,
        fare_type="TRANSFER_200_COST",
        value=200.00,
        start_date=fares_start_date,
        end_date="NULL",  # Currently active
        description="Transbordo con costo de $200 COP (dentro de la ventana de tiempo y condiciones).",
    ),
    dict(
        fare_id=4,
        fare_type="STANDARD_CABLE",  # TransMiCable might have the same standard fare
        value=2950.00,
        start_date=fares_start_date,
        end_date="NULL",  # Currently active
        description="Tarifa estándar para TransMiCable.",
    ),
    # Template for an older, closed fare in case historical data is needed later:
    # dict(
    #     fare_id=5,
    #     fare_type="STANDARD_SITP_OLD",
    #     value=2650.00,
    #     start_date=date(2023, 1, 10).isoformat(),
    #     end_date=date(2024, 1, 14).isoformat(),
    #     description="Tarifa estándar antigua del SITP.",
    # ),
]

# --- Generate Fares SQL ---
print("Generating fare records...")

# Format every fare as one SQL value tuple first, then emit them in a single INSERT.
fare_value_rows = []
for fare in fare_data:
    # The sentinel string "NULL" becomes a bare SQL NULL; any other end date is quoted.
    end_date_sql = "NULL" if fare["end_date"] == "NULL" else f"'{fare['end_date']}'"
    # Double single quotes so the text is safe inside a SQL string literal.
    fare_type_sql = fare["fare_type"].replace("'", "''")
    description_sql = fare["description"].replace("'", "''")
    fare_value_rows.append(
        f"({fare['fare_id']}, '{fare_type_sql}', {fare['value']}, "
        f"'{fare['start_date']}', {end_date_sql}, '{description_sql}')"
    )

with open(fares_output_file, 'w', encoding='utf-8') as file:
    file.write("INSERT INTO fares (fare_id, fare_type, value, start_date, end_date, description) VALUES\n")
    file.write(",\n".join(fare_value_rows))
    if fare_value_rows:
        file.write(";\n")  # terminate the statement after the last row

print(f"SQL script for fares generated: {os.path.abspath(fares_output_file)}")

Generating fare records...
SQL script for fares generated: /home/kali/Documents/remote reps/distributed-systems-lab/travel-recharge-database/db/data/generated_sql_scripts/11_insert_fares.sql
