# Trabajo Tesis: Preprocesamiento faenas

### Librerias y bases

In [1]:
import os
import requests

import numpy as np
import pandas as pd

from tqdm import tqdm
from IPython.display import display, HTML

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.distance import geodesic


In [2]:
#os.chdir('c:/Users/artur/OneDrive/Documents/TrabajoTesis') 
os.chdir('C:/Users/admin/OneDrive/Documents/TrabajoTesis')

In [3]:
df = pd.read_excel('Cluster/Faenas25.xlsx')

In [4]:
# Limpiar nombres de columnas
df.columns = ['RutEmpresa' if col == 'RutEmpresa' else col.lstrip() for col in df.columns]

# Limpiar espacios dentro de las columnas (excepto RutEmpresa)
for col in df.select_dtypes(include='object').columns:
    if col != 'RutEmpresa':
        df[col] = df[col].str.strip()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28319 entries, 0 to 28318
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RutEmpresa                  28319 non-null  object 
 1   NombreEmpresa               28319 non-null  object 
 2   RegionFaena                 28319 non-null  object 
 3   ProvinciaFaena              28319 non-null  object 
 4   ComunaFaena                 28319 non-null  object 
 5   NombreFaena                 28318 non-null  object 
 6   CategoriaFaena              28319 non-null  object 
 7   IdFaena                     28319 non-null  int64  
 8   RegionInstalacion           28319 non-null  object 
 9   ProvinciaInstalacion        28319 non-null  object 
 10  ComunaInstalacion           28319 non-null  object 
 11  NombreInstalacion           28319 non-null  object 
 12  IdTipoInstalacion           28319 non-null  int64  
 13  TipoInstalacion             283

### Limpieza

Nos quedamos solo con las faenas de producción de cobre

In [6]:
df = df[df['RecursoPrimarioInstalacion'] == 'COBRE']
df.shape

(21158, 27)

Dropeo de variables de poca relevancia, es decir que nos sea id o de ubicación, o que nos ayude a analizar en corto plazo

In [7]:
df = df.drop(columns=[
     'ProvinciaFaena', 'ComunaFaena', 'NombreFaena', 'IdFaena',
    'RegionInstalacion','ComunaInstalacion','ProvinciaInstalacion', 'NombreInstalacion', 'RecursoMineroInstalacion', 'RecursoPrimarioInstalacion',
    'TipoRecursoInstalacion', 'IdInstalacion', 'Norte', 'Este', 'Datum', 'IdEstado', 'IdTipoInstalacion'
])

In [8]:
df

Unnamed: 0,RutEmpresa,NombreEmpresa,RegionFaena,CategoriaFaena,TipoInstalacion,Cota,Huso,Estado,Latitud,Longitud
0,61703000-4,EMPRESA NACIONAL DE MINERIA (ENAMI),III,CATEGORIA B,TALLERES Y MAESTRANZA,538,19,ACTIVA,-28.566725,-70.740064
1,76125921-0,CUPRUM RESOURCES CHILE SPA,IV,CATEGORIA D,EXPLORACION DE SUPERFICIE,1200,19,INACTIVA,-29.432920,-70.683699
2,76362163-4,MINERA RUPERTO PATRICIO ORTIZ PALOMINOS,VI,CATEGORIA D,MINA SUBTERRANEA,134,19,ABANDONADA,-34.677189,-71.498091
4,6891906-1,GARY NELSON MERCADO MALBRAN,III,CATEGORIA D,MINA SUBTERRANEA,531,19,ACTIVA,-26.411292,-70.414104
12,76448510-6,SLM CASUALIDAD PRIMERA DE EL MANZANO,IV,CATEGORIA D,MINA SUBTERRANEA,917,19,IRREGULAR OPERATIVA,-30.204069,-71.082232
...,...,...,...,...,...,...,...,...,...,...
28311,78131832-9,BALFENI SOLUTIONS SPA,IV,CATEGORIA D,CAMPAMENTO,1277,19,EN REVISIÓN DE PROYECTO,-31.216132,-71.267759
28312,78131832-9,BALFENI SOLUTIONS SPA,IV,CATEGORIA D,MINA SUBTERRANEA,1199,19,EN REVISIÓN DE PROYECTO,-31.220795,-71.265226
28313,6188182-4,EDILIO GOMEZ OYARZUN,IV,CATEGORIA D,MINA SUBTERRANEA,458,19,EN REVISIÓN DE PROYECTO,-31.548972,-71.343902
28314,78110627-5,C & C MINERIA SpA,II,CATEGORIA D,POLVORIN,3250,19,EN REVISIÓN DE PROYECTO,-23.044930,-68.557016


In [9]:
df_sample = df.sample(frac=0.1, random_state=42).copy() 


### Enriquesimiento

In [10]:
# Coordinates of the 15 cities
cities_coords = {
    "Santiago": (-33.4489, -70.6693),
    "Valparaíso": (-33.0472, -71.6127),
    "Concepción": (-36.8201, -73.0444),
    "La Serena": (-29.9037, -71.2489),
    "Coquimbo": (-29.9533, -71.3395),
    "Antofagasta": (-23.6500, -70.4000),
    "Temuco": (-38.7369, -72.5904),
    "Rancagua": (-34.1708, -70.7400),
    "Iquique": (-20.2307, -70.1350),
    "Puerto Montt": (-41.4694, -72.9420),
    "Talca": (-35.4261, -71.6550),
    "Arica": (-18.4783, -70.3126),
    "Chillán": (-36.6066, -72.1034),
    "Los Ángeles": (-37.4600, -72.3500),
    "Copiapó": (-27.3668, -70.3326),
    "Valdivia": (-39.8196, -73.2459)
}

# Function to find nearest city
def nearest_city(lat, lon):
    closest_city = min(cities_coords.keys(), key=lambda city: geodesic((lat, lon), cities_coords[city]).km)
    return closest_city

# Apply to your dataframe
tqdm.pandas()
df["Closest_City"] = df.progress_apply(
    lambda row: nearest_city(row["Latitud"], row["Longitud"]), axis=1
)

print(df.head())

  0%|          | 0/21158 [00:00<?, ?it/s]

100%|██████████| 21158/21158 [00:39<00:00, 538.56it/s]

    RutEmpresa                            NombreEmpresa RegionFaena  \
0   61703000-4      EMPRESA NACIONAL DE MINERIA (ENAMI)         III   
1   76125921-0               CUPRUM RESOURCES CHILE SPA          IV   
2   76362163-4  MINERA RUPERTO PATRICIO ORTIZ PALOMINOS          VI   
4    6891906-1              GARY NELSON MERCADO MALBRAN         III   
12  76448510-6     SLM CASUALIDAD PRIMERA DE EL MANZANO          IV   

   CategoriaFaena            TipoInstalacion  Cota  Huso               Estado  \
0     CATEGORIA B      TALLERES Y MAESTRANZA   538    19               ACTIVA   
1     CATEGORIA D  EXPLORACION DE SUPERFICIE  1200    19             INACTIVA   
2     CATEGORIA D           MINA SUBTERRANEA   134    19           ABANDONADA   
4     CATEGORIA D           MINA SUBTERRANEA   531    19               ACTIVA   
12    CATEGORIA D           MINA SUBTERRANEA   917    19  IRREGULAR OPERATIVA   

      Latitud   Longitud Closest_City  
0  -28.566725 -70.740064      Copiapó  
1  -29




In [11]:

html = "<div style='height:300px; overflow:auto;'>" + df['Closest_City'].value_counts().to_frame().to_html() + "</div>"
display(HTML(html))

Unnamed: 0_level_0,count
Closest_City,Unnamed: 1_level_1
Copiapó,6015
Valparaíso,4017
Coquimbo,3416
Antofagasta,2791
La Serena,2701
Santiago,1049
Iquique,465
Rancagua,374
Arica,313
Talca,14


In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests

# ============================================================
# 2️⃣ Function to query OSRM and get travel time
# ============================================================

def get_travel_time(lat1, lon1, city, max_retries=3):
    """Fetch driving time (in minutes) between (lat1, lon1) and city center using OSRM API."""
    lat2, lon2 = cities_coords.get(city, (None, None))
    if lat2 is None or lon2 is None:
        return None

    url = f"https://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
    for _ in range(max_retries):
        try:
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                data = response.json()
                if data.get("routes"):
                    duration_sec = data["routes"][0]["duration"]
                    return duration_sec / 60  # minutes
        except Exception:
            pass
    return None

# ============================================================
# 3️⃣ Parallel OSRM calls for speed
# ============================================================

def parallel_osrm(df, max_workers=16):
    results = [None] * len(df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(get_travel_time, row["Latitud"], row["Longitud"], row["Closest_City"]): idx
            for idx, row in df.iterrows()
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching OSRM travel times"):
            idx = futures[future]
            results[idx] = future.result()
    return results

# ============================================================
# 4️⃣ Apply travel time computation
# ============================================================

print("🚚 Calculating travel times (parallel)...")
df["TravelTime_Truck_min"] = parallel_osrm(df)

# ============================================================
# ✅ Final check
# ============================================================

print("\n✅ Example output:")
print(df[["Latitud", "Longitud", "Closest_City", "TravelTime_Truck_min"]].head())


🚚 Calculating travel times (parallel)...


Fetching OSRM travel times:  75%|███████▍  | 15792/21158 [4:07:18<1:24:01,  1.06it/s]


IndexError: list assignment index out of range

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


geolocator = Nominatim(user_agent="truck_routing_demo")
rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.5)

def geocode_city(city):
    try:
        location = rate_limited_geocode(f"{city}, Chile")
        if location:
            return city, (location.latitude, location.longitude)
    except:
        return city, (None, None)

cities = df["Closest_City"].dropna().unique()
city_coords = {}

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(geocode_city, c) for c in cities]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Geocoding city centers"):
        city, coords = future.result()
        city_coords[city] = coords


Geocoding city centers:   0%|          | 0/11 [00:00<?, ?it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Copiapó, Chile',), **{}).
Traceback (most recent call last):
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connection.py", line 198, in _new_conn
    sock = connection.create_connection(
        (self._dns_host, self.port),
    ...<2 lines>...
        socket_options=self.socket_options,
    )
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
    ~~~~~~~~~~~~^^^^
OSError: [WinError 10051] A socket operation was attempted to an unreachable network

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File 

TypeError: cannot unpack non-iterable NoneType object

In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_travel_time(lat1, lon1, city):
    lat2, lon2 = city_coords.get(city, (None, None))
    if lat2 is None or lon2 is None:
        return None
    url = f"https://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
    try:
        data = requests.get(url, timeout=5).json()
        if data.get("routes"):
            return data["routes"][0]["duration"] / 60  # minutes
    except:
        return None

def parallel_osrm(df):
    results = [None] * len(df)
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = {executor.submit(get_travel_time, row["Latitud"], row["Longitud"], row["Closest_City"]): idx
                   for idx, row in df.iterrows()}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching OSRM travel times"):
            idx = futures[future]
            results[idx] = future.result()
    return results

df["TravelTime_Truck_min"] = parallel_osrm(df)


In [None]:
df.to_csv("dfciudades.csv", index=False)

In [None]:


# Get city coordinates (once per unique city)
geolocator = Nominatim(user_agent="truck_routing_demo")
reverse = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_city_coords(city_name):
    try:
        location = geolocator.geocode(city_name + ", Chile")  # add country context
        if location:
            return location.latitude, location.longitude
    except:
        return None, None
    return None, None

# Create a mapping of city -> (lat, lon)
cities = df["Closest_City"].dropna().unique()
city_coords = {}
for c in tqdm(cities, desc="Geocoding city centers"):
    lat, lon = get_city_coords(c)
    city_coords[c] = (lat, lon)

# Function to query OSRM
def get_travel_time(lat1, lon1, city):
    lat2, lon2 = city_coords.get(city, (None, None))
    if lat2 is None or lon2 is None:
        return None
    url = f"https://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
    try:
        response = requests.get(url)
        data = response.json()
        if data.get("routes"):
            duration_sec = data["routes"][0]["duration"]
            return duration_sec / 60  # convert to minutes
    except Exception:
        return None
    return None

# Apply to DataFrame
tqdm.pandas()
df["TravelTime_Truck_min"] = df.progress_apply(
    lambda row: get_travel_time(row["Latitud"], row["Longitud"], row["Closest_City"]),
    axis=1
)

# Done!
print(df[["Latitud", "Longitud", "Closest_City", "TravelTime_Truck_min"]].head())


Geocoding city centers: 100%|██████████| 11/11 [00:11<00:00,  1.09s/it]
  9%|▉         | 1901/21158 [34:35<5:50:21,  1.09s/it]  


KeyboardInterrupt: 

In [None]:
df = pd.read_csv("dataSamples/df_sample.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2116 entries, 0 to 2115
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RutEmpresa                  2116 non-null   object 
 1   NombreEmpresa               2116 non-null   object 
 2   CategoriaFaena              2116 non-null   object 
 3   ProvinciaInstalacion        2116 non-null   object 
 4   ComunaInstalacion           2116 non-null   object 
 5   IdTipoInstalacion           2116 non-null   int64  
 6   TipoInstalacion             2116 non-null   object 
 7   RecursoPrimarioInstalacion  2116 non-null   object 
 8   Cota                        2116 non-null   int64  
 9   Huso                        2116 non-null   int64  
 10  IdEstado                    2116 non-null   int64  
 11  Estado                      2116 non-null   object 
 12  Latitud                     2116 non-null   float64
 13  Longitud                    2116 

In [None]:
desalinadoras = pd.read_excel("Bases/DESALINADORAS.xlsx")

In [None]:
desalinadoras.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Latitud   43 non-null     object
 1   Longitud  43 non-null     object
dtypes: object(2)
memory usage: 820.0+ bytes


In [None]:
estaciones = pd.read_csv("Bases/Estaciones.csv")

In [None]:
estaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        292 non-null    object 
 1   railway     292 non-null    object 
 2   usage       292 non-null    object 
 3   industrial  292 non-null    object 
 4   lat         292 non-null    float64
 5   lon         292 non-null    float64
dtypes: float64(2), object(4)
memory usage: 13.8+ KB


In [None]:
puertos = pd.read_excel("Bases/PuertosChile.xlsx")

In [None]:
puertos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitud    14 non-null     float64
 1   Longitud   14 non-null     float64
 2   Localidad  14 non-null     object 
dtypes: float64(2), object(1)
memory usage: 468.0+ bytes


In [None]:
subestaciones = pd.read_csv("Bases/Subestaciones.csv")

In [None]:
centrales_combinadas = pd.read_csv("Bases/centrales_combinadas.csv")

In [None]:
lineas_transmision = pd.read_csv("Bases/lineas_transmision.csv")

In [None]:
def clean_coords(df, cols):
    for col in cols:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .str.replace(r'[^0-9\.\-]', '', regex=True)
            .replace('', np.nan)  # empty strings → NaN
            .astype(float)
        )
    return df

def drop_missing_coords(df, lat_col, lon_col):
    before = len(df)
    df = df.dropna(subset=[lat_col, lon_col])
    after = len(df)
    if before != after:
        print(f"⚠️ Dropped {before - after} rows without valid coordinates.")
    return df


df = clean_coords(df, ['Latitud', 'Longitud'])
desalinadoras = clean_coords(desalinadoras, ['Latitud', 'Longitud'])
estaciones = clean_coords(estaciones, ['lat', 'lon'])
puertos = clean_coords(puertos, ['Latitud', 'Longitud'])
subestaciones = clean_coords(subestaciones, ['Latitude', 'Longitude'])
centrales_combinadas = clean_coords(centrales_combinadas, ['Latitude', 'Longitude'])
lineas_transmision = clean_coords(lineas_transmision, ['Latitude', 'Longitude'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import requests
from tqdm import tqdm


# Convert to string first, then replace commas, then to float
df['Latitud'] = df['Latitud'].astype(str).str.replace(',', '.', regex=False).astype(float)
df['Longitud'] = df['Longitud'].astype(str).str.replace(',', '.', regex=False).astype(float)


# Utility Functions


def to_radians(df, lat_col, lon_col):
    """Convert coordinates to radians for Haversine distance."""
    return np.radians(df[[lat_col, lon_col]].values)

def closest_point(main_df, ref_df, main_lat, main_lon, ref_lat, ref_lon, ref_name_col, prefix):
    """Find the closest reference point for each row using BallTree (Haversine distance)."""
    # build coords arrays (drop NA rows should have been done earlier)
    main_coords = to_radians(main_df[[main_lat, main_lon]].values)
    ref_coords = to_radians(ref_df[[ref_lat, ref_lon]].values)

    tree = BallTree(ref_coords, metric='haversine')
    dist, idx = tree.query(main_coords, k=1)
    dist_km = dist[:, 0] * 6371  # Earth radius -> km

    # safe extracting
    chosen = ref_df.reset_index(drop=True).iloc[idx[:, 0]].reset_index(drop=True)
    main_df[f"{prefix}_cercana"] = chosen[ref_name_col].values if ref_name_col and ref_name_col in ref_df.columns else idx[:, 0]
    main_df[f"Lat_{prefix}_cercana"] = chosen[ref_lat].values
    main_df[f"Lon_{prefix}_cercana"] = chosen[ref_lon].values
    main_df[f"Dist_{prefix}_km"] = dist_km
    return main_df


@lru_cache(maxsize=10000)
def _osrm_get(lon1, lat1, lon2, lat2):
    """Low-level cached OSRM call. Arguments must be primitive types (floats)."""
    try:
        url = f"http://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
        r = requests.get(url, timeout=10)
        data = r.json()
        if "routes" in data and data["routes"]:
            return data["routes"][0]["duration"] / 60.0  # minutes
    except Exception:
        pass
    return float("nan")

def osrm_driving_time(lat1, lon1, lat2, lon2):
    """Wrapper that orders args for the cached call and handles NaNs."""
    # If any input is NaN -> return NaN
    if any(pd.isna(x) for x in [lat1, lon1, lat2, lon2]):
        return float("nan")
    # convert to plain python floats (required for lru_cache key stability)
    return _osrm_get(float(lon1), float(lat1), float(lon2), float(lat2))

def compute_osrm_times(df, prefix, max_workers=8):
    """Compute OSRM travel times for each row with ThreadPoolExecutor and tqdm."""
    # Create a list of args
    args = []
    for _, row in df.iterrows():
        args.append((row["Latitud"], row["Longitud"],
                     row[f"Lat_{prefix}_cercana"], row[f"Lon_{prefix}_cercana"]))
    times = []
    # Parallel requests
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(osrm_driving_time, *a) for a in args]
        for f in tqdm(futures, total=len(futures), desc=f"🚚 Calculando tiempos hacia {prefix}"):
            try:
                times.append(f.result())
            except Exception:
                times.append(float("nan"))
    df[f"DriveTime_{prefix}_min"] = times
    return df

# Pipeline

def compute_accessibility_pipeline(df,
                                   desalinadoras,
                                   estaciones,
                                   puertos,
                                   subestaciones=None,
                                   centrales_combinadas=None,
                                   lineas_transmision=None,
                                   osrm_workers=8):
    """
    End-to-end pipeline:
     - clean coords (accepts different column names)
     - drop missing coords
     - compute nearest for multiple reference datasets
     - compute OSRM driving times (parallel + cached)
    """
    print("✅ Starting nearest-point and travel-time pipeline...")

    # --- CLEAN: adapt to different column names used in your datasets ---
    df = clean_coords(df, ['Latitud', 'Longitud'])
    desalinadoras = clean_coords(desalinadoras, ['Latitud', 'Longitud'])
    estaciones = clean_coords(estaciones, ['lat', 'lon'])
    puertos = clean_coords(puertos, ['Latitud', 'Longitud'])
    # these were the ones you pointed out with English columns
    if subestaciones is not None:
        subestaciones = clean_coords(subestaciones, ['Latitude', 'Longitude'])
    if centrales_combinadas is not None:
        centrales_combinadas = clean_coords(centrales_combinadas, ['Latitude', 'Longitude'])
    if lineas_transmision is not None:
        lineas_transmision = clean_coords(lineas_transmision, ['Latitude', 'Longitude'])

    # Normalize column names for estaciones to lowercase lat/lon -> make them consistent
    # estaciones used 'lat','lon' already; ensure column names exist
    for df_ref, latc, lonc in [
        (df, 'Latitud', 'Longitud'),
        (desalinadoras, 'Latitud', 'Longitud'),
        (estaciones, 'lat', 'lon'),
        (puertos, 'Latitud', 'Longitud'),
    ]:
        # drop missing coords and inform
        if df_ref is not None:
            df_ref = drop_missing_coords(df_ref, latc, lonc)
    # Reassign cleaned/dropped references back (some were mutated locally)
    # (If you want to keep originals, copy before)
    desalinadoras = drop_missing_coords(desalinadoras, 'Latitud', 'Longitud')
    estaciones = drop_missing_coords(estaciones, 'lat', 'lon')
    puertos = drop_missing_coords(puertos, 'Latitud', 'Longitud')
    df = drop_missing_coords(df, 'Latitud', 'Longitud')

    # Also clean/drop English-named refs if provided
    if subestaciones is not None:
        subestaciones = drop_missing_coords(subestaciones, 'Latitude', 'Longitude')
    if centrales_combinadas is not None:
        centrales_combinadas = drop_missing_coords(centrales_combinadas, 'Latitude', 'Longitude')
    if lineas_transmision is not None:
        lineas_transmision = drop_missing_coords(lineas_transmision, 'Latitude', 'Longitude')

    # --- NEAREST POINTS (BallTree) ---
    print("\n📍 Finding nearest reference points...")
    # Note: for 'estaciones' we used ref_name_col='name' in your original code; ensure column exists
    estaciones_name_col = 'name' if 'name' in estaciones.columns else None

    df = closest_point(df, desalinadoras, "Latitud", "Longitud", "Latitud", "Longitud",
                       ref_name_col=None, prefix="Desalinadora")
    df = closest_point(df, estaciones, "Latitud", "Longitud", "lat", "lon",
                       ref_name_col=estaciones_name_col, prefix="Estacion")
    df = closest_point(df, puertos, "Latitud", "Longitud", "Latitud", "Longitud",
                       ref_name_col='Localidad' if 'Localidad' in puertos.columns else None, prefix="Puerto")

    # Optionally add nearest for subestaciones / centrales / lineas if provided
    if subestaciones is not None:
        df = closest_point(df, subestaciones, "Latitud", "Longitud", "Latitude", "Longitude",
                           ref_name_col='Nombre' if 'Nombre' in subestaciones.columns else None, prefix="Subestacion")
    if centrales_combinadas is not None:
        df = closest_point(df, centrales_combinadas, "Latitud", "Longitud", "Latitude", "Longitude",
                           ref_name_col='Nombre' if 'Nombre' in centrales_combinadas.columns else None, prefix="Central")
    if lineas_transmision is not None:
        df = closest_point(df, lineas_transmision, "Latitud", "Longitud", "Latitude", "Longitude",
                           ref_name_col='Nombre' if 'Nombre' in lineas_transmision.columns else None, prefix="LineaTransm")

    # --- OSRM travel times ---
    print("\n🛣️  Calculating driving times (this may take a while)...")
    df = compute_osrm_times(df, "Desalinadora", max_workers=osrm_workers)
    df = compute_osrm_times(df, "Estacion", max_workers=osrm_workers)
    df = compute_osrm_times(df, "Puerto", max_workers=osrm_workers)

    if subestaciones is not None:
        df = compute_osrm_times(df, "Subestacion", max_workers=osrm_workers)
    if centrales_combinadas is not None:
        df = compute_osrm_times(df, "Central", max_workers=osrm_workers)
    if lineas_transmision is not None:
        df = compute_osrm_times(df, "LineaTransm", max_workers=osrm_workers)

    # --- Summary ---
    print("\n🎯 Pipeline complete! Example columns added (check df.columns):")
    expected = [
        "Desalinadora_cercana", "Dist_Desalinadora_km", "DriveTime_Desalinadora_min",
        "Estacion_cercana", "Dist_Estacion_km", "DriveTime_Estacion_min",
        "Puerto_cercana", "Dist_Puerto_km", "DriveTime_Puerto_min"
    ]
    print(expected)
    return df




✅ Starting nearest-point and travel-time pipeline...

📍 Finding nearest reference points...

🛣️  Calculating driving times (this may take a while)...


🚚 Calculando tiempos hacia Desalinadora:   0%|          | 0/2116 [00:00<?, ?it/s]

🚚 Calculando tiempos hacia Desalinadora: 100%|██████████| 2116/2116 [35:16<00:00,  1.00s/it]
🚚 Calculando tiempos hacia Estacion: 100%|██████████| 2116/2116 [35:16<00:00,  1.00s/it]
🚚 Calculando tiempos hacia Puerto: 100%|██████████| 2116/2116 [35:16<00:00,  1.00s/it]


🎯 Pipeline complete! Columns added:
['Desalinadora_cercana', 'Dist_Desalinadora_km', 'DriveTime_Desalinadora_min', 'Estacion_cercana', 'Dist_Estacion_km', 'DriveTime_Estacion_min', 'Puerto_cercano', 'Dist_Puerto_km', 'DriveTime_Puerto_min']





In [None]:
df = clean_coords(df, ['Latitud', 'Longitud'])
desalinadoras = clean_coords(desalinadoras, ['Latitud', 'Longitud'])
estaciones = clean_coords(estaciones, ['lat', 'lon'])
puertos = clean_coords(puertos, ['Latitud', 'Longitud'])
subestaciones = clean_coords(subestaciones, ['Latitude', 'Longitude'])
centrales_combinadas = clean_coords(centrales_combinadas, ['Latitude', 'Longitude'])
lineas_transmision = clean_coords(lineas_transmision, ['Latitude', 'Longitude'])

df_enriched = compute_accessibility_pipeline(
    df,
    desalinadoras,
    estaciones,
    puertos,
    subestaciones=subestaciones,
    centrales_combinadas=centrales_combinadas,
    lineas_transmision=lineas_transmision,
    osrm_workers=12
)


In [None]:
#df_enriched.to_csv("faenas_with_accessibility.csv", index=False)