# Trabajo Tesis: Preprocesamiento faenas

### Librerias y bases

In [23]:
import os
import requests

import numpy as np
import pandas as pd

from tqdm import tqdm
from IPython.display import display, HTML

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.distance import geodesic
from geopy.distance import distance
from scipy.spatial import KDTree

In [24]:
#os.chdir('c:/Users/artur/OneDrive/Documents/TrabajoTesis') 
os.chdir('C:/Users/admin/OneDrive/Documents/TrabajoTesis')

In [3]:
df = pd.read_excel('Cluster/Faenas25.xlsx')

KeyboardInterrupt: 

In [80]:
# Limpiar nombres de columnas
df.columns = ['RutEmpresa' if col == 'RutEmpresa' else col.lstrip() for col in df.columns]

# Limpiar espacios dentro de las columnas (excepto RutEmpresa)
for col in df.select_dtypes(include='object').columns:
    if col != 'RutEmpresa':
        df[col] = df[col].str.strip()

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28319 entries, 0 to 28318
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RutEmpresa                  28319 non-null  object 
 1   NombreEmpresa               28319 non-null  object 
 2   RegionFaena                 28319 non-null  object 
 3   ProvinciaFaena              28319 non-null  object 
 4   ComunaFaena                 28319 non-null  object 
 5   NombreFaena                 28318 non-null  object 
 6   CategoriaFaena              28319 non-null  object 
 7   IdFaena                     28319 non-null  int64  
 8   RegionInstalacion           28319 non-null  object 
 9   ProvinciaInstalacion        28319 non-null  object 
 10  ComunaInstalacion           28319 non-null  object 
 11  NombreInstalacion           28319 non-null  object 
 12  IdTipoInstalacion           28319 non-null  int64  
 13  TipoInstalacion             283

In [82]:
# Convert lat/lon to radians
coords = np.radians(df[['Latitud', 'Longitud']].values)

# Build KDTree
tree = KDTree(coords)

# Radius (10 km → radians)
radius_km = 10
radius_rad = radius_km / 6371.0

# Prepare lists
nearby_counts = []
nearby_categories = []
categoriaA_counts = []

# Loop with progress bar
for i, coord in tqdm(enumerate(coords), total=len(coords), desc="🔍 Calculating 10km radius"):
    idxs = tree.query_ball_point(coord, r=radius_rad)
    idxs = [j for j in idxs if j != i]  # exclude self
    
    categorias_cercanas = df.loc[idxs, 'CategoriaFaena'].tolist()
    
    nearby_counts.append(len(idxs))
    nearby_categories.append(categorias_cercanas)
    
    # Count only 'A'
    categoriaA_counts.append(sum(cat == 'CATEGORIA A' for cat in categorias_cercanas))

# Add results to df
df['faenas_10km'] = nearby_counts
df['categorias_10km'] = nearby_categories
df['categoriaA_10km'] = categoriaA_counts

df.head()


🔍 Calculating 10km radius: 100%|██████████| 28319/28319 [00:15<00:00, 1796.01it/s]


Unnamed: 0,RutEmpresa,NombreEmpresa,RegionFaena,ProvinciaFaena,ComunaFaena,NombreFaena,CategoriaFaena,IdFaena,RegionInstalacion,ProvinciaInstalacion,...,Cota,Huso,Datum,IdEstado,Estado,Latitud,Longitud,faenas_10km,categorias_10km,categoriaA_10km
0,61703000-4,EMPRESA NACIONAL DE MINERIA (ENAMI),III,HUASCO,VALLENAR,PLANTA DE BENEFICIO DE MINERALES - VALLENAR,CATEGORIA B,153576,ATACAMA,HUASCO,...,538,19,PSAD-56,8,ACTIVA,-28.566725,-70.740064,72,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
1,76125921-0,CUPRUM RESOURCES CHILE SPA,IV,ELQUI,LA HIGUERA,LA VERDE,CATEGORIA D,20012162,COQUIMBO,ELQUI,...,1200,19,PSAD-56,169,INACTIVA,-29.43292,-70.683699,66,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
2,76362163-4,MINERA RUPERTO PATRICIO ORTIZ PALOMINOS,VI,COLCHAGUA,LOLOL,MINA LA VENUS,CATEGORIA D,20013211,LIBERTADOR GENERAL BERNARDO OHIGGINS,COLCHAGUA,...,134,19,PSAD-56,12,ABANDONADA,-34.677189,-71.498091,3,"[CATEGORIA D, CATEGORIA D, CATEGORIA C]",0
3,94638000-8,CIA. MRA. DEL PACIFICO S.A.,III,COPIAPO,TIERRA AMARILLA,PLANTA MAGNETITA,CATEGORIA A,33981,ATACAMA,COPIAPO,...,766,19,PSAD-56,8,ACTIVA,-27.520799,-70.320604,541,"[CATEGORIA D, CATEGORIA C, CATEGORIA D, CATEGO...",106
4,6891906-1,GARY NELSON MERCADO MALBRAN,III,CHAÑARAL,CHAÑARAL,DIEGO DAKAR 1/10,CATEGORIA D,20013237,ATACAMA,CHAÑARAL,...,531,19,PSAD-56,8,ACTIVA,-26.411292,-70.414104,164,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0


In [83]:
dfcata = df.loc[df['categoriaA_10km'] > 0, ['RutEmpresa', 'faenas_10km', 'categoriaA_10km', 'categorias_10km']]


In [84]:
# Ordenar por cantidad de CATEGORIA A (descendente)
dfcata_sorted = dfcata.sort_values(by='categoriaA_10km', ascending=False).reset_index(drop=True)
dfcata_sorted

Unnamed: 0,RutEmpresa,faenas_10km,categoriaA_10km,categorias_10km
0,77762940-9,202,200,"[CATEGORIA A, CATEGORIA A, CATEGORIA A, CATEGO..."
1,77762940-9,200,199,"[CATEGORIA A, CATEGORIA A, CATEGORIA A, CATEGO..."
2,77762940-9,200,199,"[CATEGORIA A, CATEGORIA A, CATEGORIA A, CATEGO..."
3,77762940-9,200,199,"[CATEGORIA A, CATEGORIA A, CATEGORIA A, CATEGO..."
4,77762940-9,200,199,"[CATEGORIA A, CATEGORIA A, CATEGORIA A, CATEGO..."
...,...,...,...,...
10366,76465515-K,58,1,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO..."
10367,78801520-8,183,1,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, SIN CA..."
10368,76011272-0,73,1,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO..."
10369,5956464-1,332,1,"[CATEGORIA C, CATEGORIA C, CATEGORIA C, CATEGO..."


### Limpieza

Nos quedamos solo con las faenas de producción de cobre

In [85]:
df = df[df['RecursoPrimarioInstalacion'] == 'COBRE']
df.shape

(21158, 30)

Dropeo de variables de poca relevancia, es decir que nos sea id o de ubicación, o que nos ayude a analizar en corto plazo

In [86]:
df = df.drop(columns=[
    'RegionFaena', 'ProvinciaFaena', 'ComunaFaena', 'NombreFaena', 'IdFaena',
    'ComunaInstalacion','ProvinciaInstalacion', 'NombreInstalacion', 'RecursoMineroInstalacion', 'RecursoPrimarioInstalacion',
    'TipoRecursoInstalacion', 'IdInstalacion', 'Norte', 'Este', 'Datum', 'IdEstado', 'IdTipoInstalacion'
])

In [87]:
df

Unnamed: 0,RutEmpresa,NombreEmpresa,CategoriaFaena,RegionInstalacion,TipoInstalacion,Cota,Huso,Estado,Latitud,Longitud,faenas_10km,categorias_10km,categoriaA_10km
0,61703000-4,EMPRESA NACIONAL DE MINERIA (ENAMI),CATEGORIA B,ATACAMA,TALLERES Y MAESTRANZA,538,19,ACTIVA,-28.566725,-70.740064,72,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
1,76125921-0,CUPRUM RESOURCES CHILE SPA,CATEGORIA D,COQUIMBO,EXPLORACION DE SUPERFICIE,1200,19,INACTIVA,-29.432920,-70.683699,66,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
2,76362163-4,MINERA RUPERTO PATRICIO ORTIZ PALOMINOS,CATEGORIA D,LIBERTADOR GENERAL BERNARDO OHIGGINS,MINA SUBTERRANEA,134,19,ABANDONADA,-34.677189,-71.498091,3,"[CATEGORIA D, CATEGORIA D, CATEGORIA C]",0
4,6891906-1,GARY NELSON MERCADO MALBRAN,CATEGORIA D,ATACAMA,MINA SUBTERRANEA,531,19,ACTIVA,-26.411292,-70.414104,164,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
12,76448510-6,SLM CASUALIDAD PRIMERA DE EL MANZANO,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,917,19,IRREGULAR OPERATIVA,-30.204069,-71.082232,711,"[CATEGORIA A, CATEGORIA D, CATEGORIA D, CATEGO...",71
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28311,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,CAMPAMENTO,1277,19,EN REVISIÓN DE PROYECTO,-31.216132,-71.267759,130,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
28312,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,1199,19,EN REVISIÓN DE PROYECTO,-31.220795,-71.265226,135,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
28313,6188182-4,EDILIO GOMEZ OYARZUN,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,458,19,EN REVISIÓN DE PROYECTO,-31.548972,-71.343902,51,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0
28314,78110627-5,C & C MINERIA SpA,CATEGORIA D,ANTOFAGASTA,POLVORIN,3250,19,EN REVISIÓN DE PROYECTO,-23.044930,-68.557016,3,"[CATEGORIA D, CATEGORIA D, CATEGORIA D]",0


In [88]:
df_sample = df #.sample(frac=0.05, random_state=42).copy() 


### Enriquesimiento

In [89]:
# Coordinates of the 15 cities
cities_coords = {
    "Santiago": (-33.4489, -70.6693),
    "Valparaíso": (-33.0472, -71.6127),
    "Concepción": (-36.8201, -73.0444),
    "La Serena": (-29.9037, -71.2489),
    "Coquimbo": (-29.9533, -71.3395),
    "Antofagasta": (-23.6500, -70.4000),
    "Temuco": (-38.7369, -72.5904),
    "Rancagua": (-34.1708, -70.7400),
    "Iquique": (-20.2307, -70.1350),
    "Puerto Montt": (-41.4694, -72.9420),
    "Talca": (-35.4261, -71.6550),
    "Arica": (-18.4783, -70.3126),
    "Chillán": (-36.6066, -72.1034),
    "Los Ángeles": (-37.4600, -72.3500),
    "Copiapó": (-27.3668, -70.3326),
    "Valdivia": (-39.8196, -73.2459)
}

# Function to find nearest city
def nearest_city(lat, lon):
    closest_city = min(cities_coords.keys(), key=lambda city: geodesic((lat, lon), cities_coords[city]).km)
    return closest_city

# Apply to your dataframe
tqdm.pandas()
df_sample["Closest_City"] = df_sample.progress_apply(
    lambda row: nearest_city(row["Latitud"], row["Longitud"]), axis=1
)

print(df_sample.head())

100%|██████████| 21158/21158 [00:54<00:00, 386.36it/s]

    RutEmpresa                            NombreEmpresa CategoriaFaena  \
0   61703000-4      EMPRESA NACIONAL DE MINERIA (ENAMI)    CATEGORIA B   
1   76125921-0               CUPRUM RESOURCES CHILE SPA    CATEGORIA D   
2   76362163-4  MINERA RUPERTO PATRICIO ORTIZ PALOMINOS    CATEGORIA D   
4    6891906-1              GARY NELSON MERCADO MALBRAN    CATEGORIA D   
12  76448510-6     SLM CASUALIDAD PRIMERA DE EL MANZANO    CATEGORIA D   

                       RegionInstalacion            TipoInstalacion  Cota  \
0                                ATACAMA      TALLERES Y MAESTRANZA   538   
1                               COQUIMBO  EXPLORACION DE SUPERFICIE  1200   
2   LIBERTADOR GENERAL BERNARDO OHIGGINS           MINA SUBTERRANEA   134   
4                                ATACAMA           MINA SUBTERRANEA   531   
12                              COQUIMBO           MINA SUBTERRANEA   917   

    Huso               Estado    Latitud   Longitud  faenas_10km  \
0     19               A




In [90]:

html = "<div style='height:300px; overflow:auto;'>" + df_sample['Closest_City'].value_counts().to_frame().to_html() + "</div>"
display(HTML(html))

Unnamed: 0_level_0,count
Closest_City,Unnamed: 1_level_1
Copiapó,6015
Valparaíso,4017
Coquimbo,3416
Antofagasta,2791
La Serena,2701
Santiago,1049
Iquique,465
Rancagua,374
Arica,313
Talca,14


In [91]:
df_sample

Unnamed: 0,RutEmpresa,NombreEmpresa,CategoriaFaena,RegionInstalacion,TipoInstalacion,Cota,Huso,Estado,Latitud,Longitud,faenas_10km,categorias_10km,categoriaA_10km,Closest_City
0,61703000-4,EMPRESA NACIONAL DE MINERIA (ENAMI),CATEGORIA B,ATACAMA,TALLERES Y MAESTRANZA,538,19,ACTIVA,-28.566725,-70.740064,72,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,Copiapó
1,76125921-0,CUPRUM RESOURCES CHILE SPA,CATEGORIA D,COQUIMBO,EXPLORACION DE SUPERFICIE,1200,19,INACTIVA,-29.432920,-70.683699,66,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,La Serena
2,76362163-4,MINERA RUPERTO PATRICIO ORTIZ PALOMINOS,CATEGORIA D,LIBERTADOR GENERAL BERNARDO OHIGGINS,MINA SUBTERRANEA,134,19,ABANDONADA,-34.677189,-71.498091,3,"[CATEGORIA D, CATEGORIA D, CATEGORIA C]",0,Talca
4,6891906-1,GARY NELSON MERCADO MALBRAN,CATEGORIA D,ATACAMA,MINA SUBTERRANEA,531,19,ACTIVA,-26.411292,-70.414104,164,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,Copiapó
12,76448510-6,SLM CASUALIDAD PRIMERA DE EL MANZANO,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,917,19,IRREGULAR OPERATIVA,-30.204069,-71.082232,711,"[CATEGORIA A, CATEGORIA D, CATEGORIA D, CATEGO...",71,La Serena
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28311,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,CAMPAMENTO,1277,19,EN REVISIÓN DE PROYECTO,-31.216132,-71.267759,130,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,Coquimbo
28312,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,1199,19,EN REVISIÓN DE PROYECTO,-31.220795,-71.265226,135,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,Coquimbo
28313,6188182-4,EDILIO GOMEZ OYARZUN,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,458,19,EN REVISIÓN DE PROYECTO,-31.548972,-71.343902,51,"[CATEGORIA D, CATEGORIA D, CATEGORIA D, CATEGO...",0,Valparaíso
28314,78110627-5,C & C MINERIA SpA,CATEGORIA D,ANTOFAGASTA,POLVORIN,3250,19,EN REVISIÓN DE PROYECTO,-23.044930,-68.557016,3,"[CATEGORIA D, CATEGORIA D, CATEGORIA D]",0,Antofagasta


In [None]:
import requests
import pandas as pd
from tqdm import tqdm

# Your API key 
API_KEY = 'YOUR_GOOGLE'

# City coordinates dictionary (from your dataset)
cities_coords = {
    "Santiago": (-33.4489, -70.6693),
    "Valparaíso": (-33.0472, -71.6127),
    "Concepción": (-36.8201, -73.0444),
    "La Serena": (-29.9037, -71.2489),
    "Coquimbo": (-29.9533, -71.3395),
    "Antofagasta": (-23.6500, -70.4000),
    "Temuco": (-38.7369, -72.5904),
    "Rancagua": (-34.1708, -70.7400),
    "Iquique": (-20.2307, -70.1350),
    "Puerto Montt": (-41.4694, -72.9420),
    "Talca": (-35.4261, -71.6550),
    "Arica": (-18.4783, -70.3126),
    "Chillán": (-36.6066, -72.1034),
    "Los Ángeles": (-37.4600, -72.3500),
    "Copiapó": (-27.3668, -70.3326),
    "Valdivia": (-39.8196, -73.2459)
}

# --- Load your data (or replace this with your DataFrame variable)
df_ciudades = df_sample.copy()

# --- Function to get distance and duration using Google Routes API
def get_route_info(origin_lat, origin_lon, dest_lat, dest_lon):
    url = "https://routes.googleapis.com/directions/v2:computeRoutes"
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": API_KEY,
        "X-Goog-FieldMask": "routes.distanceMeters,routes.duration"
    }
    body = {
        "origin": {"location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}},
        "destination": {"location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}}},
        "travelMode": "DRIVE"
    }
    try:
        response = requests.post(url, headers=headers, json=body)
        data = response.json()
        if "routes" in data and len(data["routes"]) > 0:
            route = data["routes"][0]
            distance_km = route["distanceMeters"] / 1000
            duration_sec = int(route["duration"].replace("s", ""))
            return distance_km, duration_sec / 60  # convert to minutes
    except Exception as e:
        print(f"Error: {e}")
    return None, None

# --- Add distance and time columns
distances = []
durations = []

for _, row in tqdm(df_ciudades.iterrows(), total=len(df_ciudades)):
    city = row["Closest_City"]
    if city in cities_coords:
        city_lat, city_lon = cities_coords[city]
        dist, dur = get_route_info(row["Latitud"], row["Longitud"], city_lat, city_lon)
    else:
        dist, dur = None, None
    distances.append(dist)
    durations.append(dur)

df_ciudades["Distance_km"] = [round(d, 1) if d else None for d in distances]
df_ciudades["Travel_min"] = [round(t, 1) if t else None for t in durations]

print(df_ciudades[["NombreEmpresa", "Closest_City", "Distance_km", "Travel_min"]])



100%|██████████| 21158/21158 [17:37<00:00, 20.01it/s]

                                 NombreEmpresa Closest_City  Distance_km  \
0          EMPRESA NACIONAL DE MINERIA (ENAMI)      Copiapó        144.2   
1                   CUPRUM RESOURCES CHILE SPA    La Serena        130.5   
2      MINERA RUPERTO PATRICIO ORTIZ PALOMINOS        Talca        120.3   
4                  GARY NELSON MERCADO MALBRAN      Copiapó        144.2   
12        SLM CASUALIDAD PRIMERA DE EL MANZANO    La Serena         53.6   
...                                        ...          ...          ...   
28311                    BALFENI SOLUTIONS SPA     Coquimbo        246.5   
28312                    BALFENI SOLUTIONS SPA     Coquimbo        246.5   
28313                     EDILIO GOMEZ OYARZUN   Valparaíso        252.3   
28314                        C & C MINERIA SpA  Antofagasta          NaN   
28317                 ANA MARIA NOEMI MUNIZAGA      Copiapó         38.6   

       Travel_min  
0           103.4  
1           112.2  
2           102.4  
4      




In [98]:
import requests
import time
from tqdm import tqdm

# OSRM fallback function (same as before)
def osrm_route(origin_lat, origin_lon, dest_lat, dest_lon):
    url = f"http://router.project-osrm.org/route/v1/driving/{origin_lon},{origin_lat};{dest_lon},{dest_lat}?overview=false"
    try:
        response = requests.get(url)
        data = response.json()
        if "routes" in data and len(data["routes"]) > 0:
            route = data["routes"][0]
            distance_km = route["distance"] / 1000
            duration_min = route["duration"] / 60
            return distance_km, duration_min
    except:
        pass
    return None, None

# Only process rows where Travel_min is NA
na_rows = df_ciudades[df_ciudades["Travel_min"].isna()].copy()

for idx, row in tqdm(na_rows.iterrows(), total=len(na_rows)):
    city = row["Closest_City"]
    if city in cities_coords:
        city_lat, city_lon = cities_coords[city]
        dist, dur = osrm_route(row["Latitud"], row["Longitud"], city_lat, city_lon)
        df_ciudades.at[idx, "Distance_km"] = round(dist, 1) if dist else None
        df_ciudades.at[idx, "Travel_min"] = round(dur, 1) if dur else None
        time.sleep(0.05)  # small delay to avoid OSRM rate limit


100%|██████████| 1739/1739 [29:21<00:00,  1.01s/it]


In [282]:
# Count missing values per column
df_ciudades.isna().sum()

RutEmpresa           0
NombreEmpresa        0
CategoriaFaena       0
RegionInstalacion    0
TipoInstalacion      0
Cota                 0
Huso                 0
Estado               0
Latitud              0
Longitud             0
faenas_10km          0
categorias_10km      0
categoriaA_10km      0
Closest_City         0
Distance_km          0
Travel_min           0
dtype: int64

In [None]:
#df_ciudades.to_csv('Cluster/df_ciudades.csv', index=False)

In [4]:
df_ciudades = pd.read_csv('Cluster/df_ciudades.csv')

In [5]:
df = df_ciudades.copy()

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21158 entries, 0 to 21157
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   RutEmpresa         21158 non-null  object 
 1   NombreEmpresa      21158 non-null  object 
 2   CategoriaFaena     21158 non-null  object 
 3   RegionInstalacion  21158 non-null  object 
 4   TipoInstalacion    21158 non-null  object 
 5   Cota               21158 non-null  int64  
 6   Huso               21158 non-null  int64  
 7   Estado             21158 non-null  object 
 8   Latitud            21158 non-null  float64
 9   Longitud           21158 non-null  float64
 10  faenas_10km        21158 non-null  int64  
 11  categorias_10km    21158 non-null  object 
 12  categoriaA_10km    21158 non-null  int64  
 13  Closest_City       21158 non-null  object 
 14  Distance_km        21158 non-null  float64
 15  Travel_min         21158 non-null  float64
dtypes: float64(4), int64(4

In [7]:
desaladoras = pd.read_csv("Bases/plantas_desaladoras_combinado_final.csv")

In [8]:
desaladoras.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Empresa/Operador         51 non-null     object 
 1   Nombre de la Planta      51 non-null     object 
 2   Tipo de Planta           51 non-null     object 
 3   Región                   51 non-null     object 
 4   Uso/Aplicación           51 non-null     object 
 5   Estado Operacional       51 non-null     object 
 6   Capacidad (Valor)        51 non-null     float64
 7   Año de Puesta en Marcha  51 non-null     object 
 8   Latitude                 51 non-null     float64
 9   Longitude                51 non-null     float64
dtypes: float64(3), object(7)
memory usage: 4.1+ KB


In [30]:
estaciones = pd.read_csv("Bases/Estaciones.csv")

In [9]:
estaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       742 non-null    object 
 1   longitude  742 non-null    float64
 2   latitude   742 non-null    float64
dtypes: float64(2), object(1)
memory usage: 17.5+ KB


In [21]:
puertos = pd.read_csv("Bases/puertos.csv")

In [22]:
puertos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   portNumber  39 non-null     int64  
 1   portName    39 non-null     object 
 2   latitude    39 non-null     float64
 3   longitude   39 non-null     float64
 4   harborSize  39 non-null     object 
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [12]:
desaladoras.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Empresa/Operador         51 non-null     object 
 1   Nombre de la Planta      51 non-null     object 
 2   Tipo de Planta           51 non-null     object 
 3   Región                   51 non-null     object 
 4   Uso/Aplicación           51 non-null     object 
 5   Estado Operacional       51 non-null     object 
 6   Capacidad (Valor)        51 non-null     float64
 7   Año de Puesta en Marcha  51 non-null     object 
 8   Latitude                 51 non-null     float64
 9   Longitude                51 non-null     float64
dtypes: float64(3), object(7)
memory usage: 4.1+ KB


In [13]:
desaladoras = desaladoras[desaladoras["Uso/Aplicación"].isin(["Industrial","Minería"])]

In [20]:
df_sample = df_ciudades.copy()
#df_sample = df_sample.sample(frac=0.05, random_state=42).copy()

NameError: name 'df_ciudades' is not defined

In [None]:
df = df_sample.copy()

In [16]:
df.reset_index(drop=True, inplace=True)

In [18]:
import pandas as pd


In [21]:
df

Unnamed: 0,RutEmpresa,NombreEmpresa,CategoriaFaena,RegionInstalacion,TipoInstalacion,Cota,Huso,Estado,Latitud,Longitud,...,Distance_km,Travel_min,Empresa/Operador,Nombre de la Planta,Tipo de Planta,Uso/Aplicación,Capacidad (Valor),Año de Puesta en Marcha,travel_dist_km,travel_time_desal
0,61703000-4,EMPRESA NACIONAL DE MINERIA (ENAMI),CATEGORIA B,ATACAMA,TALLERES Y MAESTRANZA,538,19,ACTIVA,-28.566725,-70.740064,...,144.2,103.4,Guacolda,Huasco,Plantas deslinizadoras de agua de mar con capa...,Industrial,70.0,1997,51.758329,58.666667
1,76125921-0,CUPRUM RESOURCES CHILE SPA,CATEGORIA D,COQUIMBO,EXPLORACION DE SUPERFICIE,1200,19,INACTIVA,-29.432920,-70.683699,...,130.5,112.2,Nueva Unión,Nueva Unión,Iniciativas y proyectos en evaluación preliminar,Minería,970.0,ND,99.097720,137.216667
2,76362163-4,MINERA RUPERTO PATRICIO ORTIZ PALOMINOS,CATEGORIA D,LIBERTADOR GENERAL BERNARDO OHIGGINS,MINA SUBTERRANEA,134,19,ABANDONADA,-34.677189,-71.498091,...,120.3,102.4,Eléctrica Campiche SpA,Unidad 4 Ventanas,Plantas deslinizadoras de agua de mar con capa...,Industrial,30.0,2008,218.381473,247.316667
3,6891906-1,GARY NELSON MERCADO MALBRAN,CATEGORIA D,ATACAMA,MINA SUBTERRANEA,531,19,ACTIVA,-26.411292,-70.414104,...,144.2,122.0,Minera Mantos Copper,Minera Mantoverde,Plantas deslinizadoras de agua de mar con capa...,Minería,120.0,2014,31.494822,41.766667
4,76448510-6,SLM CASUALIDAD PRIMERA DE EL MANZANO,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,917,19,IRREGULAR OPERATIVA,-30.204069,-71.082232,...,53.6,59.7,Minera los Pelambres,Fase inicial Ampliación IV Los Pelambres,Plantas en construcción o prontas a entrar en ...,Minería,400.0,2023,190.208932,182.316667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21153,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,CAMPAMENTO,1277,19,EN REVISIÓN DE PROYECTO,-31.216132,-71.267759,...,246.5,191.2,Minera los Pelambres,Fase inicial Ampliación IV Los Pelambres,Plantas en construcción o prontas a entrar en ...,Minería,400.0,2023,74.081609,88.783333
21154,78131832-9,BALFENI SOLUTIONS SPA,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,1199,19,EN REVISIÓN DE PROYECTO,-31.220795,-71.265226,...,246.5,191.2,Minera los Pelambres,Fase inicial Ampliación IV Los Pelambres,Plantas en construcción o prontas a entrar en ...,Minería,400.0,2023,74.081609,88.783333
21155,6188182-4,EDILIO GOMEZ OYARZUN,CATEGORIA D,COQUIMBO,MINA SUBTERRANEA,458,19,EN REVISIÓN DE PROYECTO,-31.548972,-71.343902,...,252.3,197.9,Minera los Pelambres,Fase inicial Ampliación IV Los Pelambres,Plantas en construcción o prontas a entrar en ...,Minería,400.0,2023,39.419010,71.300000
21156,78110627-5,C & C MINERIA SpA,CATEGORIA D,ANTOFAGASTA,POLVORIN,3250,19,EN REVISIÓN DE PROYECTO,-23.044930,-68.557016,...,328.0,265.9,Compañía Minera Mantos La Luna,Mantos de la Luna,Plantas menores en operación,Industrial,5.0,2005,184.678253,263.763333


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21158 entries, 0 to 21157
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   RutEmpresa               21158 non-null  object 
 1   NombreEmpresa            21158 non-null  object 
 2   CategoriaFaena           21158 non-null  object 
 3   RegionInstalacion        21158 non-null  object 
 4   TipoInstalacion          21158 non-null  object 
 5   Cota                     21158 non-null  int64  
 6   Huso                     21158 non-null  int64  
 7   Estado                   21158 non-null  object 
 8   Latitud                  21158 non-null  float64
 9   Longitud                 21158 non-null  float64
 10  faenas_10km              21158 non-null  int64  
 11  categorias_10km          21158 non-null  object 
 12  categoriaA_10km          21158 non-null  int64  
 13  Closest_City             21158 non-null  object 
 14  Distance_km           

In [None]:
#df.to_csv('Cluster/df_faenas_y_desaladoras.csv', index=False)

In [25]:
puertos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   portNumber  39 non-null     int64  
 1   portName    39 non-null     object 
 2   latitude    39 non-null     float64
 3   longitude   39 non-null     float64
 4   harborSize  39 non-null     object 
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [25]:
df = pd.read_csv('Cluster/df_faenas_y_desaladoras.csv')

In [26]:
df_faenas = df.copy()

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm
import requests

# -----------------------------
# Parameters
# -----------------------------
RADIUS_KM = 5
TOP_N = 2  # closest ports to consider

# -----------------------------
# Helper functions
# -----------------------------
def haversine_distance(lat1, lon1, lat2, lon2):
    """Distance in km between two points."""
    return geodesic((lat1, lon1), (lat2, lon2)).km

def osrm_travel_time(lat1, lon1, lat2, lon2):
    """Travel time in minutes using OSRM."""
    try:
        url = f"http://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
        r = requests.get(url, timeout=10)
        data = r.json()
        if "routes" in data and len(data["routes"]) > 0:
            return data["routes"][0]["duration"] / 60  # minutes
    except:
        pass
    return None

# Initialize columns
string_cols = ['portName', 'harborSize']
numeric_cols = ['travel_dist_km', 'travel_time_min']

for col in string_cols:
    df_faenas[col] = pd.Series(dtype="object")
for col in numeric_cols:
    df_faenas[col] = np.nan

# -----------------------------
# 2️⃣ Assign port info by radius
# -----------------------------
unassigned_mask = df_faenas['travel_time_min'].isna()
pbar = tqdm(total=unassigned_mask.sum(), desc="Assigning port info")

while unassigned_mask.any():
    idx = df_faenas[unassigned_mask].index[0]
    point = df_faenas.loc[idx]
    lat0, lon0 = point['Latitud'], point['Longitud']

    # Find all mines within radius
    df_faenas['distance_to_point'] = df_faenas.apply(
        lambda x: haversine_distance(lat0, lon0, x['Latitud'], x['Longitud']), axis=1
    )
    in_radius = df_faenas['distance_to_point'] <= RADIUS_KM

    # Compute distances to ports
    puertos_copy = puertos.copy()
    puertos_copy['distance_km'] = puertos_copy.apply(
        lambda x: haversine_distance(lat0, lon0, x['latitude'], x['longitude']), axis=1
    )
    top_ports = puertos_copy.nsmallest(TOP_N, 'distance_km')

    # Compute OSRM travel times
    travel_times = []
    for _, port in top_ports.iterrows():
        t = osrm_travel_time(lat0, lon0, port['latitude'], port['longitude'])
        travel_times.append(t)
    top_ports = top_ports.copy()
    top_ports['travel_time'] = travel_times
    top_ports = top_ports.dropna(subset=['travel_time'])
    if top_ports.empty:
        df_faenas.loc[in_radius, 'travel_time_min'] = -1
        pbar.update(in_radius.sum())
        unassigned_mask = df_faenas['travel_time_min'].isna()
        continue

    # Pick port with minimum travel time
    best = top_ports.loc[top_ports['travel_time'].idxmin()]

    # Assign port info + distance + travel time to all mines in radius
    df_faenas.loc[in_radius, 'portName'] = best['portName']
    df_faenas.loc[in_radius, 'harborSize'] = best['harborSize']
    df_faenas.loc[in_radius, 'travel_dist_km'] = best['distance_km']
    df_faenas.loc[in_radius, 'travel_time_min'] = best['travel_time']

    unassigned_mask = df_faenas['travel_time_min'].isna()
    pbar.update(in_radius.sum())

pbar.close()
df_faenas.drop(columns=['distance_to_point'], inplace=True)
print("✅ Finished assigning port info to all mines")


Assigning port info: 31506it [1:57:35,  4.47it/s]                           

✅ Finished assigning port info to all mines





In [None]:
#df_faenas.to_csv('Cluster/df_faenas_y_desaladoras_y_puertos.csv', index=False)

In [31]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm
import requests

# -----------------------------
# Parameters
# -----------------------------
RADIUS_KM = 5
TOP_N = 2  # closest stations to consider

# -----------------------------
# Helper functions
# -----------------------------
def haversine_distance(lat1, lon1, lat2, lon2):
    """Distance in km between two points."""
    return geodesic((lat1, lon1), (lat2, lon2)).km

def osrm_travel_time(lat1, lon1, lat2, lon2):
    """Travel time in minutes using OSRM."""
    try:
        url = f"http://router.project-osrm.org/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
        r = requests.get(url, timeout=10)
        data = r.json()
        if "routes" in data and len(data["routes"]) > 0:
            return data["routes"][0]["duration"] / 60  # minutes
    except:
        pass
    return None

# -----------------------------
# 1️⃣ Initialize columns
# -----------------------------
string_cols = ['stationName']
numeric_cols = ['travel_dist_km', 'travel_time_min']

for col in string_cols:
    df_faenas[col] = pd.Series(dtype="object")
for col in numeric_cols:
    df_faenas[col] = np.nan

# -----------------------------
# 2️⃣ Assign station info by radius
# -----------------------------
unassigned_mask = df_faenas['travel_time_min'].isna()
pbar = tqdm(total=unassigned_mask.sum(), desc="Assigning station info")

while unassigned_mask.any():
    idx = df_faenas[unassigned_mask].index[0]
    point = df_faenas.loc[idx]
    lat0, lon0 = point['Latitud'], point['Longitud']

    # Find all mines within radius
    df_faenas['distance_to_point'] = df_faenas.apply(
        lambda x: haversine_distance(lat0, lon0, x['Latitud'], x['Longitud']), axis=1
    )
    in_radius = df_faenas['distance_to_point'] <= RADIUS_KM

    # Compute distances to stations
    estaciones_copy = estaciones.copy()
    estaciones_copy['distance_km'] = estaciones_copy.apply(
        lambda x: haversine_distance(lat0, lon0, x['latitude'], x['longitude']), axis=1
    )
    top_stations = estaciones_copy.nsmallest(TOP_N, 'distance_km')

    # Compute OSRM travel times
    travel_times = []
    for _, station in top_stations.iterrows():
        t = osrm_travel_time(lat0, lon0, station['latitude'], station['longitude'])
        travel_times.append(t)
    top_stations = top_stations.copy()
    top_stations['travel_time'] = travel_times
    top_stations = top_stations.dropna(subset=['travel_time'])
    if top_stations.empty:
        df_faenas.loc[in_radius, 'travel_time_min'] = -1
        pbar.update(in_radius.sum())
        unassigned_mask = df_faenas['travel_time_min'].isna()
        continue

    # Pick station with minimum travel time
    best = top_stations.loc[top_stations['travel_time'].idxmin()]

    # Assign station info + distance + travel time to all mines in radius
    df_faenas.loc[in_radius, 'stationName'] = best['name']
    df_faenas.loc[in_radius, 'travel_dist_km'] = best['distance_km']
    df_faenas.loc[in_radius, 'travel_time_min'] = best['travel_time']

    unassigned_mask = df_faenas['travel_time_min'].isna()
    pbar.update(in_radius.sum())

pbar.close()
df_faenas.drop(columns=['distance_to_point'], inplace=True)
print("✅ Finished assigning station info to all mines")


Assigning station info: 31506it [2:00:07,  4.37it/s]                           

✅ Finished assigning station info to all mines





In [None]:
#df_faenas.to_csv('Cluster/df_faenas_y_desaladoras_y_puertos_y_estaciones.csv', index=False)