In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from tqdm import tqdm
from pyproj import Transformer
import json

In [2]:
# Grouping threshold in km
threshold_km = 1.0

# Group the occurrence points of a single species by distance
def agrupar_por_distancia(df_especie, threshold_km):
    """Greedily merge one species' occurrence points lying within ``threshold_km``.

    Points are visited in row order. Each point that has no group yet opens a
    new group and absorbs every still-unassigned point within ``threshold_km``
    of it. Groups are therefore seeded by row order; they are not transitive
    clusters.

    Distances are great-circle distances: coordinates are mapped to unit
    vectors on the sphere and the KD-tree is queried with the chord length
    that corresponds to ``threshold_km``. Querying raw lat/lon radians with a
    Euclidean metric would overestimate east-west distances away from the
    equator, because longitude is not scaled by cos(latitude).

    Parameters
    ----------
    df_especie : pandas.DataFrame
        Records of one species. Must contain 'latitud' and 'longitud'
        (decimal degrees), 'total_individuos', 'fecha', 'anio', 'mes' and the
        taxonomy columns 'especie', 'reino', 'filo', 'clase', 'orden',
        'familia', 'genero'.
    threshold_km : float
        Grouping radius in kilometres.

    Returns
    -------
    pandas.DataFrame
        One row per group with 'grupo_geografico', mean 'latitud'/'longitud',
        summed 'total_individuos', the first non-null 'fecha'/'anio'/'mes' of
        the group, and the taxonomy columns. An empty frame with these columns
        is returned when ``df_especie`` has no rows.
    """
    earth_radius_km = 6371.0
    tax_cols = ['especie', 'reino', 'filo', 'clase', 'orden', 'familia', 'genero']
    agg_spec = {
        'latitud': 'mean',
        'longitud': 'mean',
        'total_individuos': 'sum',
        'fecha': 'first',
        'anio': 'first',
        'mes': 'first'
    }

    if df_especie.empty:
        # Nothing to group: return the expected schema instead of failing on iloc[0].
        return pd.DataFrame(columns=['grupo_geografico', *agg_spec, *tax_cols])

    df_especie = df_especie.copy()

    # Unit vectors on the sphere; Euclidean distance between them is the chord.
    lat = np.radians(df_especie['latitud'].to_numpy(dtype=float))
    lon = np.radians(df_especie['longitud'].to_numpy(dtype=float))
    xyz = np.column_stack((
        np.cos(lat) * np.cos(lon),
        np.cos(lat) * np.sin(lon),
        np.sin(lat),
    ))
    # Chord length subtending the central angle threshold_km / R (capped at pi).
    angle = min(threshold_km / earth_radius_km, np.pi)
    chord = 2.0 * np.sin(angle / 2.0)

    tree = cKDTree(xyz)
    groups = np.full(len(xyz), -1, dtype=int)
    group_id = 0

    for i in range(len(xyz)):
        if groups[i] != -1:
            continue
        groups[i] = group_id
        # Only unassigned neighbours join; earlier groups keep their members.
        for v in tree.query_ball_point(xyz[i], chord):
            if groups[v] == -1:
                groups[v] = group_id
        group_id += 1

    df_especie['grupo_geografico'] = groups

    # Group position is the arithmetic mean of member lat/lon.
    agrupado = df_especie.groupby('grupo_geografico').agg(agg_spec).reset_index()

    # Add taxonomy and species: first non-null value, so a gap in row 0 does
    # not blank the column for every group.
    for col in tax_cols:
        no_nulos = df_especie[col].dropna()
        agrupado[col] = no_nulos.iloc[0] if not no_nulos.empty else np.nan
    return agrupado


In [6]:
# Load the local CSV file
df = pd.read_csv("data/data_combinada_final.csv")
# Parse 'fecha' as datetime (yyyy-mm-dd); values that do not parse become NaT
df["fecha"] = pd.to_datetime(df["fecha"], format="%Y-%m-%d", errors="coerce")

# Species to leave out
especies_excluir = [
    "auratus",
    "bombetes",
    "fraterdanieli",
    "lehmanni",
    "nubicola",
    "pumilio",
    "subpunctatus",
    "truncatus",
    "virolinensis",
]

# Keep only the rows whose species is not in the exclusion list
es_excluida = df["especie"].isin(especies_excluir)
df = df[~es_excluida]

# Total number of individuals remaining after the filter
suma_total_individuos = df["total_individuos"].sum()
print("Suma total de individuos:", suma_total_individuos)

df


Suma total de individuos: 18705


Unnamed: 0,reino,filo,clase,orden,familia,genero,especie,latitud,longitud,fecha,mes,anio,total_individuos,cluster_dbscan,clusterkmean,clusterkmean_amb
1618,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Oophaga,histrionica,7.470000,-77.130000,1947-11-06,11,1947,1,2,1,6
1739,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Oophaga,histrionica,7.016670,-76.400000,1982-09-09,9,1982,10,3,1,6
1768,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Oophaga,histrionica,6.942000,-76.334000,1982-09-09,9,1982,4,3,1,6
1793,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Oophaga,histrionica,6.726389,-76.529167,2016-06-04,6,2016,1,3,1,6
1794,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Oophaga,histrionica,6.724068,-77.167031,1968-05-16,5,1968,417,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9045,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Ameerega,hahneli,-13.139272,-69.610490,2019-09-10,9,2019,1,6,2,5
9046,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Ameerega,picta,-13.139743,-69.613759,2008-01-07,1,2008,1,6,2,5
9047,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Ameerega,picta,-13.141700,-69.606700,1992-06-08,6,1992,1,6,2,5
9048,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Ameerega,trivittata,-13.142000,-69.607000,1992-05-01,5,1992,1,6,2,5


In [8]:
# Run the distance grouping separately for each species
agrupado_distancia = [
    agrupar_por_distancia(grupo, threshold_km)
    for _especie, grupo in tqdm(df.groupby('especie'))
]

# Stack the per-species results and discard groups that have no date
agrupado_distancia_df = (
    pd.concat(agrupado_distancia, ignore_index=True)
    .dropna(subset=["fecha"])
)

# Save the result
agrupado_distancia_df.to_csv("presencias_agrupadas_1km.csv", index=False)
print("Archivo guardado como presencias_agrupadas_1km.csv")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 39.97it/s]


Archivo guardado como presencias_agrupadas_1km.csv


In [9]:
# Sum the 'total_individuos' column of the grouped table
suma_total_individuos = agrupado_distancia_df["total_individuos"].sum()

# Print the result, then display the grouped table
print("Suma total de individuos:", suma_total_individuos)
agrupado_distancia_df

Suma total de individuos: 18411


Unnamed: 0,grupo_geografico,latitud,longitud,total_individuos,fecha,anio,mes,especie,reino,filo,clase,orden,familia,genero
0,0,-1.770031,-79.181092,1,2008-09-01,2008,9,anthonyi,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,
1,1,-2.869340,-79.114810,1,1994-03-31,1994,3,anthonyi,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,
2,2,-2.979107,-79.697367,36,2018-07-14,2018,7,anthonyi,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,
3,3,-2.983120,-79.791120,25,2018-07-16,2018,7,anthonyi,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,
4,4,-3.011700,-79.704540,20,2018-07-16,2018,7,anthonyi,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,75,-1.139969,-76.047494,8,2000-07-01,2000,7,yasuni,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Hyloxalus
2780,76,-1.167833,-75.886667,1,2000-11-25,2000,11,yasuni,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Hyloxalus
2781,77,-1.247000,-76.366310,1,2012-09-21,2012,9,yasuni,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Hyloxalus
2782,78,-1.298472,-76.135699,1,2014-01-01,2014,1,yasuni,Animalia,Chordata,Amphibia,Anura,Dendrobatidae,Hyloxalus


### Tratamiento y generación de datos para la red neuronal

Train PO

In [3]:
# Presence records with environmental variables extracted from GEE
data_presencia = pd.read_csv("data_presencias_final.csv")

In [4]:
# Display the presence table
data_presencia

Unnamed: 0.1,Unnamed: 0,bio01,bio02,bio03,bio04,bio07,bio12,bio13,bio14,bio15,...,slope,aspect,hillshade,tri,watdist,landcover,especie,.geo,fecha,total_individuos
0,0,252.0,81.0,71.0,1138.0,114.0,783.0,153.0,23.0,72.0,...,0.0,297.0,181.0,5.0,4.472136,10.0,anthonyi,"{""type"":""Point"",""coordinates"":[-79.78801504321...",2017-01-26,1
1,1,239.0,95.0,82.0,589.0,115.0,973.0,175.0,17.0,70.0,...,4.0,305.0,191.0,470.0,4.123106,10.0,anthonyi,"{""type"":""Point"",""coordinates"":[-79.71175549700...",2025-01-14,1
2,2,223.0,107.0,90.0,241.0,118.0,1073.0,171.0,13.0,60.0,...,15.0,216.0,207.0,1139.0,1.414214,60.0,anthonyi,"{""type"":""Point"",""coordinates"":[-79.41458316639...",2008-01-11,10
3,3,228.0,105.0,92.0,161.0,114.0,1103.0,188.0,10.0,67.0,...,7.0,325.0,193.0,-1921.0,5.656854,10.0,anthonyi,"{""type"":""Point"",""coordinates"":[-79.58133570939...",2008-01-11,16
4,4,225.0,109.0,90.0,214.0,121.0,1262.0,241.0,7.0,84.0,...,11.0,341.0,192.0,-252.0,20.099751,10.0,anthonyi,"{""type"":""Point"",""coordinates"":[-79.72530224600...",2008-01-11,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2720,2720,250.0,85.0,79.0,673.0,107.0,2769.0,543.0,53.0,75.0,...,2.0,4.0,180.0,283.0,8.944272,10.0,sylvatica,"{""type"":""Point"",""coordinates"":[-79.24278292100...",2023-12-13,1
2721,2721,229.0,85.0,81.0,589.0,104.0,3932.0,674.0,113.0,63.0,...,1.0,317.0,182.0,114.0,13.601471,10.0,sylvatica,"{""type"":""Point"",""coordinates"":[-79.06005787804...",2020-12-10,1
2722,2722,232.0,85.0,80.0,639.0,105.0,4003.0,694.0,112.0,66.0,...,1.0,265.0,185.0,15.0,11.045361,10.0,sylvatica,"{""type"":""Point"",""coordinates"":[-79.11761595443...",2018-12-26,1
2723,2723,236.0,87.0,78.0,722.0,111.0,3262.0,574.0,70.0,74.0,...,1.0,162.0,179.0,-65.0,1.000000,10.0,sylvatica,"{""type"":""Point"",""coordinates"":[-79.17107611696...",2024-12-18,1


In [5]:
df = data_presencia.copy()


def _geo_a_lon_lat(geo):
    """Return (lon, lat) from a GeoJSON geometry string; (NaN, NaN) if the value is missing."""
    if not isinstance(geo, str):
        return (np.nan, np.nan)
    lon, lat = json.loads(geo)["coordinates"][:2]
    return (float(lon), float(lat))


# Extract lon and lat by parsing the GeoJSON in '.geo'. A digits-dot-digits
# regex misses integer-valued or exponent-notation coordinates (e.g. 8.87e-05
# close to the equator) and would silently turn those rows into NaN.
lon_lat = pd.DataFrame(
    df['.geo'].map(_geo_a_lon_lat).tolist(),
    index=df.index,
    columns=['lon', 'lat'],
)
df['lon'] = lon_lat['lon'].astype(float)
df['lat'] = lon_lat['lat'].astype(float)

# Alias columns to the names used by the rest of the code (x, y, spid)
df['x'] = df['lon']
df['y'] = df['lat']
df['spid'] = df['especie']

# Keep the relevant columns: species id, coordinates and environmental variables
cols_env = [c for c in df.columns if c.startswith('bio') or c in ['slope', 'elev', 'aspect', 'hillshade', 'tri', 'cti', 'watdist', 'landcover']]
train_po = df[['spid', 'x', 'y'] + cols_env]

In [6]:
# Save the train_po table to CSV (without the index)
train_po.to_csv("ECOPALtrain_po.csv", index=False)

In [7]:
# Display the train_po table
train_po

Unnamed: 0,spid,x,y,bio01,bio02,bio03,bio04,bio07,bio12,bio13,bio14,bio15,bio18,bio19,slope,aspect,hillshade,tri,watdist,landcover
0,anthonyi,-79.788015,-3.112903,252.0,81.0,71.0,1138.0,114.0,783.0,153.0,23.0,72.0,397.0,90.0,0.0,297.0,181.0,5.0,4.472136,10.0
1,anthonyi,-79.711755,-3.279959,239.0,95.0,82.0,589.0,115.0,973.0,175.0,17.0,70.0,377.0,76.0,4.0,305.0,191.0,470.0,4.123106,10.0
2,anthonyi,-79.414583,-3.342970,223.0,107.0,90.0,241.0,118.0,1073.0,171.0,13.0,60.0,172.0,410.0,15.0,216.0,207.0,1139.0,1.414214,60.0
3,anthonyi,-79.581336,-3.353208,228.0,105.0,92.0,161.0,114.0,1103.0,188.0,10.0,67.0,414.0,73.0,7.0,325.0,193.0,-1921.0,5.656854,10.0
4,anthonyi,-79.725302,-3.503221,225.0,109.0,90.0,214.0,121.0,1262.0,241.0,7.0,84.0,104.0,670.0,11.0,341.0,192.0,-252.0,20.099751,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2720,sylvatica,-79.242783,0.116111,250.0,85.0,79.0,673.0,107.0,2769.0,543.0,53.0,75.0,1420.0,203.0,2.0,4.0,180.0,283.0,8.944272,10.0
2721,sylvatica,-79.060058,0.113257,229.0,85.0,81.0,589.0,104.0,3932.0,674.0,113.0,63.0,1657.0,437.0,1.0,317.0,182.0,114.0,13.601471,10.0
2722,sylvatica,-79.117616,0.058740,232.0,85.0,80.0,639.0,105.0,4003.0,694.0,112.0,66.0,1733.0,401.0,1.0,265.0,185.0,15.0,11.045361,10.0
2723,sylvatica,-79.171076,-0.088749,236.0,87.0,78.0,722.0,111.0,3262.0,574.0,70.0,74.0,1682.0,247.0,1.0,162.0,179.0,-65.0,1.000000,10.0


RB

In [12]:
# Load the RB points
rb_df = pd.read_csv("data_pseudoausencias_final.csv")
pres_df = train_po.copy()

# Parse each '.geo' GeoJSON string once, then take x (longitude) and y (latitude)
coordenadas = rb_df[".geo"].apply(lambda g: json.loads(g)["coordinates"])
rb_df["x"] = coordenadas.apply(lambda c: c[0])
rb_df["y"] = coordenadas.apply(lambda c: c[1])

# Drop rows without coordinates from both tables
pres_df = pres_df.dropna(subset=['x', 'y'])
rb_df = rb_df.dropna(subset=['x', 'y'])

In [24]:
# Reproject geographic coordinates (WGS84) to UTM so distances are measured in metres
transformer = Transformer.from_crs("epsg:4326", "epsg:32717", always_xy=True)  # EPSG:32717 for Ecuador

# Projected coordinate pairs, one row per point
rb_este, rb_norte = transformer.transform(rb_df['x'].values, rb_df['y'].values)
rb_coords = np.column_stack((rb_este, rb_norte))
pres_este, pres_norte = transformer.transform(pres_df['x'].values, pres_df['y'].values)
pres_coords = np.column_stack((pres_este, pres_norte))

# Spatial index over the presence points
tree = cKDTree(pres_coords)

# Nearest-presence distance for every RB point, searching up to 1000 m;
# RB points with no presence inside that bound get an infinite distance
distances, _ = tree.query(rb_coords, distance_upper_bound=1000)

# Keep the RB points that have no presence within 1 km
sin_presencia_cercana = np.isinf(distances)
rb_df_filtrado = rb_df[sin_presencia_cercana]

# Keep only points inside the continental Ecuador bounding box
dentro_de_ecuador = (
    rb_df_filtrado['y'].between(-5.0, 1.5)
    & rb_df_filtrado['x'].between(-81.0, -75.0)
)
rb_df_filtrado = rb_df_filtrado[dentro_de_ecuador]

print(f"Puntos filtrados (RB no cercanos a presencias): {len(rb_df_filtrado)}")

Puntos filtrados (RB no cercanos a presencias): 5689


In [25]:
# Drop columns that are not needed in the output. errors='ignore' keeps this
# cell re-runnable: it reassigns rb_df_filtrado, so on a second run the columns
# are already gone and a plain drop would raise KeyError.
rb_df_filtrado = rb_df_filtrado.drop(columns=['Unnamed: 0', '.geo'], errors='ignore')
output_path = "ECOPAL_RB_pseudoausencias_final.csv"
rb_df_filtrado.to_csv(output_path, index=False)
rb_df_filtrado

Unnamed: 0,aspect,bio01,bio02,bio03,bio04,bio07,bio12,bio13,bio14,bio15,bio18,bio19,hillshade,landcover,slope,tri,watdist,x,y
0,39.0,248.0,99.0,87.0,487.0,113.0,3200.0,349.0,191.0,19.0,827.0,822.0,178.0,10.0,1.0,-10.0,23.259407,-76.906233,-1.129835
1,101.0,165.0,97.0,84.0,214.0,115.0,2369.0,381.0,48.0,64.0,966.0,216.0,146.0,10.0,11.0,273.0,14.142136,-78.977868,-0.450778
2,231.0,227.0,98.0,83.0,585.0,118.0,2290.0,244.0,125.0,20.0,471.0,627.0,183.0,10.0,1.0,17.0,3.000000,-78.159434,-2.477687
3,224.0,225.0,99.0,87.0,420.0,113.0,4179.0,466.0,221.0,19.0,1047.0,1136.0,201.0,10.0,10.0,-261.0,32.449961,-77.641297,-1.533992
4,196.0,248.0,92.0,70.0,1298.0,131.0,498.0,141.0,4.0,114.0,319.0,21.0,181.0,10.0,1.0,-23.0,2.000000,-80.258042,-2.485497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5994,135.0,254.0,92.0,76.0,971.0,120.0,1305.0,330.0,1.0,119.0,873.0,9.0,180.0,40.0,0.0,-4.0,1.000000,-79.624440,-1.928143
5995,161.0,89.0,105.0,89.0,219.0,117.0,792.0,105.0,43.0,28.0,147.0,164.0,167.0,30.0,13.0,1047.0,8.944272,-78.572685,-2.099116
5996,162.0,219.0,128.0,90.0,269.0,142.0,1053.0,268.0,4.0,107.0,653.0,27.0,167.0,10.0,14.0,-1538.0,15.000000,-79.901912,-4.349769
5997,120.0,120.0,105.0,86.0,377.0,121.0,1099.0,141.0,58.0,28.0,256.0,208.0,150.0,10.0,11.0,-1111.0,27.166155,-78.998397,-4.300856


TGB

In [26]:
# Records without coordinates cannot be assigned to a site; dropping them
# avoids creating a bogus "nan_nan" site.
pres_df = train_po.dropna(subset=['x', 'y']).copy()

# 1. Build a site id per coordinate pair (x and y rounded to 5 decimals)
pres_df['siteid'] = [f"{x:.5f}_{y:.5f}" for x, y in zip(pres_df['x'], pres_df['y'])]

# 2. Build the binary presence/absence matrix (test_pa): one row per site and
#    one 0/1 column per species. crosstab counts records per (site, species);
#    clip turns counts into presence flags. A pivot_table call without
#    `values` would instead aggregate every remaining column and yield
#    (variable, species) MultiIndex columns.
test_pa_df = (
    pd.crosstab(pres_df['siteid'], pres_df['spid'])
    .clip(upper=1)
    .reset_index()
)
test_pa_df.columns.name = None  # clear the columns-index name

# 3. Build test_env directly from pres_df: one row per siteid (first record
#    kept), de-duplicated on the same rounded key used for test_pa and put in
#    the same row order, so both files describe the same sites line by line.
env_vars = [col for col in pres_df.columns if col not in ['x', 'y', 'spid', 'siteid']]
test_env_df = (
    pres_df.drop_duplicates(subset='siteid')
    .set_index('siteid')
    .loc[test_pa_df['siteid'], ['x', 'y'] + env_vars]
    .reset_index()
)

# 4. Save the files
test_pa_df.to_csv("ECOPALtest_pa.csv", index=False)
test_env_df.to_csv("ECOPALtest_env.csv", index=False)