# Limpiar Coordenadas para Drought

## Importar librerías

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.impute import SimpleImputer
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from time import sleep

In [28]:
geolocator = Nominatim(user_agent="MozillaFirefox", timeout=10)

## Cargar base de datos

03desastreslimpio.csv

In [29]:
df = pd.read_csv('../../Data/03Limpio/03desastreslimpio.csv')
df.head(10)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
0,1900-9002-CPV,1900,9002,Climatological,Drought,Drought,Cabo Verde,CPV,Western Africa,Africa,...,,1900,0,0,1900,0,0,11000,0,0
1,1900-9001-IND,1900,9001,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,...,,1900,0,0,1900,0,0,1250000,0,0
2,1902-0012-GTM,1902,12,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,...,-91.0,1902,4,18,1902,4,18,2000,0,843726
3,1902-0003-GTM,1902,3,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,4,8,1902,4,8,1000,0,0
4,1902-0010-GTM,1902,10,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,10,24,1902,10,24,6000,0,0
5,1903-0006-CAN,1903,6,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1903,4,29,1903,4,29,76,23,0
6,1903-0012-COM,1903,12,Geophysical,Volcanic activity,Ash fall,Comoros (the),COM,Eastern Africa,Africa,...,,1903,0,0,1903,0,0,17,0,0
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,,1904,11,0,1904,11,0,0,0,0
8,1905-0005-CAN,1905,5,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1905,8,13,1905,8,13,18,18,0
9,1905-0003-IND,1905,3,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,...,76.16,1905,4,4,1905,4,4,20000,0,812477


## EDA Inicial Básico

In [30]:
df.shape

(16636, 26)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16636 entries, 0 to 16635
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Dis No             16636 non-null  object
 1   Year               16636 non-null  int64 
 2   Seq                16636 non-null  int64 
 3   Disaster Subgroup  16636 non-null  object
 4   Disaster Type      16636 non-null  object
 5   Disaster Subtype   13313 non-null  object
 6   Country            16636 non-null  object
 7   ISO                16636 non-null  object
 8   Region             16636 non-null  object
 9   Continent          16636 non-null  object
 10  Location           14825 non-null  object
 11  Origin             4085 non-null   object
 12  Associated Dis     3593 non-null   object
 13  Dis Mag Value      16636 non-null  int64 
 14  Dis Mag Scale      15416 non-null  object
 15  Latitude           2775 non-null   object
 16  Longitude          2775 non-null   objec

## Filtro Disaster Type == Drought

In [32]:
filter = df['Disaster Type'] == 'Drought'
df_drought = df[filter]

In [33]:
df_drought.head(2)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
0,1900-9002-CPV,1900,9002,Climatological,Drought,Drought,Cabo Verde,CPV,Western Africa,Africa,...,,1900,0,0,1900,0,0,11000,0,0
1,1900-9001-IND,1900,9001,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,...,,1900,0,0,1900,0,0,1250000,0,0


In [34]:
df_drought.shape

(803, 26)

In [35]:
df_drought['Disaster Type'].dtype

dtype('O')

In [36]:
df_drought['Latitude'].isnull().sum()

803

In [37]:
df_drought['Longitude'].isnull().sum()

803

# Limpieza de coordenadas

## Función para limpiar coordenadas filtrado para df_drought

In [38]:
# Initializar geolocator
geolocator = Nominatim(user_agent="eqlimpiarcoord")

# Convertir 'Latitude' y 'Longitude' a string
df_drought['Latitude'] = df_drought['Latitude'].astype(str)
df_drought['Longitude'] = df_drought['Longitude'].astype(str)

# Limpiar coordenadas de latitude y longitude
df_drought['Latitude'] = df_drought['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
df_drought['Longitude'] = df_drought['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x))

# Remover puntos finales
df_drought['Latitude'] = df_drought['Latitude'].str.rstrip('.')
df_drought['Longitude'] = df_drought['Longitude'].str.rstrip('.')

# Añadir valores anómalos de latitud y longitud a listas
anomalous_lat = []
anomalous_lon = []

# Función para convertir coordenadas y detectar anomalías
def convert_coordinates(x, convert_nan=True):
    if not x:
        return np.nan

    # Remover caracteres 'N' y 'E'
    x = x.replace(' N', '').replace(' E', '')

    # Remover puntos extra en decimales
    x = re.sub('^(\d+\.\d{2})\..*', r'\1', x)

    # Conversión a negativo para S y W
    try:
        value = float(x)
        if x[-1] == 'S' or x[-1] == 'W':
            return -value
        else:
            return value
    except ValueError:
        return np.nan

# Convertir a float
df_drought['Latitude'] = df_drought['Latitude'].apply(convert_coordinates)
df_drought['Longitude'] = df_drought['Longitude'].apply(convert_coordinates)

# Redondear decimales
df_drought['Latitude'] = df_drought['Latitude'].round(2)
df_drought['Longitude'] = df_drought['Longitude'].round(2)

# Identificar valores anómalos de latitude y longitude
for index, row in df_drought.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']

    if latitude < -90 or latitude > 90:
        anomalous_lat.append(index)

    if longitude < -180 or longitude > 180:
        anomalous_lon.append(index)

# Revisar inconsistencias de cantidad entre anómalos de latitude y longitude
if len(anomalous_lat) != len(anomalous_lon):
    print("Valores inconsistentes de latitud y longitud.")
    # Gestionar la inconsistencia, como remover los valores extras o ajustar las listas
    # Por ejemplo, remover los valores extra de latitud:
    anomalous_lat = anomalous_lat[:len(anomalous_lon)]
    print("Longitud de latitud anómala ajustada:", len(anomalous_lat))

# Crear nuevo DataFrame con coordenadas limpias y emparejadas
df_drcleaned = df_drought.copy()

# Función para rellenar coordenadas anómalas usando geocoding
def fill_anomalous_coordinates(row):
    if row.name in anomalous_lat and row.name in anomalous_lon:
        try:
            location = geolocator.reverse((row['Latitude'], row['Longitude']), timeout=10)
            if location and location.latitude is not None and location.longitude is not None:
                row['Latitude'] = location.latitude
                row['Longitude'] = location.longitude
                row['Location'] = location.address
                row['Country'] = location.raw['address'].get('country')
        except:
            pass
    return row

# Rellenar coordenadas anómalas
df_drcleaned = df_drcleaned.apply(fill_anomalous_coordinates, axis=1)

# Borrar filas con location y country no emparejadas
df_drcleaned = df_drcleaned[~((df_drcleaned['Latitude'].isin(anomalous_lat)) & (df_drcleaned['Longitude'].isin(anomalous_lon)))]

# Borrar filas con valores null o nan en columnas de Latitud y Longitud
df_drcleaned = df_drcleaned.dropna(subset=['Latitude', 'Longitude'])

# Convertir Latitud y Longitud a float con 2 decimales
df_drcleaned['Latitude'] = df_drcleaned['Latitude'].astype(float).round(2)
df_drcleaned['Longitude'] = df_drcleaned['Longitude'].astype(float).round(2)

# # Exportar el DataFrame limpio a archivo
# df_drcleaned.to_csv('droughtfilteredcleanedgeoloc.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drought['Latitude'] = df_drought['Latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drought['Longitude'] = df_drought['Longitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drought['Latitude'] = df_drought['Latitude'].apply(lambda x: re.sub('[^\d.-]', '

1. Definimos la función fill_anomalous_latlon que toma una fila como entrada. Si los valores de latitud y longitud de la fila se encuentran en las listas anomalous_lat y anomalous_lon, respectivamente, la función intenta geocodificar la ubicación mediante la columna Location o la columna Country. Si se encuentra una ubicación válida, se actualizan los valores de latitud y longitud de la fila.

2. La llamada a la función sleep(1) agrega un retraso de 1 segundo entre las solicitudes de geolocalización para cumplir con la política de uso del servicio de geocodificación.

3. Luego creamos un objeto geolocalizador utilizando el geocodificador Nominatim de la biblioteca geopy.

4. Finalmente, aplicamos la función fill_anomalous_latlon a cada fila del DataFrame usando el método apply con axis=1, lo que indica que la función debe aplicarse por filas.

## Función con geopy para limpiar coordenadas

## Tests de verificación de limpieza de coordenadas

In [39]:
df_drought[['Latitude', 'Longitude', 'Location', 'Country']][700:800:]

Unnamed: 0,Latitude,Longitude,Location,Country
13770,,,"Western Bahr El Ghazal, Northern Bahr El Ghaza...",South Sudan
13795,,,"Baucau, Lautem, Viqueque provinces",Timor-Leste
13820,,,"Yap, Chuuk regions",Micronesia (Federated States of)
13864,,,"Seven provinces in the South (Cunene, Huila, N...",Angola
13912,,,"Northeast China, North China Plain, Inner Mong...",China
...,...,...,...,...
16438,,,"Napak, Kaabong, Kotido, Moroto districts (Kara...",Uganda
16449,,,"Emilie-Romagne, Frioul-Venetie Julienne, Lomba...",Italy
16472,,,,Mali
16479,,,,Malawi


In [40]:
# Filtrar DataFrame para detectar valores anómalos de latitud y longitud
df_eqanomalous = df_drought[
    df_drought['Latitude'].isin(anomalous_lat) | df_drought['Longitude'].isin(anomalous_lon)
]

# Crear pivot table para comparar las columnas de latitude, longitude, location, y country
df_eqanomalous_pivot = df_eqanomalous.pivot_table(
    index=['Location', 'Country'],
    values=['Latitude', 'Longitude'],
    aggfunc='first'
)

# Mostrar la pivot table
print(df_eqanomalous_pivot)


Empty DataFrame
Columns: []
Index: []


In [41]:
print(df_drought['Latitude'])
print(df_drought['Longitude'])

0       NaN
1       NaN
24      NaN
27      NaN
30      NaN
         ..
16479   NaN
16486   NaN
16506   NaN
16534   NaN
16557   NaN
Name: Latitude, Length: 803, dtype: float64
0       NaN
1       NaN
24      NaN
27      NaN
30      NaN
         ..
16479   NaN
16486   NaN
16506   NaN
16534   NaN
16557   NaN
Name: Longitude, Length: 803, dtype: float64


In [42]:
print(df_drought['Latitude'].dtype)
print(df_drought['Longitude'].dtype)

float64
float64


In [43]:
# # Examinar filas específicas para verificar si el proceso de limpieza ha manejado los valores anómalos correctamente.
# print(df_drought.loc[6000, 'Latitude'])
# print(df_drought.loc[6000, 'Longitude'])

In [44]:
# Valores nulos de Longitude y Latitude
df_drought[['Longitude', 'Latitude']].isnull().sum()

Longitude    803
Latitude     803
dtype: int64

## Valores Anómalos de Latitud y Longitud

In [45]:
len(anomalous_lat)

0

In [46]:
len(anomalous_lon)

0

#### Índices Anómalos de Latitud y longitud

In [47]:
anomalous_lat_idx = []
for index, lat in enumerate(anomalous_lat):
    if lat < -90 or lat > 90:
        anomalous_lat_idx.append(index)

print("Índices Anómalos de Latitud:", anomalous_lat_idx)

Índices Anómalos de Latitud: []


In [48]:
anomalous_lon_idx = []
for index, lon in enumerate(anomalous_lon):
    if lat < -90 or lat > 90:
        anomalous_lon_idx.append(index)

print("Índices Anómalos de Longitud:", anomalous_lon_idx)

Índices Anómalos de Longitud: []


## Outliers de Latitude y Longitude

In [49]:
# sns.boxplot(df_drought['Latitude'])

In [50]:
# sns.boxplot(df_drought['Longitude'])

## Valores faltantes

In [51]:
df_drought.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 803 entries, 0 to 16557
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dis No             803 non-null    object 
 1   Year               803 non-null    int64  
 2   Seq                803 non-null    int64  
 3   Disaster Subgroup  803 non-null    object 
 4   Disaster Type      803 non-null    object 
 5   Disaster Subtype   802 non-null    object 
 6   Country            803 non-null    object 
 7   ISO                803 non-null    object 
 8   Region             803 non-null    object 
 9   Continent          803 non-null    object 
 10  Location           632 non-null    object 
 11  Origin             165 non-null    object 
 12  Associated Dis     290 non-null    object 
 13  Dis Mag Value      803 non-null    int64  
 14  Dis Mag Scale      803 non-null    object 
 15  Latitude           0 non-null      float64
 16  Longitude          0 non

In [52]:
df_drought.isnull().sum()

Dis No                 0
Year                   0
Seq                    0
Disaster Subgroup      0
Disaster Type          0
Disaster Subtype       1
Country                0
ISO                    0
Region                 0
Continent              0
Location             171
Origin               638
Associated Dis       513
Dis Mag Value          0
Dis Mag Scale          0
Latitude             803
Longitude            803
Start Year             0
Start Month            0
Start Day              0
End Year               0
End Month              0
End Day                0
Total Deaths           0
Total Affected         0
Total Damages Adj      0
dtype: int64

## Imputar valores anómalos con SimpleImputer

In [53]:
# imputer = SimpleImputer(strategy='most_frequent')

# # Seleccionar filas donde 'Longitude' está (isin) la lista de anomalous_lon
# anomalous_lat_to_impute = df_drought['Latitude'].isin(anomalous_lat)

# # Imputar los valores con la media a anomalous_lon en 'Longitude'
# df_drought.loc[anomalous_lat_to_impute, 'Latitude'] = imputer.fit_transform(df_drought[['Latitude']])[anomalous_lat_to_impute]
# df_drought['Latitude'].isnull().sum()

# No permite imputar. ValueError: Must have equal len keys and value when setting with an ndarray

ValueError: Must have equal len keys and value when setting with an ndarray

In [54]:
# imputer = SimpleImputer(strategy='most_frequent')

# # Seleccionar filas donde 'Longitude' está (isin) la lista de anomalous_lon
# anomalous_lon_to_impute = df_drought['Longitude'].isin(anomalous_lon)

# # Imputar los valores anómalos con la media a anomalous_lon en 'Longitude'
# df_drought.loc[anomalous_lon_to_impute, 'Longitude'] = imputer.fit_transform(df_drought[['Longitude']])[anomalous_lon_to_impute]
# df_drought['Longitude'].isnull()

# No permite imputar. ValueError: Must have equal len keys and value when setting with an ndarray


ValueError: Must have equal len keys and value when setting with an ndarray

Imputar mediana a nulos con SimpleImputer a Longitude y Latitude

In [None]:
imputer = SimpleImputer(strategy='median')
# Seleccionar filas donde 'Longitude' sea null (NaN)
null_lon_rows = df_drought['Latitude'].isnull()
# Imputar los nulls en 'Longitude' estrategia mean
df_drought.loc[null_lon_rows, 'Latitude'] = imputer.fit_transform(df_drought[['Latitude']])[null_lon_rows]
df_drought['Latitude'].isnull().sum()

In [None]:
imputer = SimpleImputer(strategy='median')
# Seleccionar filas donde 'Longitude' sea null (NaN)
null_lon_rows = df_drought['Longitude'].isnull()
# Imputar los nulls en 'Longitude' estrategia mean
df_drought.loc[null_lon_rows, 'Longitude'] = imputer.fit_transform(df_drought[['Longitude']])[null_lon_rows]
df_drought['Longitude'].isnull().sum()

In [None]:
df_drought.head()

# Pivot Tables

In [None]:
df_drought_origin_geolocation_pivot = df_drought.pivot_table(
    index='Origin',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_drought_origin_geolocation_pivot

In [None]:
df_drought_year_geolocation_pivot = df_drought.pivot_table(
    index='Year',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_drought_year_geolocation_pivot

# Guardar dataset Drought limpio en csv

In [None]:
df.to_csv('../../Data/03Limpio/DroughtLimpioCSV/03desastres_4droughtcoordlimpias.csv')