# Limpiar Coordenadas para Earthquake

## Importar librerías

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import re
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import folium
from time import sleep

In [54]:
geolocator = Nominatim(user_agent="MozillaFirefox", timeout=10)

## Cargar base de datos

03desastreslimpio.csv

In [55]:
df = pd.read_csv('../../Data/03Limpio/03desastreslimpio.csv')
df.head(10)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
0,1900-9002-CPV,1900,9002,Climatological,Drought,Drought,Cabo Verde,CPV,Western Africa,Africa,...,,1900,0,0,1900,0,0,11000,0,0
1,1900-9001-IND,1900,9001,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,...,,1900,0,0,1900,0,0,1250000,0,0
2,1902-0012-GTM,1902,12,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,...,-91.0,1902,4,18,1902,4,18,2000,0,843726
3,1902-0003-GTM,1902,3,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,4,8,1902,4,8,1000,0,0
4,1902-0010-GTM,1902,10,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,10,24,1902,10,24,6000,0,0
5,1903-0006-CAN,1903,6,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1903,4,29,1903,4,29,76,23,0
6,1903-0012-COM,1903,12,Geophysical,Volcanic activity,Ash fall,Comoros (the),COM,Eastern Africa,Africa,...,,1903,0,0,1903,0,0,17,0,0
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,,1904,11,0,1904,11,0,0,0,0
8,1905-0005-CAN,1905,5,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1905,8,13,1905,8,13,18,18,0
9,1905-0003-IND,1905,3,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,...,76.16,1905,4,4,1905,4,4,20000,0,812477


## EDA Inicial Básico

In [56]:
df.shape

(16636, 26)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16636 entries, 0 to 16635
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Dis No             16636 non-null  object
 1   Year               16636 non-null  int64 
 2   Seq                16636 non-null  int64 
 3   Disaster Subgroup  16636 non-null  object
 4   Disaster Type      16636 non-null  object
 5   Disaster Subtype   13313 non-null  object
 6   Country            16636 non-null  object
 7   ISO                16636 non-null  object
 8   Region             16636 non-null  object
 9   Continent          16636 non-null  object
 10  Location           14825 non-null  object
 11  Origin             4085 non-null   object
 12  Associated Dis     3593 non-null   object
 13  Dis Mag Value      16636 non-null  int64 
 14  Dis Mag Scale      15416 non-null  object
 15  Latitude           2775 non-null   object
 16  Longitude          2775 non-null   objec

## Filtro Disaster Type == Earthquake

In [58]:
filter = df['Disaster Type'] == 'Earthquake'
df_earthquake = df[filter]

In [59]:
df_earthquake.head(2)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
2,1902-0012-GTM,1902,12,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,...,-91.0,1902,4,18,1902,4,18,2000,0,843726
9,1905-0003-IND,1905,3,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,...,76.16,1905,4,4,1905,4,4,20000,0,812477


In [60]:
df_earthquake.shape

(1597, 26)

In [61]:
df_earthquake['Disaster Type'].dtype

dtype('O')

# Limpieza de coordenadas

## Función para limpiar coordenadas filtrado para df_earthquake

In [62]:
# Initializar geolocator
geolocator = Nominatim(user_agent="eqlimpiarcoord")

# Convertir 'Latitude' y 'Longitude' a string
df_earthquake['Latitude'] = df_earthquake['Latitude'].astype(str)
df_earthquake['Longitude'] = df_earthquake['Longitude'].astype(str)

# Limpiar coordenadas de latitude y longitude
df_earthquake['Latitude'] = df_earthquake['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
df_earthquake['Longitude'] = df_earthquake['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x))

# Remover puntos finales
df_earthquake['Latitude'] = df_earthquake['Latitude'].str.rstrip('.')
df_earthquake['Longitude'] = df_earthquake['Longitude'].str.rstrip('.')

# Añadir valores anómalos de latitud y longitud a listas
anomalous_lat = []
anomalous_lon = []

# Función para convertir coordenadas y detectar anomalías
def convert_coordinates(x, convert_nan=True):
    if not x:
        return np.nan

    # Remover caracteres 'N' y 'E'
    x = x.replace(' N', '').replace(' E', '')

    # Remover puntos extra en decimales
    x = re.sub('^(\d+\.\d{2})\..*', r'\1', x)

    # Conversión a negativo para S y W
    try:
        value = float(x)
        if x[-1] == 'S' or x[-1] == 'W':
            return -value
        else:
            return value
    except ValueError:
        return np.nan

# Convertir a float
df_earthquake['Latitude'] = df_earthquake['Latitude'].apply(convert_coordinates)
df_earthquake['Longitude'] = df_earthquake['Longitude'].apply(convert_coordinates)

# Redondear decimales
df_earthquake['Latitude'] = df_earthquake['Latitude'].round(2)
df_earthquake['Longitude'] = df_earthquake['Longitude'].round(2)

# Identificar valores anómalos de latitude y longitude
for index, row in df_earthquake.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']

    if latitude < -90 or latitude > 90:
        anomalous_lat.append(index)

    if longitude < -180 or longitude > 180:
        anomalous_lon.append(index)

# Revisar inconsistencias de cantidad entre anómalos de latitude y longitude
if len(anomalous_lat) != len(anomalous_lon):
    print("Valores inconsistentes de latitud y longitud.")
    # Gestionar la inconsistencia, como remover los valores extras o ajustar las listas
    # Por ejemplo, remover los valores extra de latitud:
    anomalous_lat = anomalous_lat[:len(anomalous_lon)]
    print("Longitud de latitud anómala ajustada:", len(anomalous_lat))

# Crear nuevo DataFrame con coordenadas limpias y emparejadas
df_eqcleaned = df_earthquake.copy()

# Función para rellenar coordenadas anómalas usando geocoding
def fill_anomalous_coordinates(row):
    if row.name in anomalous_lat and row.name in anomalous_lon:
        try:
            location = geolocator.reverse((row['Latitude'], row['Longitude']), timeout=10)
            if location and location.latitude is not None and location.longitude is not None:
                row['Latitude'] = location.latitude
                row['Longitude'] = location.longitude
                row['Location'] = location.address
                row['Country'] = location.raw['address'].get('country')
        except:
            pass
    return row

# Rellenar coordenadas anómalas
df_eqcleaned = df_eqcleaned.apply(fill_anomalous_coordinates, axis=1)

# Borrar filas con location y country no emparejadas
df_eqcleaned = df_eqcleaned[~((df_eqcleaned['Latitude'].isin(anomalous_lat)) & (df_eqcleaned['Longitude'].isin(anomalous_lon)))]

# Borrar filas con valores null o nan en columnas de Latitud y Longitud
df_eqcleaned = df_eqcleaned.dropna(subset=['Latitude', 'Longitude'])

# Convertir Latitud y Longitud a float con 2 decimales
df_eqcleaned['Latitude'] = df_eqcleaned['Latitude'].astype(float).round(2)
df_eqcleaned['Longitude'] = df_eqcleaned['Longitude'].astype(float).round(2)

# Exportar el DataFrame limpio a archivo
df_eqcleaned.to_csv('earthquakefilteredcleanedgeoloc.csv', index=False)


Valores inconsistentes de latitud y longitud.
Longitud de latitud anómala ajustada: 31


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_earthquake['Latitude'] = df_earthquake['Latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_earthquake['Longitude'] = df_earthquake['Longitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_earthquake['Latitude'] = df_earthquake['Latitude'].apply(lambda x: r

1. Definimos la función fill_anomalous_latlon que toma una fila como entrada. Si los valores de latitud y longitud de la fila se encuentran en las listas anomalous_lat y anomalous_lon, respectivamente, la función intenta geocodificar la ubicación mediante la columna Location o la columna Country. Si se encuentra una ubicación válida, se actualizan los valores de latitud y longitud de la fila.

2. La llamada a la función sleep(1) agrega un retraso de 1 segundo entre las solicitudes de geolocalización para cumplir con la política de uso del servicio de geocodificación.

3. Luego creamos un objeto geolocalizador utilizando el geocodificador Nominatim de la biblioteca geopy.

4. Finalmente, aplicamos la función fill_anomalous_latlon a cada fila del DataFrame usando el método apply con axis=1, lo que indica que la función debe aplicarse por filas.

## Función con geopy para limpiar coordenadas

## Tests de verificación de limpieza de coordenadas

In [63]:
df_earthquake[['Latitude', 'Longitude', 'Location', 'Country']][700:800:]

Unnamed: 0,Latitude,Longitude,Location,Country
5766,23.08,80.04,Jabalpur (Madhya Pradesh),India
5779,37.66,57.29,"Bojnurd, Shirvan area (Khorasan)",Iran (Islamic Republic of)
5780,38.08,48.05,Ardabil region,Iran (Islamic Republic of)
5781,33.94,71.87,Birjand-Qayen area,Iran (Islamic Republic of)
5786,43.00,13.00,"Umbria, Marche regions",Italy
...,...,...,...,...
7209,40.71,30.04,"Kocaeli, Bursa, Istanbul, Sakarya, Yalova Prov...",Turkey
7210,,,Marmaris,Turkey
7211,40.71,29.95,Izmit,Turkey
7212,40.76,31.16,Sakarya Province,Turkey


In [64]:
# Filtrar DataFrame para detectar valores anómalos de latitud y longitud
df_eqanomalous = df_earthquake[
    df_earthquake['Latitude'].isin(anomalous_lat) | df_earthquake['Longitude'].isin(anomalous_lon)
]

# Crear pivot table para comparar las columnas de latitude, longitude, location, y country
df_eqanomalous_pivot = df_eqanomalous.pivot_table(
    index=['Location', 'Country'],
    values=['Latitude', 'Longitude'],
    aggfunc='first'
)

# Mostrar la pivot table
print(df_eqanomalous_pivot)


Empty DataFrame
Columns: []
Index: []


In [65]:
print(df_earthquake['Latitude'])
print(df_earthquake['Longitude'])

2        14.00
9        32.04
10       33.05
11        1.51
15       38.50
         ...  
16610    36.16
16611      NaN
16614    39.38
16616    36.16
16617      NaN
Name: Latitude, Length: 1597, dtype: float64
2       -91.00
9        76.16
10       71.40
11      -78.46
15       69.90
         ...  
16610    36.03
16611      NaN
16614    69.88
16616    36.03
16617      NaN
Name: Longitude, Length: 1597, dtype: float64


In [66]:
print(df_earthquake['Latitude'].dtype)
print(df_earthquake['Longitude'].dtype)

float64
float64


In [67]:
# # Examinar filas específicas para verificar si el proceso de limpieza ha manejado los valores anómalos correctamente.
# print(df_earthquake.loc[6000, 'Latitude'])
# print(df_earthquake.loc[6000, 'Longitude'])

In [68]:
# Valores nulos de Longitude y Latitude
df_earthquake[['Longitude', 'Latitude']].isnull().sum()

Longitude    65
Latitude     53
dtype: int64

## Valores Anómalos de Latitud y Longitud

In [69]:
len(anomalous_lat)

31

In [70]:
len(anomalous_lon)

31

#### Índices Anómalos de Latitud y longitud

In [71]:
anomalous_lat_idx = []
for index, lat in enumerate(anomalous_lat):
    if lat < -90 or lat > 90:
        anomalous_lat_idx.append(index)

print("Índices Anómalos de Latitud:", anomalous_lat_idx)

Índices Anómalos de Latitud: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [72]:
anomalous_lon_idx = []
for index, lon in enumerate(anomalous_lon):
    if lat < -90 or lat > 90:
        anomalous_lon_idx.append(index)

print("Índices Anómalos de Longitud:", anomalous_lon_idx)

Índices Anómalos de Longitud: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


## Outliers de Latitude y Longitude

In [73]:
# sns.boxplot(df_earthquake['Latitude'])

In [74]:
# sns.boxplot(df_earthquake['Longitude'])

## Valores faltantes

In [75]:
df_earthquake.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1597 entries, 2 to 16617
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dis No             1597 non-null   object 
 1   Year               1597 non-null   int64  
 2   Seq                1597 non-null   int64  
 3   Disaster Subgroup  1597 non-null   object 
 4   Disaster Type      1597 non-null   object 
 5   Disaster Subtype   1597 non-null   object 
 6   Country            1597 non-null   object 
 7   ISO                1597 non-null   object 
 8   Region             1597 non-null   object 
 9   Continent          1597 non-null   object 
 10  Location           1557 non-null   object 
 11  Origin             21 non-null     object 
 12  Associated Dis     342 non-null    object 
 13  Dis Mag Value      1597 non-null   int64  
 14  Dis Mag Scale      1597 non-null   object 
 15  Latitude           1544 non-null   float64
 16  Longitude          1532

In [76]:
df_earthquake.isnull().sum()

Dis No                  0
Year                    0
Seq                     0
Disaster Subgroup       0
Disaster Type           0
Disaster Subtype        0
Country                 0
ISO                     0
Region                  0
Continent               0
Location               40
Origin               1576
Associated Dis       1255
Dis Mag Value           0
Dis Mag Scale           0
Latitude               53
Longitude              65
Start Year              0
Start Month             0
Start Day               0
End Year                0
End Month               0
End Day                 0
Total Deaths            0
Total Affected          0
Total Damages Adj       0
dtype: int64

# Visualizaciones

## Heatmaps

# Pivot Tables

In [77]:
df_earthquake_origin_geolocation_pivot = df_earthquake.pivot_table(
    index='Origin',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_earthquake_origin_geolocation_pivot

  df_earthquake_origin_geolocation_pivot = df_earthquake.pivot_table(


Unnamed: 0_level_0,Latitude,Longitude
Disaster Subtype,Tsunami,Tsunami
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2
Earthquake,262.81,1729.1
Earthquake and landslide,8.8,123.5
Landslide,43.7,7.25
Sub-marine volcano,-45.24,-72.65
Volacno and landslide,8.3,121.7


In [79]:
df_earthquake_year_geolocation_pivot = df_earthquake.pivot_table(
    index='Year',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_earthquake_year_geolocation_pivot

  df_earthquake_year_geolocation_pivot = df_earthquake.pivot_table(


Unnamed: 0_level_0,Latitude,Latitude,Longitude,Longitude
Disaster Subtype,Ground movement,Tsunami,Ground movement,Tsunami
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1900,40.30,0.0,43.10,0.0
1901,0.00,40.5,0.00,142.5
1902,135.38,0.0,106.10,0.0
1903,186.49,0.0,220.90,0.0
1904,84.75,0.0,267.80,0.0
...,...,...,...,...
2019,445.31,0.0,1807.60,0.0
2020,454.61,0.0,429.87,0.0
2021,382.78,0.0,1159.24,0.0
2022,500.80,0.0,1243.64,0.0


# Guardar dataset Earthquake limpio en csv

In [None]:
df.to_csv('../../Data/03Limpio/earthquakelimpio.csv', index=False, sep=';', encoding='utf-8')