# Limpiar Coordenadas para Storm

## Importar librerías

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.impute import SimpleImputer
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from time import sleep

In [2]:
geolocator = Nominatim(user_agent="MozillaFirefox", timeout=10)

## Cargar base de datos

03desastreslimpio.csv

In [3]:
df = pd.read_csv('../../Data/03Limpio/03desastreslimpio.csv')
df.head(10)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
0,1900-9002-CPV,1900,9002,Climatological,Drought,Drought,Cabo Verde,CPV,Western Africa,Africa,...,,1900,0,0,1900,0,0,11000,0,0
1,1900-9001-IND,1900,9001,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,...,,1900,0,0,1900,0,0,1250000,0,0
2,1902-0012-GTM,1902,12,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,...,-91.0,1902,4,18,1902,4,18,2000,0,843726
3,1902-0003-GTM,1902,3,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,4,8,1902,4,8,1000,0,0
4,1902-0010-GTM,1902,10,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,10,24,1902,10,24,6000,0,0
5,1903-0006-CAN,1903,6,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1903,4,29,1903,4,29,76,23,0
6,1903-0012-COM,1903,12,Geophysical,Volcanic activity,Ash fall,Comoros (the),COM,Eastern Africa,Africa,...,,1903,0,0,1903,0,0,17,0,0
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,,1904,11,0,1904,11,0,0,0,0
8,1905-0005-CAN,1905,5,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1905,8,13,1905,8,13,18,18,0
9,1905-0003-IND,1905,3,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,...,76.16,1905,4,4,1905,4,4,20000,0,812477


## EDA Inicial Básico

In [4]:
df.shape

(16636, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16636 entries, 0 to 16635
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Dis No             16636 non-null  object
 1   Year               16636 non-null  int64 
 2   Seq                16636 non-null  int64 
 3   Disaster Subgroup  16636 non-null  object
 4   Disaster Type      16636 non-null  object
 5   Disaster Subtype   13313 non-null  object
 6   Country            16636 non-null  object
 7   ISO                16636 non-null  object
 8   Region             16636 non-null  object
 9   Continent          16636 non-null  object
 10  Location           14825 non-null  object
 11  Origin             4085 non-null   object
 12  Associated Dis     3593 non-null   object
 13  Dis Mag Value      16636 non-null  int64 
 14  Dis Mag Scale      15416 non-null  object
 15  Latitude           2775 non-null   object
 16  Longitude          2775 non-null   objec

## Filtro Disaster Type == Storm

In [6]:
filter = df['Disaster Type'] == 'Storm'
df_storm = df[filter]

In [7]:
df_storm.head(2)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,,1904,11,0,1904,11,0,0,0,0
14,1906-0015-HKG,1906,15,Meteorological,Storm,Tropical cyclone,Hong Kong,HKG,Eastern Asia,Asia,...,,1906,9,8,1906,9,8,10000,0,649981


In [8]:
df_storm.shape

(4618, 26)

In [9]:
df_storm['Disaster Type'].dtype

dtype('O')

In [34]:
df_storm['Latitude'].isnull().sum()

0

In [35]:
df_storm['Longitude'].isnull().sum()

0

# Limpieza de coordenadas

## Función para limpiar coordenadas filtrado para df_storm

In [10]:
# Initializar geolocator
geolocator = Nominatim(user_agent="eqlimpiarcoord")

# Convertir 'Latitude' y 'Longitude' a string
df_storm['Latitude'] = df_storm['Latitude'].astype(str)
df_storm['Longitude'] = df_storm['Longitude'].astype(str)

# Limpiar coordenadas de latitude y longitude
df_storm['Latitude'] = df_storm['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
df_storm['Longitude'] = df_storm['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x))

# Remover puntos finales
df_storm['Latitude'] = df_storm['Latitude'].str.rstrip('.')
df_storm['Longitude'] = df_storm['Longitude'].str.rstrip('.')

# Añadir valores anómalos de latitud y longitud a listas
anomalous_lat = []
anomalous_lon = []

# Función para convertir coordenadas y detectar anomalías
def convert_coordinates(x, convert_nan=True):
    if not x:
        return np.nan

    # Remover caracteres 'N' y 'E'
    x = x.replace(' N', '').replace(' E', '')

    # Remover puntos extra en decimales
    x = re.sub('^(\d+\.\d{2})\..*', r'\1', x)

    # Conversión a negativo para S y W
    try:
        value = float(x)
        if x[-1] == 'S' or x[-1] == 'W':
            return -value
        else:
            return value
    except ValueError:
        return np.nan

# Convertir a float
df_storm['Latitude'] = df_storm['Latitude'].apply(convert_coordinates)
df_storm['Longitude'] = df_storm['Longitude'].apply(convert_coordinates)

# Redondear decimales
df_storm['Latitude'] = df_storm['Latitude'].round(2)
df_storm['Longitude'] = df_storm['Longitude'].round(2)

# Identificar valores anómalos de latitude y longitude
for index, row in df_storm.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']

    if latitude < -90 or latitude > 90:
        anomalous_lat.append(index)

    if longitude < -180 or longitude > 180:
        anomalous_lon.append(index)

# Revisar inconsistencias de cantidad entre anómalos de latitude y longitude
if len(anomalous_lat) != len(anomalous_lon):
    print("Valores inconsistentes de latitud y longitud.")
    # Gestionar la inconsistencia, como remover los valores extras o ajustar las listas
    # Por ejemplo, remover los valores extra de latitud:
    anomalous_lat = anomalous_lat[:len(anomalous_lon)]
    print("Longitud de latitud anómala ajustada:", len(anomalous_lat))

# Crear nuevo DataFrame con coordenadas limpias y emparejadas
df_eqcleaned = df_storm.copy()

# Función para rellenar coordenadas anómalas usando geocoding
def fill_anomalous_coordinates(row):
    if row.name in anomalous_lat and row.name in anomalous_lon:
        try:
            location = geolocator.reverse((row['Latitude'], row['Longitude']), timeout=10)
            if location and location.latitude is not None and location.longitude is not None:
                row['Latitude'] = location.latitude
                row['Longitude'] = location.longitude
                row['Location'] = location.address
                row['Country'] = location.raw['address'].get('country')
        except:
            pass
    return row

# Rellenar coordenadas anómalas
df_eqcleaned = df_eqcleaned.apply(fill_anomalous_coordinates, axis=1)

# Borrar filas con location y country no emparejadas
df_eqcleaned = df_eqcleaned[~((df_eqcleaned['Latitude'].isin(anomalous_lat)) & (df_eqcleaned['Longitude'].isin(anomalous_lon)))]

# Borrar filas con valores null o nan en columnas de Latitud y Longitud
df_eqcleaned = df_eqcleaned.dropna(subset=['Latitude', 'Longitude'])

# Convertir Latitud y Longitud a float con 2 decimales
df_eqcleaned['Latitude'] = df_eqcleaned['Latitude'].astype(float).round(2)
df_eqcleaned['Longitude'] = df_eqcleaned['Longitude'].astype(float).round(2)

# # Exportar el DataFrame limpio a archivo
# df_eqcleaned.to_csv('stormfilteredcleanedgeoloc.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_storm['Latitude'] = df_storm['Latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_storm['Longitude'] = df_storm['Longitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_storm['Latitude'] = df_storm['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
A val

Valores inconsistentes de latitud y longitud.
Longitud de latitud anómala ajustada: 0


1. Definimos la función fill_anomalous_latlon que toma una fila como entrada. Si los valores de latitud y longitud de la fila se encuentran en las listas anomalous_lat y anomalous_lon, respectivamente, la función intenta geocodificar la ubicación mediante la columna Location o la columna Country. Si se encuentra una ubicación válida, se actualizan los valores de latitud y longitud de la fila.

2. La llamada a la función sleep(1) agrega un retraso de 1 segundo entre las solicitudes de geolocalización para cumplir con la política de uso del servicio de geocodificación.

3. Luego creamos un objeto geolocalizador utilizando el geocodificador Nominatim de la biblioteca geopy.

4. Finalmente, aplicamos la función fill_anomalous_latlon a cada fila del DataFrame usando el método apply con axis=1, lo que indica que la función debe aplicarse por filas.

## Función con geopy para limpiar coordenadas

## Tests de verificación de limpieza de coordenadas

In [11]:
df_storm[['Latitude', 'Longitude', 'Location', 'Country']][700:800:]

Unnamed: 0,Latitude,Longitude,Location,Country
1959,,,"Efate, Shepherds",Vanuatu
1962,,,Jonesboro (Arkansas),United States of America (the)
1965,,,,United States of America (the)
1966,,,Binh Dinh,Viet Nam
1968,,,,United States of America (the)
...,...,...,...,...
2288,,,,Czechoslovakia
2289,,,East Berlin,Germany Dem Rep
2290,,,,Germany Dem Rep
2291,,,,Germany Fed Rep


In [12]:
# Filtrar DataFrame para detectar valores anómalos de latitud y longitud
df_eqanomalous = df_storm[
    df_storm['Latitude'].isin(anomalous_lat) | df_storm['Longitude'].isin(anomalous_lon)
]

# Crear pivot table para comparar las columnas de latitude, longitude, location, y country
df_eqanomalous_pivot = df_eqanomalous.pivot_table(
    index=['Location', 'Country'],
    values=['Latitude', 'Longitude'],
    aggfunc='first'
)

# Mostrar la pivot table
print(df_eqanomalous_pivot)


Empty DataFrame
Columns: []
Index: []


In [13]:
print(df_storm['Latitude'])
print(df_storm['Longitude'])

7       NaN
14      NaN
18      NaN
19      NaN
22      NaN
         ..
16624   NaN
16625   NaN
16626   NaN
16627   NaN
16633   NaN
Name: Latitude, Length: 4618, dtype: float64
7       NaN
14      NaN
18      NaN
19      NaN
22      NaN
         ..
16624   NaN
16625   NaN
16626   NaN
16627   NaN
16633   NaN
Name: Longitude, Length: 4618, dtype: float64


In [14]:
print(df_storm['Latitude'].dtype)
print(df_storm['Longitude'].dtype)

float64
float64


In [15]:
# # Examinar filas específicas para verificar si el proceso de limpieza ha manejado los valores anómalos correctamente.
# print(df_storm.loc[6000, 'Latitude'])
# print(df_storm.loc[6000, 'Longitude'])

In [16]:
# Valores nulos de Longitude y Latitude
df_storm[['Longitude', 'Latitude']].isnull().sum()

Longitude    4504
Latitude     4504
dtype: int64

## Valores Anómalos de Latitud y Longitud

In [17]:
len(anomalous_lat)

0

In [18]:
len(anomalous_lon)

0

#### Índices Anómalos de Latitud y longitud

In [19]:
anomalous_lat_idx = []
for index, lat in enumerate(anomalous_lat):
    if lat < -90 or lat > 90:
        anomalous_lat_idx.append(index)

print("Índices Anómalos de Latitud:", anomalous_lat_idx)

Índices Anómalos de Latitud: []


In [20]:
anomalous_lon_idx = []
for index, lon in enumerate(anomalous_lon):
    if lat < -90 or lat > 90:
        anomalous_lon_idx.append(index)

print("Índices Anómalos de Longitud:", anomalous_lon_idx)

Índices Anómalos de Longitud: []


## Outliers de Latitude y Longitude

In [21]:
# sns.boxplot(df_storm['Latitude'])

In [22]:
# sns.boxplot(df_storm['Longitude'])

## Valores faltantes

In [23]:
df_storm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4618 entries, 7 to 16633
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dis No             4618 non-null   object 
 1   Year               4618 non-null   int64  
 2   Seq                4618 non-null   int64  
 3   Disaster Subgroup  4618 non-null   object 
 4   Disaster Type      4618 non-null   object 
 5   Disaster Subtype   3803 non-null   object 
 6   Country            4618 non-null   object 
 7   ISO                4618 non-null   object 
 8   Region             4618 non-null   object 
 9   Continent          4618 non-null   object 
 10  Location           3882 non-null   object 
 11  Origin             48 non-null     object 
 12  Associated Dis     1345 non-null   object 
 13  Dis Mag Value      4618 non-null   int64  
 14  Dis Mag Scale      4618 non-null   object 
 15  Latitude           114 non-null    float64
 16  Longitude          114 

In [24]:
df_storm.isnull().sum()

Dis No                  0
Year                    0
Seq                     0
Disaster Subgroup       0
Disaster Type           0
Disaster Subtype      815
Country                 0
ISO                     0
Region                  0
Continent               0
Location              736
Origin               4570
Associated Dis       3273
Dis Mag Value           0
Dis Mag Scale           0
Latitude             4504
Longitude            4504
Start Year              0
Start Month             0
Start Day               0
End Year                0
End Month               0
End Day                 0
Total Deaths            0
Total Affected          0
Total Damages Adj       0
dtype: int64

## Imputar valores anómalos con SimpleImputer

In [25]:
imputer = SimpleImputer(strategy='most_frequent')

# Seleccionar filas donde 'Longitude' está (isin) la lista de anomalous_lon
anomalous_lat_to_impute = df_storm['Latitude'].isin(anomalous_lat)

# Imputar los valores con la media a anomalous_lon en 'Longitude'
df_storm.loc[anomalous_lat_to_impute, 'Latitude'] = imputer.fit_transform(df_storm[['Latitude']])[anomalous_lat_to_impute]
df_storm['Latitude'].isnull().sum()

4504

In [26]:
imputer = SimpleImputer(strategy='most_frequent')

# Seleccionar filas donde 'Longitude' está (isin) la lista de anomalous_lon
anomalous_lon_to_impute = df_storm['Longitude'].isin(anomalous_lon)

# Imputar los valores anómalos con la media a anomalous_lon en 'Longitude'
df_storm.loc[anomalous_lon_to_impute, 'Longitude'] = imputer.fit_transform(df_storm[['Longitude']])[anomalous_lon_to_impute]
df_storm['Longitude'].isnull()

7        True
14       True
18       True
19       True
22       True
         ... 
16624    True
16625    True
16626    True
16627    True
16633    True
Name: Longitude, Length: 4618, dtype: bool

Imputar mediana a nulos con SimpleImputer a Longitude y Latitude

In [27]:
imputer = SimpleImputer(strategy='median')
# Seleccionar filas donde 'Longitude' sea null (NaN)
null_lon_rows = df_storm['Latitude'].isnull()
# Imputar los nulls en 'Longitude' estrategia mean
df_storm.loc[null_lon_rows, 'Latitude'] = imputer.fit_transform(df_storm[['Latitude']])[null_lon_rows]
df_storm['Latitude'].isnull().sum()

0

In [28]:
imputer = SimpleImputer(strategy='median')
# Seleccionar filas donde 'Longitude' sea null (NaN)
null_lon_rows = df_storm['Longitude'].isnull()
# Imputar los nulls en 'Longitude' estrategia mean
df_storm.loc[null_lon_rows, 'Longitude'] = imputer.fit_transform(df_storm[['Longitude']])[null_lon_rows]
df_storm['Longitude'].isnull().sum()

0

In [29]:
df_storm.head()

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,104.125,1904,11,0,1904,11,0,0,0,0
14,1906-0015-HKG,1906,15,Meteorological,Storm,Tropical cyclone,Hong Kong,HKG,Eastern Asia,Asia,...,104.125,1906,9,8,1906,9,8,10000,0,649981
18,1909-0010-BGD,1909,10,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,104.125,1909,10,15,1909,10,15,172,0,0
19,1909-0013-BGD,1909,13,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,104.125,1909,12,0,1909,12,0,0,0,0
22,1909-0012-HTI,1909,12,Meteorological,Storm,Tropical cyclone,Haiti,HTI,Caribbean,Americas,...,104.125,1909,11,12,1909,11,12,150,0,0


# Pivot Tables

In [30]:
df_storm_origin_geolocation_pivot = df_storm.pivot_table(
    index='Origin',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_storm_origin_geolocation_pivot

  df_storm_origin_geolocation_pivot = df_storm.pivot_table(


Unnamed: 0_level_0,Latitude,Latitude,Longitude,Longitude
Disaster Subtype,Convective storm,Tropical cyclone,Convective storm,Tropical cyclone
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Arctic blast associated with the Polar Vortex,19.73,0.0,104.125,0.0
Brief torrential rains,19.73,0.0,104.125,0.0
"Cold front meeting warm, moist air",19.73,0.0,104.125,0.0
Days of rains and hails,19.73,0.0,104.125,0.0
El Nino phenomenom,0.0,19.73,0.0,104.125
Formed by the remnants of tropical cyclone GULAB,0.0,19.73,0.0,104.125
Heavy monsoonal rain,19.73,0.0,104.125,0.0
Heavy rain,19.73,54.0,104.125,239.82
Heavy rains,114.99,19.73,329.22,104.125
Heavy rains and strong winds,19.73,0.0,104.125,0.0


In [31]:
df_storm_year_geolocation_pivot = df_storm.pivot_table(
    index='Year',
    columns='Disaster Subtype',
    values=['Latitude', 'Longitude', 'Location'],
    aggfunc='sum',
    fill_value=0
)
df_storm_year_geolocation_pivot

  df_storm_year_geolocation_pivot = df_storm.pivot_table(


Unnamed: 0_level_0,Latitude,Latitude,Latitude,Longitude,Longitude,Longitude
Disaster Subtype,Convective storm,Extra-tropical storm,Tropical cyclone,Convective storm,Extra-tropical storm,Tropical cyclone
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1900,0.00,0.00,19.73,0.000,0.000,104.125
1902,0.00,0.00,19.73,0.000,0.000,104.125
1903,19.73,0.00,19.73,104.125,0.000,104.125
1904,0.00,0.00,19.73,0.000,0.000,104.125
1905,0.00,0.00,19.73,0.000,0.000,104.125
...,...,...,...,...,...,...
2019,789.20,118.38,868.12,4165.000,624.750,4581.500
2020,690.55,277.18,1524.45,3644.375,1458.805,8170.520
2021,1144.34,19.73,1144.34,6039.250,104.125,6039.250
2022,808.93,335.41,986.50,4269.125,1770.125,5206.250


# Guardar dataset Storm limpio en csv

In [32]:
df.to_csv('../../Data/03Limpio/StormLimpioCSV/03desastres_3stormcoordlimpias.csv')

OSError: Cannot save file into a non-existent directory: '..\..\Data\03Limpio\StormLimpioCSV'