# Impacto humano de Inundaciones (Flood)

## Importar librerías

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import re
from geopy.geocoders import Nominatim

## Cargar base de datos

02desastres_paralimpiar.csv

In [51]:
df = pd.read_csv('../../Data/02ParaLimpiar/02desastres_paralimpiar.csv', delimiter=';', encoding='utf-8')
df.head(10)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
0,1900-9002-CPV,1900,9002,Climatological,Drought,Drought,Cabo Verde,CPV,Western Africa,Africa,...,,1900,,,1900,,,11000.0,,
1,1900-9001-IND,1900,9001,Climatological,Drought,Drought,India,IND,Southern Asia,Asia,...,,1900,,,1900,,,1250000.0,,
2,1902-0012-GTM,1902,12,Geophysical,Earthquake,Ground movement,Guatemala,GTM,Central America,Americas,...,-91.0,1902,4.0,18.0,1902,4.0,18.0,2000.0,,843726.0
3,1902-0003-GTM,1902,3,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,4.0,8.0,1902,4.0,8.0,1000.0,,
4,1902-0010-GTM,1902,10,Geophysical,Volcanic activity,Ash fall,Guatemala,GTM,Central America,Americas,...,,1902,10.0,24.0,1902,10.0,24.0,6000.0,,
5,1903-0006-CAN,1903,6,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1903,4.0,29.0,1903,4.0,29.0,76.0,23.0,
6,1903-0012-COM,1903,12,Geophysical,Volcanic activity,Ash fall,Comoros (the),COM,Eastern Africa,Africa,...,,1903,,,1903,,,17.0,,
7,1904-0003-BGD,1904,3,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,...,,1904,11.0,,1904,11.0,,,,
8,1905-0005-CAN,1905,5,Geophysical,Mass movement (dry),Rockfall,Canada,CAN,Northern America,Americas,...,,1905,8.0,13.0,1905,8.0,13.0,18.0,18.0,
9,1905-0003-IND,1905,3,Geophysical,Earthquake,Ground movement,India,IND,Southern Asia,Asia,...,76.16,1905,4.0,4.0,1905,4.0,4.0,20000.0,,812477.0


## EDA Inicial Básico

In [52]:
df.shape

(16636, 26)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16636 entries, 0 to 16635
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dis No             16636 non-null  object 
 1   Year               16636 non-null  int64  
 2   Seq                16636 non-null  int64  
 3   Disaster Subgroup  16636 non-null  object 
 4   Disaster Type      16636 non-null  object 
 5   Disaster Subtype   13313 non-null  object 
 6   Country            16636 non-null  object 
 7   ISO                16636 non-null  object 
 8   Region             16636 non-null  object 
 9   Continent          16636 non-null  object 
 10  Location           14825 non-null  object 
 11  Origin             4085 non-null   object 
 12  Associated Dis     3593 non-null   object 
 13  Dis Mag Value      5064 non-null   float64
 14  Dis Mag Scale      15416 non-null  object 
 15  Latitude           2775 non-null   object 
 16  Longitude          277

## Filtro Disaster Type == Flood

In [54]:
filter = df['Disaster Type'] == 'Flood'
df_flood = df[filter]

In [55]:
df_flood.head(2)

Unnamed: 0,Dis No,Year,Seq,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,...,Longitude,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,Total Affected,Total Damages Adj
12,1906-0023-BEL,1906,23,Hydrological,Flood,,Belgium,BEL,Western Europe,Europe,...,,1906,5.0,14.0,1906,5.0,14.0,6.0,,
13,1906-0024-BEL,1906,24,Hydrological,Flood,,Belgium,BEL,Western Europe,Europe,...,,1906,4.0,,1906,4.0,,,,


In [56]:
df_flood.shape

(5808, 26)

In [57]:
df_flood['Disaster Type'].dtype

dtype('O')

# Limpieza de coordenadas

## Función para limpiar coordenadas filtrado para df_flood

In [59]:
# Convertir 'Latitude' y 'Longitude' a string
df['Latitude'] = df['Latitude'].astype(str)
df['Longitude'] = df['Longitude'].astype(str)

# Limpiar las coordenadas de latitud y longitud
df['Latitude'] = df['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
df['Longitude'] = df['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x))

# Quitar puntos finales
df['Latitude'] = df['Latitude'].str.rstrip('.')
df['Longitude'] = df['Longitude'].str.rstrip('.')

# Añadir valores anómalos de latitud y longitud a listas
anomalous_lat = []
anomalous_lon = []

def convert_coordinates(x, convert_nan=True):
    if not x:
        return np.nan

    # Remover caracteres 'N' y 'E'
    x = x.replace(' N', '').replace(' E', '')
    
    # Remover puntos extras en decimales
    x = re.sub('^(\d+\.\d{2})\..*', r'\1', x)

    # Conversión a negativo para S y W
    try:
        value = float(x)
        if x[-1] == 'S' or x[-1] == 'W':
            return -value
        else:
            return value
    except ValueError:
        return np.nan

# Convertir a float
df['Latitude'] = df['Latitude'].apply(convert_coordinates)
df['Longitude'] = df['Longitude'].apply(convert_coordinates)

# Redondear decimales
df['Latitude'] = df['Latitude'].round(2)
df['Longitude'] = df['Longitude'].round(2)

# Identificación de valores anómalos fuera de rango de grados de sistema de coordenadas
for index, row in df.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']

    if latitude < -90 or latitude > 90:
        anomalous_lat.append(latitude)

    if longitude < -180 or longitude > 180:
        anomalous_lon.append(longitude)

print("Valores anómalos de Latitude:", anomalous_lat)
print("Valores anómalos de Longitude:", anomalous_lon)


Valores anómalos de Latitude: [357.0, 411.44, 564.78, -865.05, 270.63, 350.75, 388.5, 151.82, 940.04, 193.8, -295.02, -405.21, 125.95, 289.08, 213.52, 334.55, 420.68, 194.43, 301.14, 342.7, 370.53, -104.9, 465.99, -528.26, 398.68, 191.36, 236.39, -189.54, 282.42, 227.7, 369.29, -344.81, 445.99, 304.56, 359.16, 258.63, 229.04, 491.36, 532.73, 146.06, -617.95, 448.01, -133.89, 439.96, 431.6, 165.92, 822.39, -140.1, 205.95, 240.59, -183.1, 246.17, 163.19, 396.15, 166.08, 759.78, 420.41, 183.12, -248.01, -202.3, -156.18, 358.44, 150.1, 271.82, 287.4, -158.38, -203.46, 98.8, 500.04, 443.29, 136.88, 216.96, 439.96, 303.63, 289.64, 139.19, 382.14, 388.28, 375.6, -164.64, -414.04, 271.89, -383.65, 440.62, 359.05, 250.0, 233.83, 303.4, 233.58, 368.56, 382.08, 402.89, -881.66, -223.62, 270.33, 101.74, 420.68, -315.73, -255.69, 570.69, 365.24, 364.94, -121.14, -351.09, 282.3, -115.04, -697.8, -441.03, 282.3, 278.09, 374.59, 394.74, 738.0, 414.57, 666.58, 730.95, 271.5, 325.83, 366.41, 484.05, -25

In [None]:
# # Convertir 'Latitude' y 'Longitude' a string
# df['Latitude'] = df['Latitude'].astype(str)
# df['Longitude'] = df['Longitude'].astype(str)

# # Limpiar las coordinadas de latitude y longitude
# df['Latitude'] = df['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x))
# df['Longitude'] = df['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x))

# # Quitar puntos finales
# df['Latitude'] = df['Latitude'].str.rstrip('.')
# df['Longitude'] = df['Longitude'].str.rstrip('.')

# # # Reemplazar Norte y Este con valores positivos, Sur y Oeste con negativos
# # df['Latitude'] = df['Latitude'].apply(lambda x: float(x) if x and x[-1] != 'S' else -float(x[:-1]) if x else np.nan)
# # df['Longitude'] = df['Longitude'].apply(lambda x: float(x) if x and x[-1] != 'W' else -float(x[:-1]) if x else np.nan)

# # Función para convertir latitude y longitude
# def convert_coordinates(x):
#     if not x:
#         return np.nan
    
#     # Remover los caracteres 'N' y 'E'
#     x = x.replace(' N', '').replace(' E', '')
    
#     # Remover puntos extras en decimales
#     x = re.sub('(?<=\d)\.(?=.*\.)', '', x) # Este revisa antes del primer punto
#     # x = re.sub('\.(?=.*\.)', '', x)
    
#     try:
#         value = float(x)
#         if x[-1] == 'S' or x[-1] == 'W':
#             return -value
#         else:
#             return value
#     except ValueError:
#         return np.nan

# # Convertir Latitude y Longitude a coordenadas
# df['Latitude'] = df['Latitude'].apply(convert_coordinates)
# df['Longitude'] = df['Longitude'].apply(convert_coordinates)

# # Redondear decimales a 2 dígitos
# df['Latitude'] = df['Latitude'].round(2)
# df['Longitude'] = df['Longitude'].round(2)

# # Identificar valores anómalos a partir de los grados,
# # que no representen coordenadas: los que no estén dentro de 90 y -90 ni 180 y -180
# anomalous_lat = ~df['Latitude'].between(-90, 90)
# anomalous_lon = ~df['Longitude'].between(-180, 180)

# # Configurar valores anómalos a NaN
# # df.loc[anomalous_lat, 'Latitude'] = np.nan
# # df.loc[anomalous_lon, 'Longitude'] = np.nan


In [None]:
# # Crear listas vacías para almacenar valores anómalos de latitude y longitude
# anomalous_lat = []
# anomalous_lon = []

# # Iterar sobre las filas e identificar valores anómalos
# for index, row in df_flood.iterrows():
#     latitude = row['Latitude']
#     longitude = row['Longitude']
    
#     # Revisar si latitude y longitude son números válidos
#     if pd.notnull(latitude) and pd.notnull(longitude):
#         # Convertir latitude y longitude a tipo numérico si hay strings
#         if isinstance(latitude, str):
#             latitude = pd.to_numeric(latitude, errors='coerce')
#         if isinstance(longitude, str):
#             longitude = pd.to_numeric(longitude, errors='coerce')
        
#         # Revisar si latitude está fuera de rango (-90 to 90)
#         if latitude < -90 or latitude > 90:
#             anomalous_lat.append(latitude)
        
#         # Revisar si longitude está fuera de rango (-180 a 180)
#         if longitude < -180 or longitude > 180:
#             anomalous_lon.append(longitude)

# # Mostrar valores anómalos de latitude y longitude
# print("Valores Anómalos de Latitude:", anomalous_lat)
# print("Valores Anómalos de Longitude:", anomalous_lon)


In [None]:
# # Convertir 'Latitude' y 'Longitude' a string
# df['Latitude'] = df['Latitude'].astype(str)
# df['Longitude'] = df['Longitude'].astype(str)

# # Limpiar las coordenadas latitude y longitude
# df['Latitude'] = df['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)
# df['Longitude'] = df['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)

# # Eliminar puntos finales
# df['Latitude'] = df['Latitude'].str.rstrip('.')
# df['Longitude'] = df['Longitude'].str.rstrip('.')

# # Cambiar strings vacíos a NaN
# df['Latitude'] = df['Latitude'].replace('', np.nan)
# df['Longitude'] = df['Longitude'].replace('', np.nan)

# # Convertir coordenadas válidas a float
# df['Latitude'] = df['Latitude'].astype(float)
# df['Longitude'] = df['Longitude'].astype(float)

# # Identificar valores anómalos a partir de los grados
# anomalous_lat = df['Latitude'].abs() > 90
# anomalous_lon = df['Longitude'].abs() > 180

# # Configurar valores anómalos a NaN
# df.loc[anomalous_lat, 'Latitude'] = np.nan
# df.loc[anomalous_lon, 'Longitude'] = np.nan

In [None]:
# df_flood = df[(df['Disaster Subtype'] == 'Flood') & (df['Total Deaths'].notna())]

# # Limpiar coordenadas latitude y longitude para df_flood
# df_flood['Latitude'] = df_flood['Latitude'].apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)
# df_flood['Longitude'] = df_flood['Longitude'].apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)

# # Borrar puntos finales
# df_flood['Latitude'] = df_flood['Latitude'].str.rstrip('.')
# df_flood['Longitude'] = df_flood['Longitude'].str.rstrip('.')

# # Cambiar strings vacíos a NaN
# df_flood['Latitude'] = df_flood['Latitude'].replace('', np.nan)
# df_flood['Longitude'] = df_flood['Longitude'].replace('', np.nan)

# # Convertir coordenadas válidas a float
# df_flood['Latitude'] = df_flood['Latitude'].astype(float)
# df_flood['Longitude'] = df_flood['Longitude'].astype(float)

# # Remover filas con coordenadas NaN
# df_flood = df_flood.dropna(subset=['Latitude', 'Longitude'])


In [None]:
# # Filtrar el df_flood DataFrame para Disaster Subtype 'Flood' con non-null values en columnas específicas
# df_flood = df[(df['Disaster Subtype'] == 'Flood') & (df['Total Affected'].notna()) & (df['Total Deaths'].notna()) & (df['Dis Mag Scale'].notna()) & (df['Dis Mag Value'].notna())]

# # Limpiar y convertir la columna 'Latitude'
# df_flood['Latitude'] = df_flood['Latitude'].astype(str).apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)
# df_flood['Latitude'] = df_flood['Latitude'].str.rstrip('.')
# df_flood['Latitude'] = df_flood['Latitude'].replace('', np.nan)
# df_flood['Latitude'] = df_flood['Latitude'].astype(float)

# # Limpiar y convertir la columna 'Longitude'
# df_flood['Longitude'] = df_flood['Longitude'].astype(str).apply(lambda x: re.sub('[^\d.-]', '', x.split('.', 1)[0]) if isinstance(x, str) else x)
# df_flood['Longitude'] = df_flood['Longitude'].str.rstrip('.')
# df_flood['Longitude'] = df_flood['Longitude'].replace('', np.nan)
# df_flood['Longitude'] = df_flood['Longitude'].astype(float)

# # Remover filas con NaN en coordenadas, 'Total Deaths', y 'Total Affected'
# df_flood = df_flood.dropna(subset=['Latitude', 'Longitude', 'Total Deaths', 'Total Affected'])


## Tests de verificación de limpieza de coordenadas

In [60]:
print(df_flood['Latitude'])
print(df_flood['Longitude'])

12       NaN
13       NaN
34       NaN
39       NaN
43       NaN
        ... 
16620    NaN
16628    NaN
16629    NaN
16631    NaN
16634    NaN
Name: Latitude, Length: 5808, dtype: object
12       NaN
13       NaN
34       NaN
39       NaN
43       NaN
        ... 
16620    NaN
16628    NaN
16629    NaN
16631    NaN
16634    NaN
Name: Longitude, Length: 5808, dtype: object


In [61]:
print(df_flood['Latitude'].dtype)
print(df_flood['Longitude'].dtype)

object
object


In [63]:
# Examinar filas específicas para verificar si el proceso de limpieza ha manejado los valores anómalos correctamente.
print(df_flood.loc[12000, 'Latitude'])
print(df_flood.loc[12000, 'Longitude'])

-7.379
110.24


### Valores Anómalos de Latitud

In [64]:
len(anomalous_lat)

158

In [65]:
len(anomalous_lon)

31

### Índices Anómalos de Latitud

In [66]:
anomalous_lat_idx = []
for index, lat in enumerate(anomalous_lat):
    if lat < -90 or lat > 90:
        anomalous_lat_idx.append(index)

print("Índices Anómalos de Latitud:", anomalous_lat_idx)

Índices Anómalos de Latitud: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157]


In [67]:
anomalous_lon_idx = []
for index, lon in enumerate(anomalous_lon):
    if lat < -90 or lat > 90:
        anomalous_lon_idx.append(index)

print("Índices Anómalos de Longitud:", anomalous_lon_idx)

Índices Anómalos de Longitud: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


## Outliers de Latitude y Longitude

In [68]:
sns.boxplot(df_flood['Longitude'])

KeyError: 0

In [70]:
sns.boxplot(df_flood['Latitude'])

KeyError: 0

## Valores faltantes

In [None]:
df[['Longitude', 'Latitude']].isnull().sum()

# Afectados por Inundaciones

In [None]:
df_flood['Total Deaths'].sum()

In [None]:
df_flood['Total Affected'].sum()

## Nulos

In [None]:
df_flood.isnull().sum()

In [None]:
df_flood['Country'].value_counts()

In [None]:
df_flood.groupby('Country')['Total Deaths'].sum().to_frame()

In [None]:
total_deaths_by_continent = df_flood.groupby('Continent')['Total Deaths'].sum().to_frame()
total_deaths_by_continent

In [None]:
total_deaths_by_region = df_flood.groupby('Continent')['Total Deaths'].sum().to_frame()
total_deaths_by_region

In [None]:
total_deaths_by_country = df_flood.groupby('Country')['Total Deaths'].value_counts()
total_deaths_by_country

In [None]:
df_flood[['Longitude', 'Latitude', 'Country']].reset_index().head(50)

In [None]:
# plt.figure(figsize=(12,8), dpi=300)
# sns.swarmplot(data=df_flood, x='Longitude', y= 'Latitude', hue= 'Continent', palette='Paired', size='Total Deaths', sizes=(20,200))