# ETL 2: Transformación 1 - Limpieza

In [3]:
#importamos las librerías:

from IPython.core.interactiveshell import InteractiveShell # Nos permite mostar más de una salida por celda
InteractiveShell.ast_node_interactivity = "all" # Nos permite mostar más de una salida por celda

import requests
import pandas as pd
import numpy as np
import ast 

from datetime import datetime, timedelta

pd.options.display.max_columns=None

## Introducción:
Tendréis que usar el csv attacks_limpieza_completa que tenéis adjunto abajo.
En la lección de hoy aprendimos como transformar nuestros datos para que estén preparados para almacearlos en una BBDD. En este momento tenemos dos fuentes de datos:

- El csv con los ataques de tiburones que hemos estado limpiando hasta ahora, el que os hemos adjuntado (attacks_limpieza_completa). Sentiros libres de usar vuestros propios csv en caso de que queráis.
- El csv con los datos climáticos de los principales paises que tienen ataques de tiburones, el que creamos en el pair programming de ayer.

**El objetivo de la sesión de hoy será juntar en un único csv la información de ambas fuentes**. 

Para ello:
1. Cargaremos los dos ficheros de datos del dataframe de los ataques nos quedaremos solo con las filas de los países que seleccionamos en la lección de ayer:
    - USA
    - Australia
    - New Zealand
    - South Africa
    - Papua New Guinea  
  

In [4]:
#abrimos el csv creado en el pair anterior
clima = pd.read_csv('datos/ETL-1.csv', index_col = 0)
clima.head(2)

Unnamed: 0,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 230, 'speed':...",12,15,2,1029,none,0,0,270,2,USA
1,6,5,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 2}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 245, 'speed':...",12,15,1,1028,none,0,0,210,2,USA


In [5]:
#abrimos el csv generado en el pair de Limpieza 4
df = pd.read_csv('datos/Limpieza-4.csv', index_col=0)
df.head(2)

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F
1,2018,Unprovoked,brazil,Swimming,18.0,Tiger Shark,Jun,Y,M


### Nos quedamos con los países que nos piden:

In [6]:
list_country = ['usa', 'australia', 'south africa', 'new zealand', 'papua new guinea']

In [7]:
df_country = df[df['country'].isin(list_country)]
df_country.head()

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F
2,2018,Unprovoked,usa,Walking,15.0,Bull Shark,May,N,M
3,2018,Provoked,australia,Feeding sharks,32.0,Grey Shark,May,N,M
6,2018,Unprovoked,australia,Surfing,60.0,Unspecified,Apr,N,M
8,2018,Unprovoked,south africa,Paddle-skiing,33.0,White Shark,Apr,N,M


In [8]:
df_country['country'].unique()

array(['usa', 'australia', 'south africa', 'new zealand',
       'papua new guinea'], dtype=object)

### Desempaquetamos la columna 'hr_profile':

In [9]:
clima['rh_profile'] = clima['rh_profile'].apply(ast.literal_eval)

In [10]:
df_rh = clima['rh_profile'].apply(pd.Series)

In [11]:
df_rh.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'rh': 5}","{'layer': '900mb', 'rh': 7}","{'layer': '850mb', 'rh': 2}","{'layer': '800mb', 'rh': -2}","{'layer': '750mb', 'rh': 0}","{'layer': '700mb', 'rh': 2}","{'layer': '650mb', 'rh': 3}","{'layer': '600mb', 'rh': 2}","{'layer': '550mb', 'rh': -1}","{'layer': '500mb', 'rh': -1}","{'layer': '450mb', 'rh': -2}","{'layer': '400mb', 'rh': -2}","{'layer': '350mb', 'rh': 4}","{'layer': '300mb', 'rh': 5}","{'layer': '250mb', 'rh': 7}","{'layer': '200mb', 'rh': 4}"
1,"{'layer': '950mb', 'rh': 2}","{'layer': '900mb', 'rh': 0}","{'layer': '850mb', 'rh': -1}","{'layer': '800mb', 'rh': -1}","{'layer': '750mb', 'rh': 1}","{'layer': '700mb', 'rh': 2}","{'layer': '650mb', 'rh': 2}","{'layer': '600mb', 'rh': 2}","{'layer': '550mb', 'rh': -1}","{'layer': '500mb', 'rh': -1}","{'layer': '450mb', 'rh': 1}","{'layer': '400mb', 'rh': 6}","{'layer': '350mb', 'rh': 5}","{'layer': '300mb', 'rh': 5}","{'layer': '250mb', 'rh': 10}","{'layer': '200mb', 'rh': 5}"


In [12]:
#generamos un for para volcar como valores y nombres de columnas los diccionarios encerrados en el valor de las columnas
for i in range(len(df_rh.columns)):
    nombre_rh = "rh_"+ str(df_rh[i].apply(pd.Series)["layer"][0])
    valores_rh = (df_rh[i].apply(pd.Series)["rh"])

    clima.insert(i, nombre_rh, valores_rh)

In [13]:
clima.head()

Unnamed: 0,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,5,7,2,-2,0,2,3,2,-1,-1,-2,-2,4,5,7,4,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 230, 'speed':...",12,15,2,1029,none,0,0,270,2,USA
1,2,0,-1,-1,1,2,2,2,-1,-1,1,6,5,5,10,5,6,5,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 2}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 245, 'speed':...",12,15,1,1028,none,0,0,210,2,USA
2,3,0,-1,-1,1,1,2,2,1,3,6,6,4,12,7,4,9,1,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 235, 'speed':...",13,15,4,1026,none,0,0,200,3,USA
3,6,2,0,0,0,1,2,2,4,5,5,7,8,15,13,10,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",14,10,10,1026,none,0,0,215,2,USA
4,6,1,-1,-1,0,1,2,1,2,1,8,12,16,16,16,13,15,8,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",13,10,10,1026,none,0,0,245,2,USA


### Desempaquetamos la columna 'wind_profile':

In [14]:
clima['wind_profile'] = clima['wind_profile'].apply(ast.literal_eval)

In [15]:
df_wind = clima['wind_profile'].apply(pd.Series)
df_wind.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'direction': 230, 'speed': 2}","{'layer': '900mb', 'direction': 230, 'speed': 2}","{'layer': '850mb', 'direction': 275, 'speed': 2}","{'layer': '800mb', 'direction': 320, 'speed': 2}","{'layer': '750mb', 'direction': 345, 'speed': 3}","{'layer': '700mb', 'direction': 15, 'speed': 3}","{'layer': '650mb', 'direction': 20, 'speed': 3}","{'layer': '600mb', 'direction': 0, 'speed': 3}","{'layer': '550mb', 'direction': 5, 'speed': 3}","{'layer': '500mb', 'direction': 0, 'speed': 3}","{'layer': '450mb', 'direction': 345, 'speed': 4}","{'layer': '400mb', 'direction': 345, 'speed': 4}","{'layer': '350mb', 'direction': 345, 'speed': 5}","{'layer': '300mb', 'direction': 15, 'speed': 5}","{'layer': '250mb', 'direction': 5, 'speed': 6}","{'layer': '200mb', 'direction': 15, 'speed': 5}"
1,"{'layer': '950mb', 'direction': 245, 'speed': 2}","{'layer': '900mb', 'direction': 275, 'speed': 2}","{'layer': '850mb', 'direction': 310, 'speed': 2}","{'layer': '800mb', 'direction': 320, 'speed': 3}","{'layer': '750mb', 'direction': 345, 'speed': 3}","{'layer': '700mb', 'direction': 0, 'speed': 3}","{'layer': '650mb', 'direction': 345, 'speed': 3}","{'layer': '600mb', 'direction': 320, 'speed': 3}","{'layer': '550mb', 'direction': 310, 'speed': 3}","{'layer': '500mb', 'direction': 315, 'speed': 3}","{'layer': '450mb', 'direction': 325, 'speed': 3}","{'layer': '400mb', 'direction': 325, 'speed': 4}","{'layer': '350mb', 'direction': 0, 'speed': 4}","{'layer': '300mb', 'direction': 15, 'speed': 6}","{'layer': '250mb', 'direction': 10, 'speed': 7}","{'layer': '200mb', 'direction': 15, 'speed': 6}"


In [16]:
##generamos un for para volcar como valores y nombres de columnas los diccionarios encerrados en el valor de las columnas
for i in range(len(df_wind.columns)): 

    nombre = "wind_direction_" + str(df_wind[i].apply(pd.Series)["layer"][0]) 

    valores = list(df_wind[i].apply(pd.Series)["direction"])

    clima.insert(i, nombre, valores)

In [17]:
clima.head()

Unnamed: 0,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,230,230,275,320,345,15,20,0,5,0,345,345,345,15,5,15,5,7,2,-2,0,2,3,2,-1,-1,-2,-2,4,5,7,4,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 230, 'speed':...",12,15,2,1029,none,0,0,270,2,USA
1,245,275,310,320,345,0,345,320,310,315,325,325,0,15,10,15,2,0,-1,-1,1,2,2,2,-1,-1,1,6,5,5,10,5,6,5,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 2}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 245, 'speed':...",12,15,1,1028,none,0,0,210,2,USA
2,235,290,295,315,325,320,315,315,315,310,320,340,340,10,0,0,3,0,-1,-1,1,1,2,2,1,3,6,6,4,12,7,4,9,1,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 235, 'speed':...",13,15,4,1026,none,0,0,200,3,USA
3,220,255,290,305,305,305,315,325,320,325,320,315,335,335,350,345,6,2,0,0,0,1,2,2,4,5,5,7,8,15,13,10,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",14,10,10,1026,none,0,0,215,2,USA
4,220,245,300,315,305,300,305,315,325,320,310,305,320,325,330,325,6,1,-1,-1,0,1,2,1,2,1,8,12,16,16,16,13,15,8,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",13,10,10,1026,none,0,0,245,2,USA


In [18]:
#generamos un for para volcar como valores y nombres de columnas los diccionarios encerrados en el valor de las columnas
for i in range(len(df_wind.columns)): 

    nombre = "wind_speed_" + str(df_wind[i].apply(pd.Series)["layer"][0]) 

    valores = list(df_wind[i].apply(pd.Series)["speed"])

    clima.insert(i, nombre, valores)

In [19]:
clima.head()

Unnamed: 0,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,2,2,2,2,3,3,3,3,3,3,4,4,5,5,6,5,230,230,275,320,345,15,20,0,5,0,345,345,345,15,5,15,5,7,2,-2,0,2,3,2,-1,-1,-2,-2,4,5,7,4,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 230, 'speed':...",12,15,2,1029,none,0,0,270,2,USA
1,2,2,2,3,3,3,3,3,3,3,3,4,4,6,7,6,245,275,310,320,345,0,345,320,310,315,325,325,0,15,10,15,2,0,-1,-1,1,2,2,2,-1,-1,1,6,5,5,10,5,6,5,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 2}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 245, 'speed':...",12,15,1,1028,none,0,0,210,2,USA
2,2,2,2,3,3,3,3,4,4,3,3,3,4,5,6,6,235,290,295,315,325,320,315,315,315,310,320,340,340,10,0,0,3,0,-1,-1,1,1,2,2,1,3,6,6,4,12,7,4,9,1,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 235, 'speed':...",13,15,4,1026,none,0,0,200,3,USA
3,3,2,3,3,3,3,3,3,4,4,4,3,4,5,6,7,220,255,290,305,305,305,315,325,320,325,320,315,335,335,350,345,6,2,0,0,0,1,2,2,4,5,5,7,8,15,13,10,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",14,10,10,1026,none,0,0,215,2,USA
4,3,2,2,3,3,3,4,4,3,4,5,5,5,5,6,6,220,245,300,315,305,300,305,315,325,320,310,305,320,325,330,325,6,1,-1,-1,0,1,2,1,2,1,8,12,16,16,16,13,15,8,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 220, 'speed':...",13,10,10,1026,none,0,0,245,2,USA


In [20]:
#comprobamos que los nombres de las columnas se han modificado correctamente:
clima.columns

Index(['wind_speed_950mb', 'wind_speed_900mb', 'wind_speed_850mb',
       'wind_speed_800mb', 'wind_speed_750mb', 'wind_speed_700mb',
       'wind_speed_650mb', 'wind_speed_600mb', 'wind_speed_550mb',
       'wind_speed_500mb', 'wind_speed_450mb', 'wind_speed_400mb',
       'wind_speed_350mb', 'wind_speed_300mb', 'wind_speed_250mb',
       'wind_speed_200mb', 'wind_direction_950mb', 'wind_direction_900mb',
       'wind_direction_850mb', 'wind_direction_800mb', 'wind_direction_750mb',
       'wind_direction_700mb', 'wind_direction_650mb', 'wind_direction_600mb',
       'wind_direction_550mb', 'wind_direction_500mb', 'wind_direction_450mb',
       'wind_direction_400mb', 'wind_direction_350mb', 'wind_direction_300mb',
       'wind_direction_250mb', 'wind_direction_200mb', 'rh_950mb', 'rh_900mb',
       'rh_850mb', 'rh_800mb', 'rh_750mb', 'rh_700mb', 'rh_650mb', 'rh_600mb',
       'rh_550mb', 'rh_500mb', 'rh_450mb', 'rh_400mb', 'rh_350mb', 'rh_300mb',
       'rh_250mb', 'rh_200mb', 'timep

### Borramos las dos columnas que hemos desempaquetado, para no tener información repetida.

In [21]:
clima.drop(['rh_profile', 'wind_profile'], axis = 1, inplace=True)

In [22]:
clima.head()

Unnamed: 0,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,2,2,2,2,3,3,3,3,3,3,4,4,5,5,6,5,230,230,275,320,345,15,20,0,5,0,345,345,345,15,5,15,5,7,2,-2,0,2,3,2,-1,-1,-2,-2,4,5,7,4,3,9,-9999,-9999,-9999,12,15,2,1029,none,0,0,270,2,USA
1,2,2,2,3,3,3,3,3,3,3,3,4,4,6,7,6,245,275,310,320,345,0,345,320,310,315,325,325,0,15,10,15,2,0,-1,-1,1,2,2,2,-1,-1,1,6,5,5,10,5,6,5,-9999,-9999,-9999,12,15,1,1028,none,0,0,210,2,USA
2,2,2,2,3,3,3,3,4,4,3,3,3,4,5,6,6,235,290,295,315,325,320,315,315,315,310,320,340,340,10,0,0,3,0,-1,-1,1,1,2,2,1,3,6,6,4,12,7,4,9,1,-9999,-9999,-9999,13,15,4,1026,none,0,0,200,3,USA
3,3,2,3,3,3,3,3,3,4,4,4,3,4,5,6,7,220,255,290,305,305,305,315,325,320,325,320,315,335,335,350,345,6,2,0,0,0,1,2,2,4,5,5,7,8,15,13,10,12,3,-9999,-9999,-9999,14,10,10,1026,none,0,0,215,2,USA
4,3,2,2,3,3,3,4,4,3,4,5,5,5,5,6,6,220,245,300,315,305,300,305,315,325,320,310,305,320,325,330,325,6,1,-1,-1,0,1,2,1,2,1,8,12,16,16,16,13,15,8,-9999,-9999,-9999,13,10,10,1026,none,0,0,245,2,USA


In [23]:
#pasamos los valores de la columna a minúsculas para poder fusionar los df
clima['country'] = clima['country'].str.lower()

In [24]:
#comprobamos que se ha aplicado el cambio correctamente
clima['country'].unique()

array(['usa', 'australia', 'south africa', 'new zealand',
       'papua new guinea'], dtype=object)

In [25]:
#guardamos el csv ya que más adelante necesitaremos el df de clima desempaquetado antes de unirlo con el df de attacks
clima.to_csv('datos/clima-clase.csv')

In [26]:
#generamos un df para mergearlo con df_country (el df que solo tiene 5 países)
df_groupby = clima.groupby(["country"])[clima.columns].mean().reset_index()

In [27]:
df_groupby.head()

Unnamed: 0,country,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed
0,australia,3.546875,3.546875,3.375,3.3125,3.328125,3.328125,3.375,3.484375,3.5,3.65625,3.734375,3.65625,3.6875,4.046875,4.234375,4.328125,70.3125,71.328125,68.828125,70.078125,71.953125,79.53125,93.046875,101.015625,84.765625,79.84375,85.859375,96.5625,95.859375,107.109375,110.9375,123.59375,13.640625,11.546875,8.078125,5.5,4.1875,3.625,2.765625,2.15625,2.359375,2.828125,3.890625,3.90625,3.46875,3.84375,4.8125,5.125,97.5,2.890625,-9999.0,-9999.0,-9999.0,25.890625,-3.546875,10.53125,1016.453125,2.515625,0.0,72.34375,3.203125
1,new zealand,3.328125,3.171875,3.109375,3.09375,3.09375,3.09375,3.09375,3.09375,3.15625,3.203125,3.203125,3.46875,3.796875,4.234375,5.140625,6.015625,119.765625,120.46875,122.890625,126.953125,129.765625,132.34375,141.328125,144.453125,153.046875,153.828125,186.09375,187.03125,212.03125,208.828125,224.921875,216.09375,12.3125,12.484375,4.6875,1.046875,0.65625,0.953125,1.25,1.109375,0.453125,0.265625,1.140625,3.328125,7.0,6.140625,7.109375,2.671875,97.5,6.453125,-9999.0,-9999.0,-9999.0,15.046875,11.3125,9.890625,1022.828125,2.75,0.0,120.625,3.015625
2,papua new guinea,3.984375,4.515625,4.625,4.53125,3.921875,3.109375,2.234375,2.640625,3.390625,4.1875,4.671875,4.75,4.859375,4.828125,5.4375,6.109375,82.8125,81.40625,82.578125,82.96875,80.3125,77.8125,176.484375,236.5625,251.25,257.421875,257.5,252.421875,249.375,234.6875,223.203125,235.390625,13.75,11.546875,8.359375,5.546875,2.59375,0.4375,0.359375,1.40625,2.703125,2.234375,3.09375,3.59375,5.09375,4.953125,3.78125,2.375,97.5,4.09375,-9999.0,-9999.0,-9999.0,25.8125,-0.859375,11.28125,1009.796875,2.328125,0.0,82.578125,3.3125
3,south africa,2.390625,2.265625,2.109375,2.046875,2.0625,2.140625,2.265625,2.5,2.78125,3.125,3.390625,3.875,3.890625,3.859375,4.0625,4.46875,143.046875,170.859375,180.0,201.09375,210.3125,222.03125,230.0,235.15625,223.046875,219.21875,220.78125,225.859375,224.21875,225.625,208.984375,200.9375,12.96875,10.703125,9.984375,10.640625,10.703125,10.109375,9.875,8.390625,6.1875,4.359375,2.515625,1.328125,2.09375,2.703125,5.234375,8.578125,97.5,4.859375,-9999.0,-9999.0,-9999.0,24.078125,1.53125,10.59375,1019.765625,1.859375,0.0,150.625,2.34375
4,usa,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75


In [28]:
#mergeamos los df mencionados anteriormente
df_completo = df_country.merge(df_groupby, how='outer', on = 'country', right_on= None, left_on=None)

Revisamos la estructura de los DFs:

In [29]:
print(f'El DF_groupby contiene {df_groupby.shape[0]} filas y {df_groupby.shape[1]} columnas.')

El DF_groupby contiene 5 filas y 62 columnas.


In [30]:
print(f'El DF_country contiene {df_country.shape[0]} filas y {df_country.shape[1]} columnas.')

El DF_country contiene 383 filas y 9 columnas.


In [31]:
print(f'El DF_completo contiene {df_completo.shape[0]} filas y {df_completo.shape[1]} columnas.')

El DF_completo contiene 383 filas y 70 columnas.


In [32]:
df_completo.head()

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75
1,2018,Unprovoked,usa,Walking,15.0,Bull Shark,May,N,M,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75
2,2018,Unprovoked,usa,Stand-Up Paddleboarding,25.0,Tiger Shark,Mar,N,M,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75
3,2017,Unprovoked,usa,Surfing,54.0,Tiger Shark,Dec,N,F,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75
4,2017,Unprovoked,usa,Spearfishing,25.0,White Shark,Nov,N,M,3.15625,3.359375,3.640625,3.8125,4.0625,4.375,4.84375,5.25,5.65625,6.234375,6.890625,7.5625,8.109375,8.734375,9.375,9.5625,247.03125,259.296875,282.1875,289.84375,294.921875,282.421875,285.3125,282.65625,282.8125,281.796875,286.328125,286.796875,283.75,264.296875,266.171875,280.390625,4.34375,4.84375,5.234375,4.59375,3.65625,3.03125,2.078125,1.546875,2.375,3.359375,3.96875,3.90625,4.609375,6.03125,6.234375,5.484375,97.5,4.421875,-9999.0,-9999.0,-9999.0,12.859375,11.609375,4.234375,1014.84375,0.5625,0.0,236.328125,2.75



4. Guardad los resultados obtenidos en un csv que usaremos en próximos ejercicios de pair programming.

In [33]:
df_completo.to_csv('datos/ETL-2.csv', index=False)