# ETL 2: Transformación 1 - Limpieza

In [62]:
from IPython.core.interactiveshell import InteractiveShell # Nos permite mostar más de una salida por celda
InteractiveShell.ast_node_interactivity = "all" # Nos permite mostar más de una salida por celda

import requests
import pandas as pd
import numpy as np
import ast 

from datetime import datetime, timedelta

pd.options.display.max_columns=None

## Introducción:
Tendréis que usar el csv attacks_limpieza_completa que tenéis adjunto abajo.
En la lección de hoy aprendimos como transformar nuestros datos para que estén preparados para almacearlos en una BBDD. En este momento tenemos dos fuentes de datos:

- El csv con los ataques de tiburones que hemos estado limpiando hasta ahora, el que os hemos adjuntado (attacks_limpieza_completa). Sentiros libres de usar vuestros propios csv en caso de que queráis.
- El csv con los datos climáticos de los principales paises que tienen ataques de tiburones, el que creamos en el pair programming de ayer.

**El objetivo de la sesión de hoy será juntar en un único csv la información de ambas fuentes**. 

Para ello:
1. Cargaremos los dos ficheros de datos del dataframe de los ataques nos quedaremos solo con las filas de los países que seleccionamos en la lección de ayer:
    - USA
    - Australia
    - New Zealand
    - South Africa
    - Papua New Guinea  
  

In [63]:
clima = pd.read_csv('datos/ETL-1.csv', index_col = 0)
clima.head(2)

Unnamed: 0,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 320, 'speed':...",13,15,5,1024,none,0,0,265,2,USA
1,6,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 355, 'speed':...",12,15,5,1024,none,0,0,345,2,USA


In [64]:
df = pd.read_csv('datos/Limpieza-4.csv', index_col=0)
df.head(2)

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F
1,2018,Unprovoked,brazil,Swimming,18.0,Tiger Shark,Jun,Y,M


### Nos quedamos con los países que nos piden:

In [65]:
list_country = ['usa', 'australia', 'south africa', 'new zealand', 'papua new guinea']

In [66]:
df_country = df[df['country'].isin(list_country)]
df_country.head()

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F
2,2018,Unprovoked,usa,Walking,15.0,Bull Shark,May,N,M
3,2018,Provoked,australia,Feeding sharks,32.0,Grey Shark,May,N,M
6,2018,Unprovoked,australia,Surfing,60.0,Unspecified,Apr,N,M
8,2018,Unprovoked,south africa,Paddle-skiing,33.0,White Shark,Apr,N,M


In [67]:
df_country['country'].unique()

array(['usa', 'australia', 'south africa', 'new zealand',
       'papua new guinea'], dtype=object)

### Desempaquetamos la columna 'hr_profile':

In [68]:
clima['rh_profile'] = clima['rh_profile'].apply(ast.literal_eval)

In [69]:
df_rh = clima['rh_profile'].apply(pd.Series)

In [70]:
df_rh.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'rh': 4}","{'layer': '900mb', 'rh': 3}","{'layer': '850mb', 'rh': 1}","{'layer': '800mb', 'rh': 0}","{'layer': '750mb', 'rh': 0}","{'layer': '700mb', 'rh': 0}","{'layer': '650mb', 'rh': 0}","{'layer': '600mb', 'rh': -1}","{'layer': '550mb', 'rh': 1}","{'layer': '500mb', 'rh': 4}","{'layer': '450mb', 'rh': 5}","{'layer': '400mb', 'rh': 10}","{'layer': '350mb', 'rh': 11}","{'layer': '300mb', 'rh': 14}","{'layer': '250mb', 'rh': 14}","{'layer': '200mb', 'rh': 5}"
1,"{'layer': '950mb', 'rh': 3}","{'layer': '900mb', 'rh': 3}","{'layer': '850mb', 'rh': 1}","{'layer': '800mb', 'rh': 0}","{'layer': '750mb', 'rh': -1}","{'layer': '700mb', 'rh': -1}","{'layer': '650mb', 'rh': -2}","{'layer': '600mb', 'rh': -1}","{'layer': '550mb', 'rh': 2}","{'layer': '500mb', 'rh': 5}","{'layer': '450mb', 'rh': 9}","{'layer': '400mb', 'rh': 10}","{'layer': '350mb', 'rh': 12}","{'layer': '300mb', 'rh': 10}","{'layer': '250mb', 'rh': 13}","{'layer': '200mb', 'rh': 6}"


In [71]:
for i in range(len(df_rh.columns)):
    nombre_rh = "rh_"+ str(df_rh[i].apply(pd.Series)["layer"][0])
    valores_rh = (df_rh[i].apply(pd.Series)["rh"])

    clima.insert(i, nombre_rh, valores_rh)

In [72]:
clima.head()

Unnamed: 0,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,4,3,1,0,0,0,0,-1,1,4,5,10,11,14,14,5,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 320, 'speed':...",13,15,5,1024,none,0,0,265,2,USA
1,3,3,1,0,-1,-1,-2,-1,2,5,9,10,12,10,13,6,6,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 355, 'speed':...",12,15,5,1024,none,0,0,345,2,USA
2,4,4,4,1,-1,-2,-2,-1,2,7,11,9,10,11,14,10,9,2,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",13,15,4,1024,none,0,0,260,2,USA
3,4,5,7,6,4,3,0,-1,2,8,11,9,12,14,13,13,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",15,10,2,1024,none,0,0,175,2,USA
4,5,7,9,13,9,7,4,1,3,10,11,12,15,15,16,14,15,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 265, 'speed':...",17,6,3,1024,none,0,0,350,2,USA


### Desempaquetamos la columna 'wind_profile':

In [73]:
clima['wind_profile'] = clima['wind_profile'].apply(ast.literal_eval)

In [74]:
df_wind = clima['wind_profile'].apply(pd.Series)
df_wind.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'direction': 320, 'speed': 2}","{'layer': '900mb', 'direction': 295, 'speed': 2}","{'layer': '850mb', 'direction': 290, 'speed': 3}","{'layer': '800mb', 'direction': 285, 'speed': 4}","{'layer': '750mb', 'direction': 280, 'speed': 4}","{'layer': '700mb', 'direction': 270, 'speed': 5}","{'layer': '650mb', 'direction': 275, 'speed': 5}","{'layer': '600mb', 'direction': 285, 'speed': 5}","{'layer': '550mb', 'direction': 290, 'speed': 5}","{'layer': '500mb', 'direction': 290, 'speed': 5}","{'layer': '450mb', 'direction': 290, 'speed': 5}","{'layer': '400mb', 'direction': 275, 'speed': 5}","{'layer': '350mb', 'direction': 265, 'speed': 5}","{'layer': '300mb', 'direction': 270, 'speed': 5}","{'layer': '250mb', 'direction': 265, 'speed': 5}","{'layer': '200mb', 'direction': 255, 'speed': 6}"
1,"{'layer': '950mb', 'direction': 355, 'speed': 2}","{'layer': '900mb', 'direction': 310, 'speed': 3}","{'layer': '850mb', 'direction': 305, 'speed': 4}","{'layer': '800mb', 'direction': 285, 'speed': 5}","{'layer': '750mb', 'direction': 275, 'speed': 5}","{'layer': '700mb', 'direction': 270, 'speed': 5}","{'layer': '650mb', 'direction': 270, 'speed': 5}","{'layer': '600mb', 'direction': 275, 'speed': 5}","{'layer': '550mb', 'direction': 280, 'speed': 5}","{'layer': '500mb', 'direction': 280, 'speed': 6}","{'layer': '450mb', 'direction': 275, 'speed': 5}","{'layer': '400mb', 'direction': 255, 'speed': 5}","{'layer': '350mb', 'direction': 240, 'speed': 5}","{'layer': '300mb', 'direction': 250, 'speed': 6}","{'layer': '250mb', 'direction': 255, 'speed': 6}","{'layer': '200mb', 'direction': 255, 'speed': 6}"


In [75]:
for i in range(len(df_wind.columns)): 

    nombre = "wind_direction_" + str(df_wind[i].apply(pd.Series)["layer"][0]) 

    valores = list(df_wind[i].apply(pd.Series)["direction"])

    clima.insert(i, nombre, valores)

In [76]:
clima.head()

Unnamed: 0,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,320,295,290,285,280,270,275,285,290,290,290,275,265,270,265,255,4,3,1,0,0,0,0,-1,1,4,5,10,11,14,14,5,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 320, 'speed':...",13,15,5,1024,none,0,0,265,2,USA
1,355,310,305,285,275,270,270,275,280,280,275,255,240,250,255,255,3,3,1,0,-1,-1,-2,-1,2,5,9,10,12,10,13,6,6,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 355, 'speed':...",12,15,5,1024,none,0,0,345,2,USA
2,5,30,10,325,295,285,270,260,255,260,255,240,240,245,255,260,4,4,4,1,-1,-2,-2,-1,2,7,11,9,10,11,14,10,9,2,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",13,15,4,1024,none,0,0,260,2,USA
3,5,345,330,310,285,270,265,250,250,255,245,240,245,245,250,255,4,5,7,6,4,3,0,-1,2,8,11,9,12,14,13,13,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",15,10,2,1024,none,0,0,175,2,USA
4,265,265,270,285,280,270,275,270,255,255,250,255,250,250,250,255,5,7,9,13,9,7,4,1,3,10,11,12,15,15,16,14,15,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 265, 'speed':...",17,6,3,1024,none,0,0,350,2,USA


In [77]:
for i in range(len(df_wind.columns)): 

    nombre = "wind_speed_" + str(df_wind[i].apply(pd.Series)["layer"][0]) 

    valores = list(df_wind[i].apply(pd.Series)["speed"])

    clima.insert(i, nombre, valores)

In [78]:
clima.head()

Unnamed: 0,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,2,2,3,4,4,5,5,5,5,5,5,5,5,5,5,6,320,295,290,285,280,270,275,285,290,290,290,275,265,270,265,255,4,3,1,0,0,0,0,-1,1,4,5,10,11,14,14,5,3,9,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 320, 'speed':...",13,15,5,1024,none,0,0,265,2,USA
1,2,3,4,5,5,5,5,5,5,6,5,5,5,6,6,6,355,310,305,285,275,270,270,275,280,280,275,255,240,250,255,255,3,3,1,0,-1,-1,-2,-1,2,5,9,10,12,10,13,6,6,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 3}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 355, 'speed':...",12,15,5,1024,none,0,0,345,2,USA
2,2,2,2,3,3,5,5,5,5,6,6,6,6,6,6,6,5,30,10,325,295,285,270,260,255,260,255,240,240,245,255,260,4,4,4,1,-1,-2,-2,-1,2,7,11,9,10,11,14,10,9,2,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",13,15,4,1024,none,0,0,260,2,USA
3,2,3,3,3,3,4,5,6,6,6,6,6,6,6,7,7,5,345,330,310,285,270,265,250,250,255,245,240,245,245,250,255,4,5,7,6,4,3,0,-1,2,8,11,9,12,14,13,13,12,3,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 4}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 5, 'speed': 2...",15,10,2,1024,none,0,0,175,2,USA
4,2,3,3,3,4,5,6,6,6,6,6,6,7,7,7,7,265,265,270,285,280,270,275,270,255,255,250,255,250,250,250,255,5,7,9,13,9,7,4,1,3,10,11,12,15,15,16,14,15,6,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 265, 'speed':...",17,6,3,1024,none,0,0,350,2,USA


In [79]:
clima.columns

Index(['wind_speed_950mb', 'wind_speed_900mb', 'wind_speed_850mb',
       'wind_speed_800mb', 'wind_speed_750mb', 'wind_speed_700mb',
       'wind_speed_650mb', 'wind_speed_600mb', 'wind_speed_550mb',
       'wind_speed_500mb', 'wind_speed_450mb', 'wind_speed_400mb',
       'wind_speed_350mb', 'wind_speed_300mb', 'wind_speed_250mb',
       'wind_speed_200mb', 'wind_direction_950mb', 'wind_direction_900mb',
       'wind_direction_850mb', 'wind_direction_800mb', 'wind_direction_750mb',
       'wind_direction_700mb', 'wind_direction_650mb', 'wind_direction_600mb',
       'wind_direction_550mb', 'wind_direction_500mb', 'wind_direction_450mb',
       'wind_direction_400mb', 'wind_direction_350mb', 'wind_direction_300mb',
       'wind_direction_250mb', 'wind_direction_200mb', 'rh_950mb', 'rh_900mb',
       'rh_850mb', 'rh_800mb', 'rh_750mb', 'rh_700mb', 'rh_650mb', 'rh_600mb',
       'rh_550mb', 'rh_500mb', 'rh_450mb', 'rh_400mb', 'rh_350mb', 'rh_300mb',
       'rh_250mb', 'rh_200mb', 'timep

### Borramos las dos columnas que hemos desempaquetado, para no tener información repetida.

In [80]:
clima.drop(['rh_profile', 'wind_profile'], axis = 1, inplace=True)

In [81]:
clima.head()

Unnamed: 0,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country
0,2,2,3,4,4,5,5,5,5,5,5,5,5,5,5,6,320,295,290,285,280,270,275,285,290,290,290,275,265,270,265,255,4,3,1,0,0,0,0,-1,1,4,5,10,11,14,14,5,3,9,-9999,-9999,-9999,13,15,5,1024,none,0,0,265,2,USA
1,2,3,4,5,5,5,5,5,5,6,5,5,5,6,6,6,355,310,305,285,275,270,270,275,280,280,275,255,240,250,255,255,3,3,1,0,-1,-1,-2,-1,2,5,9,10,12,10,13,6,6,6,-9999,-9999,-9999,12,15,5,1024,none,0,0,345,2,USA
2,2,2,2,3,3,5,5,5,5,6,6,6,6,6,6,6,5,30,10,325,295,285,270,260,255,260,255,240,240,245,255,260,4,4,4,1,-1,-2,-2,-1,2,7,11,9,10,11,14,10,9,2,-9999,-9999,-9999,13,15,4,1024,none,0,0,260,2,USA
3,2,3,3,3,3,4,5,6,6,6,6,6,6,6,7,7,5,345,330,310,285,270,265,250,250,255,245,240,245,245,250,255,4,5,7,6,4,3,0,-1,2,8,11,9,12,14,13,13,12,3,-9999,-9999,-9999,15,10,2,1024,none,0,0,175,2,USA
4,2,3,3,3,4,5,6,6,6,6,6,6,7,7,7,7,265,265,270,285,280,270,275,270,255,255,250,255,250,250,250,255,5,7,9,13,9,7,4,1,3,10,11,12,15,15,16,14,15,6,-9999,-9999,-9999,17,6,3,1024,none,0,0,350,2,USA


In [82]:
clima['country'] = clima['country'].str.lower()

In [83]:
clima['country'].unique()

array(['usa', 'australia', 'south africa', 'new zealand',
       'papua new guinea'], dtype=object)

In [84]:
clima.to_csv('datos/clima-clase.csv')

In [85]:
df_groupby = clima.groupby(["country"])[clima.columns].mean().reset_index()

  df_groupby = clima.groupby(["country"])[clima.columns].mean().reset_index()


In [86]:
df_groupby.head()

Unnamed: 0,country,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed
0,australia,3.546875,3.5625,3.5,3.421875,3.390625,3.515625,3.46875,3.640625,3.828125,3.875,3.875,3.8125,3.78125,4.03125,4.75,5.15625,114.0625,115.46875,114.0625,110.3125,112.8125,115.78125,129.21875,137.03125,144.84375,145.15625,139.609375,141.5625,121.40625,127.5,147.109375,158.203125,13.53125,11.53125,8.421875,6.265625,5.375,4.71875,4.09375,3.859375,3.90625,4.828125,4.6875,4.109375,3.1875,2.953125,3.671875,5.375,97.5,3.90625,-9999.0,-9999.0,-9999.0,25.875,-3.78125,10.71875,1015.640625,2.703125,0.0,114.84375,3.3125
1,new zealand,3.265625,3.296875,3.34375,3.1875,3.265625,3.3125,3.421875,3.546875,3.625,3.8125,4.015625,4.234375,4.546875,4.984375,5.828125,6.5625,157.578125,168.4375,185.390625,187.03125,188.4375,186.796875,235.390625,259.453125,263.359375,266.328125,264.921875,257.890625,250.390625,235.859375,243.125,254.84375,12.390625,13.34375,8.203125,2.203125,1.640625,1.734375,1.484375,0.90625,0.515625,0.609375,0.546875,1.734375,2.78125,3.421875,2.4375,-0.328125,97.5,6.4375,-9999.0,-9999.0,-9999.0,15.0625,10.546875,9.84375,1019.03125,3.890625,0.0,148.515625,2.96875
2,papua new guinea,3.78125,4.1875,4.234375,4.09375,3.359375,2.59375,2.40625,2.765625,3.421875,3.890625,4.296875,4.25,4.390625,4.53125,5.265625,5.625,84.140625,82.03125,81.875,81.40625,78.984375,73.515625,176.796875,229.765625,255.546875,261.484375,259.921875,246.25,233.28125,216.09375,198.515625,218.984375,13.578125,12.546875,9.40625,6.453125,3.46875,1.78125,1.15625,0.90625,-0.359375,-1.328125,0.53125,0.796875,2.203125,2.71875,3.71875,2.515625,97.5,3.84375,-9999.0,-9999.0,-9999.0,25.953125,-0.53125,11.0625,1010.28125,2.25,0.0,85.546875,3.234375
3,south africa,2.46875,2.265625,2.15625,2.015625,2.0625,2.125,2.28125,2.546875,2.5625,3.046875,3.546875,3.71875,3.953125,4.1875,4.609375,5.28125,153.515625,141.328125,148.203125,169.84375,199.609375,204.296875,209.140625,203.4375,216.640625,230.078125,228.359375,233.515625,219.21875,217.421875,218.4375,208.125,13.078125,9.828125,6.359375,7.03125,8.234375,8.46875,9.921875,7.359375,4.21875,3.203125,1.75,2.453125,4.75,5.40625,8.34375,10.140625,97.5,6.375,-9999.0,-9999.0,-9999.0,23.8125,2.390625,9.90625,1019.40625,1.109375,0.0,139.140625,2.453125
4,usa,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875


In [87]:
df_completo = df_country.merge(df_groupby, how='outer', on = 'country', right_on= None, left_on=None)

In [88]:
df_groupby.shape

(5, 62)

In [89]:
df_country.shape

(383, 9)

In [90]:
df_completo.shape

(383, 70)

In [91]:
df_completo.head()

Unnamed: 0,year,type,country,activity,age,species,month,fatal,sex,wind_speed_950mb,wind_speed_900mb,wind_speed_850mb,wind_speed_800mb,wind_speed_750mb,wind_speed_700mb,wind_speed_650mb,wind_speed_600mb,wind_speed_550mb,wind_speed_500mb,wind_speed_450mb,wind_speed_400mb,wind_speed_350mb,wind_speed_300mb,wind_speed_250mb,wind_speed_200mb,wind_direction_950mb,wind_direction_900mb,wind_direction_850mb,wind_direction_800mb,wind_direction_750mb,wind_direction_700mb,wind_direction_650mb,wind_direction_600mb,wind_direction_550mb,wind_direction_500mb,wind_direction_450mb,wind_direction_400mb,wind_direction_350mb,wind_direction_300mb,wind_direction_250mb,wind_direction_200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed
0,2018,Boating,usa,Paddling,57.0,White Shark,Jun,N,F,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875
1,2018,Unprovoked,usa,Walking,15.0,Bull Shark,May,N,M,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875
2,2018,Unprovoked,usa,Stand-Up Paddleboarding,25.0,Tiger Shark,Mar,N,M,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875
3,2017,Unprovoked,usa,Surfing,54.0,Tiger Shark,Dec,N,F,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875
4,2017,Unprovoked,usa,Spearfishing,25.0,White Shark,Nov,N,M,2.515625,2.796875,3.09375,3.453125,3.671875,3.96875,4.328125,4.65625,5.0,5.34375,5.578125,5.796875,6.046875,6.515625,7.140625,6.828125,227.890625,245.390625,284.53125,303.828125,290.15625,293.90625,273.59375,277.34375,276.796875,266.25,265.703125,259.765625,255.0,248.828125,256.5625,275.703125,4.1875,4.625,5.234375,4.984375,4.21875,4.078125,3.296875,2.671875,4.09375,4.890625,4.875,4.59375,6.0,7.859375,8.3125,6.46875,97.5,4.921875,-9999.0,-9999.0,-9999.0,12.75,12.078125,4.296875,1019.296875,0.21875,0.0,238.515625,2.296875



4. Guardad los resultados obtenidos en un csv que usaremos en próximos ejercicios de pair programming.

In [92]:
df_completo.to_csv('datos/ETL-2.csv', index=False)