# Caso de Estudio COVID-19

In [136]:
# Librerías
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
import seaborn as sns

# para que el fonde las gráficas esté por default 
# esto en caso no hayan datos, saldrá en blanco la gráfica
sns.set()

### Reombrando las columnas

In [137]:
def nombra_columnas(data):
    # A la izquierda va el nombre original y la derecha el 
    # el nuevo nombre o título de la columna
    # se recomienda que las variables comiencen con minúscula
    new = data.rename(
                columns = {'Province/State':'subregion',
                          'Country/Region':'país',
                          'Lat':'lat',
                          'Long':'long'})
    return new

### Formato tidy

In [138]:
def union_date_data(data, column_name):
    
    # id_vars: mantener como en original
    # var_name: respecto a que variable se reduce el dataframe
    # value_name: el nuevo nombre que tendrá la columna
    new = data.melt(id_vars = ['país', 'subregion','lat','long'],
                   var_name = 'date_RAW',
                   value_name = column_name)
    return new

### Fechas en formato datetime

In [139]:
def convert_date(data):
    new = data.assign(
    date = pd.to_datetime(data.date_RAW,format='%m/%d/%y'))
    new.drop(columns = ['date_RAW'],inplace = True)
    return new

### Reordenar datos

In [140]:
def reordenar_data(data,column_name):
    new = (data.filter(
    ['país','subregion','date','lat','long',column_name])
          .sort_values(['país','subregion','date','lat','long'])
          .reset_index(drop = True))
    return new

### Obtener los datos

In [187]:
def get_data(url,var_name):
    data_inprocess = pd.read_csv(url)
    data_inprocess = nombra_columnas(data_inprocess)
    data_inprocess = union_date_data(data_inprocess,var_name)
    data_inprocess = convert_date(data_inprocess)
    data_inprocess = reordenar_data(data_inprocess, var_name)
    
    return data_inprocess

In [188]:
confirmados = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv'
muertos = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_deaths_global.csv&filename=time_series_covid19_deaths_global.csv'
recuperados = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_recovered_global.csv&filename=time_series_covid19_recovered_global.csv'


confirmados = get_data(confirmados,'confirmados')
muertos = get_data(muertos,'muertes')
recuperados = get_data(recuperados,'recuperados')

In [189]:
confirmados

Unnamed: 0,país,subregion,date,lat,long,confirmados
0,Afghanistan,,2020-01-22,33.939110,67.709953,0
1,Afghanistan,,2020-01-23,33.939110,67.709953,0
2,Afghanistan,,2020-01-24,33.939110,67.709953,0
3,Afghanistan,,2020-01-25,33.939110,67.709953,0
4,Afghanistan,,2020-01-26,33.939110,67.709953,0
...,...,...,...,...,...,...
290151,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893
290152,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893
290153,Zimbabwe,,2022-10-19,-19.015438,29.154857,257893
290154,Zimbabwe,,2022-10-20,-19.015438,29.154857,257893


In [190]:
muertos

Unnamed: 0,país,subregion,date,lat,long,muertes
0,Afghanistan,,2020-01-22,33.939110,67.709953,0
1,Afghanistan,,2020-01-23,33.939110,67.709953,0
2,Afghanistan,,2020-01-24,33.939110,67.709953,0
3,Afghanistan,,2020-01-25,33.939110,67.709953,0
4,Afghanistan,,2020-01-26,33.939110,67.709953,0
...,...,...,...,...,...,...
290151,Zimbabwe,,2022-10-17,-19.015438,29.154857,5606
290152,Zimbabwe,,2022-10-18,-19.015438,29.154857,5606
290153,Zimbabwe,,2022-10-19,-19.015438,29.154857,5606
290154,Zimbabwe,,2022-10-20,-19.015438,29.154857,5606


In [191]:
recuperados

Unnamed: 0,país,subregion,date,lat,long,recuperados
0,Afghanistan,,2020-01-22,33.939110,67.709953,0
1,Afghanistan,,2020-01-23,33.939110,67.709953,0
2,Afghanistan,,2020-01-24,33.939110,67.709953,0
3,Afghanistan,,2020-01-25,33.939110,67.709953,0
4,Afghanistan,,2020-01-26,33.939110,67.709953,0
...,...,...,...,...,...,...
275091,Zimbabwe,,2022-10-17,-19.015438,29.154857,0
275092,Zimbabwe,,2022-10-18,-19.015438,29.154857,0
275093,Zimbabwe,,2022-10-19,-19.015438,29.154857,0
275094,Zimbabwe,,2022-10-20,-19.015438,29.154857,0


In [192]:
print(len(confirmados))
print(len(muertos))
print(len(recuperados))

290156
290156
275096


### 3. Fusión (Merge) de los tres archivos en uno solo, eliminando columnas repetidas

In [193]:
muertos.drop(columns = ['lat','long'], inplace = True)
recuperados.drop(columns = ['lat','long'], inplace = True)

In [194]:
muertos

Unnamed: 0,país,subregion,date,muertes
0,Afghanistan,,2020-01-22,0
1,Afghanistan,,2020-01-23,0
2,Afghanistan,,2020-01-24,0
3,Afghanistan,,2020-01-25,0
4,Afghanistan,,2020-01-26,0
...,...,...,...,...
290151,Zimbabwe,,2022-10-17,5606
290152,Zimbabwe,,2022-10-18,5606
290153,Zimbabwe,,2022-10-19,5606
290154,Zimbabwe,,2022-10-20,5606


In [195]:
recuperados

Unnamed: 0,país,subregion,date,recuperados
0,Afghanistan,,2020-01-22,0
1,Afghanistan,,2020-01-23,0
2,Afghanistan,,2020-01-24,0
3,Afghanistan,,2020-01-25,0
4,Afghanistan,,2020-01-26,0
...,...,...,...,...
275091,Zimbabwe,,2022-10-17,0
275092,Zimbabwe,,2022-10-18,0
275093,Zimbabwe,,2022-10-19,0
275094,Zimbabwe,,2022-10-20,0


In [196]:
# left indica que confirmados se quedará igual
# agregará Nan para los valores desconocidos cuando no hayan datos (en este caso en recuperados)
covid_19 = (confirmados
           .merge(muertos, on = ['país','subregion','date'], how = 'left')
           .merge(recuperados, on = ['país','subregion','date'], how = 'left'))

In [197]:
covid_19

Unnamed: 0,país,subregion,date,lat,long,confirmados,muertes,recuperados
0,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0
1,Afghanistan,,2020-01-23,33.939110,67.709953,0,0,0.0
2,Afghanistan,,2020-01-24,33.939110,67.709953,0,0,0.0
3,Afghanistan,,2020-01-25,33.939110,67.709953,0,0,0.0
4,Afghanistan,,2020-01-26,33.939110,67.709953,0,0,0.0
...,...,...,...,...,...,...,...,...
290151,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893,5606,0.0
290152,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893,5606,0.0
290153,Zimbabwe,,2022-10-19,-19.015438,29.154857,257893,5606,0.0
290154,Zimbabwe,,2022-10-20,-19.015438,29.154857,257893,5606,0.0


In [198]:
len(covid_19)

290156

### 4. Realización de un Análisis Exploratorio de Datos.

In [199]:
covid_19.head()

Unnamed: 0,país,subregion,date,lat,long,confirmados,muertes,recuperados
0,Afghanistan,,2020-01-22,33.93911,67.709953,0,0,0.0
1,Afghanistan,,2020-01-23,33.93911,67.709953,0,0,0.0
2,Afghanistan,,2020-01-24,33.93911,67.709953,0,0,0.0
3,Afghanistan,,2020-01-25,33.93911,67.709953,0,0,0.0
4,Afghanistan,,2020-01-26,33.93911,67.709953,0,0,0.0


**Datos faltantes** 

In [200]:
pd.set_option('display.max_rows', 180)
(covid_19
.filter(['país'])
.drop_duplicates())

Unnamed: 0,país
0,Afghanistan
1004,Albania
2008,Algeria
3012,Andorra
4016,Angola
...,...
285136,West Bank and Gaza
286140,Winter Olympics 2022
287144,Yemen
288148,Zambia


 **Validando los datos**

In [201]:
covid_19.filter(['long','lat']).agg(['min','max'])

Unnamed: 0,long,lat
min,-178.1165,-71.9499
max,178.065,71.7069


In [202]:
covid_19.filter(['confirmados','muertes','recuperados']).describe()

Unnamed: 0,confirmados,muertes,recuperados
count,290156.0,290156.0,274092.0
mean,777661.3,12010.78,84742.38
std,4158791.0,58870.12,751295.3
min,0.0,0.0,-1.0
25%,432.0,2.0,0.0
50%,10339.5,115.0,0.0
75%,175264.8,2495.25,1599.0
max,97185560.0,1067673.0,30974750.0


In [203]:
covid_19.query('confirmados < 0 or muertes < 0 or recuperados < 0 ')

Unnamed: 0,país,subregion,date,lat,long,confirmados,muertes,recuperados
277065,United Kingdom,Pitcairn Islands,2022-09-13,-24.3768,-128.3242,4,0,-1.0
277066,United Kingdom,Pitcairn Islands,2022-09-14,-24.3768,-128.3242,4,0,-1.0
277067,United Kingdom,Pitcairn Islands,2022-09-15,-24.3768,-128.3242,4,0,-1.0
277068,United Kingdom,Pitcairn Islands,2022-09-16,-24.3768,-128.3242,4,0,-1.0
277069,United Kingdom,Pitcairn Islands,2022-09-17,-24.3768,-128.3242,4,0,-1.0
277070,United Kingdom,Pitcairn Islands,2022-09-18,-24.3768,-128.3242,4,0,-1.0
277071,United Kingdom,Pitcairn Islands,2022-09-19,-24.3768,-128.3242,4,0,-1.0
277072,United Kingdom,Pitcairn Islands,2022-09-20,-24.3768,-128.3242,4,0,-1.0


In [204]:
covid_19 = covid_19.query("not(confirmados < 0 or muertes < 0 or recuperados < 0 )")

In [205]:
print(len(covid_19))

290148


In [206]:
covid_19.filter(['confirmados','muertes','recuperados']).describe()

Unnamed: 0,confirmados,muertes,recuperados
count,290148.0,290148.0,274084.0
mean,777682.7,12011.11,84744.85
std,4158846.0,58870.89,751306.1
min,0.0,0.0,0.0
25%,432.0,2.0,0.0
50%,10343.0,115.0,0.0
75%,175270.0,2496.0,1599.0
max,97185560.0,1067673.0,30974750.0


In [207]:
covid_19.isnull().sum()

país                0
subregion      198792
date                0
lat              2008
long             2008
confirmados         0
muertes             0
recuperados     16064
dtype: int64

In [211]:
covid_19 = covid_19.fillna(value = {'subregion':'','lat':np.NaN,'long':np.NaN,'recuperados':0})

In [212]:
casos_nuevos = (covid_19
               .sort_values(by = ['país','subregion','date'])
               .filter(['país','subregion','date','confirmados'])
               .groupby(['país','subregion'])
               .diff())

In [213]:
casos_nuevos

Unnamed: 0,date,confirmados
0,NaT,
1,1 days,0.0
2,1 days,0.0
3,1 days,0.0
4,1 days,0.0
...,...,...
290151,1 days,0.0
290152,1 days,0.0
290153,1 days,0.0
290154,1 days,0.0


In [216]:
covid_19 = covid_19.assign(casos_nuevos = 'casos_nuevos')

In [217]:
covid_19

Unnamed: 0,país,subregion,date,lat,long,confirmados,muertes,recuperados,casos_nuevos
0,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0,casos_nuevos
1,Afghanistan,,2020-01-23,33.939110,67.709953,0,0,0.0,casos_nuevos
2,Afghanistan,,2020-01-24,33.939110,67.709953,0,0,0.0,casos_nuevos
3,Afghanistan,,2020-01-25,33.939110,67.709953,0,0,0.0,casos_nuevos
4,Afghanistan,,2020-01-26,33.939110,67.709953,0,0,0.0,casos_nuevos
...,...,...,...,...,...,...,...,...,...
290151,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893,5606,0.0,casos_nuevos
290152,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893,5606,0.0,casos_nuevos
290153,Zimbabwe,,2022-10-19,-19.015438,29.154857,257893,5606,0.0,casos_nuevos
290154,Zimbabwe,,2022-10-20,-19.015438,29.154857,257893,5606,0.0,casos_nuevos


In [219]:
covid_19['infectados'] = covid_19['confirmados'] - covid_19['muertes']-covid_19['recuperados']

In [220]:
covid_19

Unnamed: 0,país,subregion,date,lat,long,confirmados,muertes,recuperados,casos_nuevos,infectados
0,Afghanistan,,2020-01-22,33.939110,67.709953,0,0,0.0,casos_nuevos,0.0
1,Afghanistan,,2020-01-23,33.939110,67.709953,0,0,0.0,casos_nuevos,0.0
2,Afghanistan,,2020-01-24,33.939110,67.709953,0,0,0.0,casos_nuevos,0.0
3,Afghanistan,,2020-01-25,33.939110,67.709953,0,0,0.0,casos_nuevos,0.0
4,Afghanistan,,2020-01-26,33.939110,67.709953,0,0,0.0,casos_nuevos,0.0
...,...,...,...,...,...,...,...,...,...,...
290151,Zimbabwe,,2022-10-17,-19.015438,29.154857,257893,5606,0.0,casos_nuevos,252287.0
290152,Zimbabwe,,2022-10-18,-19.015438,29.154857,257893,5606,0.0,casos_nuevos,252287.0
290153,Zimbabwe,,2022-10-19,-19.015438,29.154857,257893,5606,0.0,casos_nuevos,252287.0
290154,Zimbabwe,,2022-10-20,-19.015438,29.154857,257893,5606,0.0,casos_nuevos,252287.0


### 5. Generación de nueva información calculando nuevas columnas como: