# Ejercicio 1
Utiliza Pandas y el conjunto de datos público de COVID-19 proporcionado por la Universidad de Johns Hopkins para realizar las siguientes tareas: 
 
a)  Descarga los datos de COVID-19 en formato CSV o JSON desde la URL pública. 

b)  Carga los datos en un DataFrame de Pandas. 

c)  Calcula el promedio de casos confirmados por día en un país específico. 

d)  Encuentra los 10 países con la tasa de mortalidad más alta (número de muertes / número de casos confirmados) hasta la fecha.

## Importar librerias

In [1]:
import pandas as pd
import os
import re

## Cargar los datos

In [2]:
def load_dfs(path:str):
    dfs = []
    for archive in os.listdir(path):
        archive_path = os.path.join(path, archive)
        isFile = os.path.isfile(archive_path)
        if not isFile:
            dfs.extend(load_dfs(archive_path))
        else:
            if re.search(r".csv$", archive_path):
                dfs.append(pd.read_csv(archive_path))
    return dfs

def check_columns_df(valid_columns:set, df_columns:set):
    return len(valid_columns-df_columns)

def join_dfs(complete_df, df):
    if type(complete_df) == type(None): 
        return df
    return pd.concat([complete_df, df])

def load_dfs_df(dfs, amount_archives:int=1000000000000):
    complete_df = None
    big_df_columns = set()
    for df in dfs:
        df_columns = set(df.columns)
        if check_columns_df(big_df_columns, df_columns) != 0:
            difference_columns = list(valid_columns-df_columns)
            big_df_columns.update(difference_columns)
            for dif_col in difference_columns:
                complete_df[dif_col] = None
            del difference_columns
        complete_df = join_dfs(complete_df, df)
        del df
        amount_archives-=1
        if amount_archives == 0:
            return complete_df
    return complete_df


In [17]:
dfs = load_dfs("../data/DataEjercicio1/csse_covid_19_daily_reports")

In [18]:
complete_df = load_dfs_df(dfs, 10) # Yo lo sette a 10 files unicamente porque mi computadora tiene memoria limitada, sin embargo, se puede settear a infinito y pues se obtendria un dataframe completo

In [None]:
# complete_df[complete_df.Country_Region.str.contains(r"\bAngola\b", case=False, na=False)]

## Calcular el caso de confirmados por un pais dado

In [5]:
def get_avg_confirmed_case_by_country(df, country:str):
    return df[df.Country_Region.str.contains(country, case=False, na=False)].groupby("Last_Update")["Confirmed"].sum().mean()

In [6]:
get_avg_confirmed_case_by_country(complete_df, "Angola")

67839.22222222222

## Calcular los 10 paises con la tasa de mortalidad mas alta

In [7]:
complete_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Province/State,Country/Region,Last Update
0,,,,Afghanistan,2022-02-13 04:21:00,33.93911,67.709953,170152,7488,,,Afghanistan,437.090294,4.400771,,,
1,,,,Albania,2022-02-13 04:21:00,41.1533,20.1683,267551,3407,,,Albania,9297.067204,1.273402,,,
2,,,,Algeria,2022-02-13 04:21:00,28.0339,1.6596,261226,6715,,,Algeria,595.71217,2.570571,,,
3,,,,Andorra,2022-02-13 04:21:00,42.5063,1.5218,37140,148,,,Andorra,48068.336245,0.398492,,,
4,,,,Angola,2022-02-13 04:21:00,-11.2027,17.8739,98514,1898,,,Angola,299.741973,1.92663,,,


In [8]:
complete_df\
    .groupby(["Country_Region"])\
    .agg({"Deaths":"sum", "Confirmed":"sum"})\
    .apply(lambda r:r["Deaths"]/r["Confirmed"], axis=1)\
    .sort_values(ascending=False)\
    .iloc[:10]

Country_Region
Korea, North    6.000000
MS Zaandam      0.222222
Yemen           0.186818
Sudan           0.072193
Peru            0.065962
Mexico          0.061418
Syria           0.057930
Somalia         0.051289
Egypt           0.051207
Ecuador         0.044498
dtype: float64

In [10]:
complete_df[complete_df["Country_Region"]=="Korea, North"]

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Province/State,Country/Region,Last Update
4010,,,,"Korea, North",2022-02-13 04:21:00,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
4010,,,,"Korea, North",2022-04-17 04:20:59,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
4008,,,,"Korea, North",2021-05-03 04:20:39,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
4008,,,,"Korea, North",2021-02-12 05:23:29,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
367,,,,"Korea, North",2022-08-19 04:20:54,40.3399,127.5101,1,6,,,"Korea, North",0.003879,600.0,,,
4010,,,,"Korea, North",2021-11-27 04:22:45,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
4010,,,,"Korea, North",2022-02-15 04:21:09,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",0.0,0.0,,,
367,,,,"Korea, North",2022-05-24 04:20:57,40.3399,127.5101,1,6,,,"Korea, North",0.003879,600.0,,,
3109,,,,"Korea, North",2020-04-20 23:36:47,40.3399,127.5101,0,0,0.0,0.0,"Korea, North",,,,,
