In [8]:
import pandas as pd
import io, os, sys, types


from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell


In [10]:
def import_data(dir_name):
    col_to_rename = {"Country/Region":"Country_Region",
                     "Province/State":"Province_State",
                     "Last Update":"Last_Update",
                     "Long_":"Longitude",
                     "Lat":"Latitude"
                    }
    df_cols = ['Province_State', 'Country_Region', 'Last_Update', 'Confirmed','Deaths', 'Recovered','Active']
    df = pd.DataFrame(columns = df_cols)
    for file_name in os.listdir(dir_name):
        if file_name.endswith(".csv"):
            tmp = pd.read_csv(os.path.join(dir_name, file_name)).rename(columns=col_to_rename)
            tmp["Date"] = file_name[-14:-4]
            df = pd.concat(
                [df, tmp], 
                axis=0, 
                join='outer'
                          )
            
    return df

def preprocess_data(df):
    
    df.Deaths = df.Deaths.fillna(0)
    df.Confirmed = df.Confirmed.fillna(0)
    df.Recovered = df.Recovered.fillna(0)
    df.Active = df.Active.fillna(0)
    df.Province_State = df.Province_State.fillna("")  

    df.Last_Update = pd.to_datetime(pd.to_datetime(df.Last_Update).map(lambda x: x.strftime('%Y-%m-%d'))) 
    df = df.drop_duplicates(subset=["Admin2","Province_State","Country_Region","Date"], keep="first")
    
    return df


def clean_country_names(df):
    country_names_mistakes = {
                            "Mainland China":"China",
                            "Viet Nam":"Vietnam", 
                            "Taiwan*":"Taiwan", 
                            "Hong Kong SAR":"Hong Kong,", 
                            "Gambia, The":"Gambia",
                            "Guinea-Bissau":"Guinea", 
                            "Czechia":"Czech Republic",
                            "Bahamas The":"Bahamas",
                            "Korea, South":"South Korea"
                             }
    df.Country_Region = df.Country_Region.replace(country_names_mistakes)
        
    return df
        
    
def get_per_country_data(df):
    return df.groupby(["Country_Region","Date"])["Confirmed","Deaths","Recovered"].sum().reset_index()

def get_specific_countries(df_per_country, list_of_countries):
    return df_per_country[df_per_country.Country_Region.isin(list_of_countries)]


In [18]:
df = import_data("..\csse_covid_19_data\csse_covid_19_daily_reports")
df = clean_country_names(preprocess_data(df))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## US map of confirmed cases

In this notebook we would like to study the number of confirmed cases per state in the US.
We will also correlate the spread of the desease with the size of the population per state

In [19]:
import json 


def get_states_data(df_usa):
    print("1")
    df_usa = df[df.Country_Region == "US"]
    
    city_to_code = pd.read_csv("../additional_data/us_city_to_code.csv")
    city_to_code = city_to_code.loc[:,["city","state_abbr"]].set_index("city").to_dict()["state_abbr"]
    
    with open("../additional_data/us_state_to_code.json") as json_file:
        province_to_code = dict([[v,k] for k,v in json.load(json_file).items()])
    
    # get state code using province_state name 
    df_usa["Code"] = [el.split(",")[-1].strip() for el in df_usa.Province_State]
    df_usa["Code"] = df_usa["Code"].replace(city_to_code)
    df_usa["Code"] = df_usa["Code"].replace(province_to_code)
    
    print("1")
    error = []
    for province in df_usa.Code:
        if len(province) > 2:
            error.append(province)
    error = list(set(error))

    df_usa = df_usa[~ df_usa.Code.isin(error)]

    code_province = pd.read_csv("https://raw.githubusercontent.com/scpike/us-state-county-zip/master/geo-data.csv")
    code_province = code_province.loc[:,["city","state_abbr"]].set_index("city").to_dict()["state_abbr"]
    df_usa = df_usa.groupby(["Code","Date"])["Confirmed"].sum().reset_index().sort_values(by="Date")
    
    return df_usa

df_usa = get_states_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
df_usa

Unnamed: 0,Code,Date,Confirmed
1574,VA,01-22-2020,1.0
1575,VA,01-23-2020,1.0
437,IL,01-24-2020,1.0
1576,VA,01-24-2020,1.0
438,IL,01-25-2020,1.0
...,...,...,...
519,IL,04-15-2020,38973.0
761,MN,04-15-2020,1809.0
1338,PR,04-15-2020,974.0
566,KY,04-15-2020,2210.0
