# Should I wear a mask?
#### P.D. You should always use a mask while still there are covid cases, regardless the countries mandates
This process will analyze the covid cases from the last days to tell you the risk and if you should wear a mask or now regardless the current mandate that your country has.

The idea is to have a certain measure of the risk of not wearing a mask today, analyzing the changes of the data.



## Get the csv files from the last month

In [16]:
# install requirements NOT MANDATORY IF YOU ALREADY HAVE THE LIBS
%pip install pandas --quiet
%pip install memory_profiler --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# import dependencies
import pandas as pd
from datetime import datetime, timedelta
from sys import getsizeof
from memory_profiler import profile
import logging
from math import trunc


In [2]:
# set logging level
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
def generate_date_range(days = 90):
    """ Generate a date range for the data extraction based on a number of days till today
    """
    date_range = pd.date_range(end=datetime.today()-timedelta(days=1), periods=days).tolist()
    return date_range


In [4]:
number_of_days = 30
date_range = generate_date_range(days=number_of_days)

In [5]:
def extract_covid_data(processed_date: datetime):
    file_name = ''.join([processed_date.strftime('%m-%d-%Y'), '.csv'])
    source_url = ''.join(['https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports_us/', file_name,'?raw=true']) 
    logger.debug(source_url)
    response  = pd.read_csv(source_url)
    return response

In [74]:
def merge_data(date_range: list):
    """ Given a list of dates, extracts all the data day by day, and creates a unified dataframe
    containing all the timeseries.
    """
    combined_dataframe = pd.DataFrame()
    for processed_date in date_range:
        new_dataframe = extract_covid_data(processed_date)
        combined_dataframe = pd.concat([combined_dataframe, new_dataframe])
    return combined_dataframe


covid_cases_df = merge_data(date_range)

In [75]:
covid_cases_df.loc[covid_cases_df['Province_State']=='District of Columbia'].head(10)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Hospitalization_Rate
10,District of Columbia,US,2022-02-19 04:31:19,38.8974,-77.0268,133697,1314,,,11.0,18943.987168,3011192.0,,0.982819,84000011.0,USA,426666.137678,
10,District of Columbia,US,2022-02-20 04:31:11,38.8974,-77.0268,133697,1314,,,11.0,18943.987168,3029777.0,,0.982819,84000011.0,USA,429299.510166,
10,District of Columbia,US,2022-02-21 04:31:10,38.8974,-77.0268,133697,1314,,,11.0,18943.987168,3029777.0,,0.982819,84000011.0,USA,429299.510166,
10,District of Columbia,US,2022-02-22 04:31:49,38.8974,-77.0268,133697,1314,,,11.0,18943.987168,3029777.0,,0.982819,84000011.0,USA,429299.510166,
10,District of Columbia,US,2022-02-23 04:31:17,38.8974,-77.0268,134066,1315,,,11.0,18996.272046,3029777.0,,0.98086,84000011.0,USA,429299.510166,
10,District of Columbia,US,2022-02-24 04:31:31,38.8974,-77.0268,134114,1317,,,11.0,19003.073331,3040995.0,,0.982,84000011.0,USA,430889.027119,
10,District of Columbia,US,2022-02-25 04:31:14,38.8974,-77.0268,134214,1317,,,11.0,19017.242674,3046166.0,,0.981269,84000011.0,USA,431621.723871,
10,District of Columbia,US,2022-02-26 04:31:25,38.8974,-77.0268,134326,1317,,,11.0,19033.112339,3051706.0,,0.980451,84000011.0,USA,432406.7055,
10,District of Columbia,US,2022-02-27 04:31:23,38.8974,-77.0268,134326,1317,,,11.0,19033.112339,3059329.0,,0.980451,84000011.0,USA,433486.834554,
10,District of Columbia,US,2022-02-28 04:31:05,38.8974,-77.0268,134326,1317,,,11.0,19033.112339,3059329.0,,0.980451,84000011.0,USA,433486.834554,


In [72]:
covid_cases_filtered_df = covid_cases_df.filter(items=['Province_State', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Incident_Rate', 'Case_Fatility_Ratio'])
covid_cases_filtered_df.head(10)

Unnamed: 0,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate
0,Alabama,2022-02-19 04:31:19,32.3182,-86.9023,1271455,17877,,,25931.205941
1,Alaska,2022-02-19 04:31:19,61.3707,-152.4044,235956,1141,,,32254.475118
2,American Samoa,2022-02-19 04:31:19,-14.271,-170.132,18,0,,,32.350245
3,Arizona,2022-02-19 04:31:19,33.7298,-111.4312,1962920,27513,,,26967.939542
4,Arkansas,2022-02-19 04:31:19,34.9697,-92.3731,812948,10271,,,26938.396264
5,California,2022-02-19 07:32:06,36.1162,-119.6816,8892479,83858,,,22506.096405
6,Colorado,2022-02-19 04:31:19,39.0598,-105.3111,1301611,11681,,,22602.37316
7,Connecticut,2022-02-19 04:31:19,41.5978,-72.7554,719327,10357,,,20175.851201
8,Delaware,2022-02-19 04:31:19,39.3185,-75.5071,254719,2660,,,26158.186172
9,Diamond Princess,2022-02-19 04:31:19,,,49,0,,,


In [69]:
# Check the dataframe
covid_cases_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1740 entries, 0 to 57
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  1740 non-null   object 
 1   Last_Update     1740 non-null   object 
 2   Lat             1680 non-null   float64
 3   Long_           1680 non-null   float64
 4   Confirmed       1740 non-null   int64  
 5   Deaths          1740 non-null   int64  
 6   Recovered       0 non-null      float64
 7   Active          0 non-null      float64
 8   Incident_Rate   1680 non-null   float64
dtypes: float64(5), int64(2), object(2)
memory usage: 135.9+ KB


In [65]:
# Group dataframe by state, and take in consideration the incident rate (covid cases per 100K persons, to see the trends equally between states)
total_mean_us = covid_cases_filtered_df['Incident_Rate'].mean()
print(f'US covid cases of the last {number_of_days} days: {trunc(total_mean_us)} per 100,000 persons')


US covid cases of the last 30 days: 23530 per 100,000 persons


In [99]:
state_required = 'District of Columbia'

#vcovid_cases_filtered_df = covid_cases_filtered_df.set_index('Province_State')
covid_cases_dc_df = covid_cases_filtered_df.loc[covid_cases_filtered_df['Province_State'] == state_required]
# covid_cases_dc_df.groupby(by=['Province_State'])[['Incident_Rate']].agg()
mean_dc = covid_cases_dc_df['Incident_Rate'].mean()
std_dc = covid_cases_dc_df['Incident_Rate'].std()
median_dc = covid_cases_dc_df['Incident_Rate'].median()
print(f'STD in DC: {std_dc}')
print(f'Median in DC: {median_dc}')
print(f'The mean of the Washington DC cases in the last {number_of_days} has been {trunc(mean_dc)} per 100,000 persons\n\n')
quantile90 = covid_cases_dc_df['Incident_Rate'].quantile(.9)
print(f'The quantile 90 for DC is:{quantile90}')
# def get_covid_general_statistics(covid_df):
#     covid_df = 1
#     return

STD in DC: 46.938885408497775
Median in DC: 19075.19528897668
The mean of the Washington DC cases in the last 30 has been 19046 per 100,000 persons


The quantile 90 for DC is:19075.195288976684


In [102]:
last_update_dc = covid_cases_dc_df.sort_values(by=['Last_Update'], ascending=False).head(1)
print(last_update_dc)
if last_update_dc.loc[last_update_dc['Incident_Rate']>=quantile90]:
    print('ALERT, you should use a mask everywhere, the cases are high again.')
else:
    print('Cases seem low, take precautions but its not too dangerous outside')

          Province_State          Last_Update      Lat    Long_  Confirmed  \
10  District of Columbia  2022-03-20 04:32:01  38.8974 -77.0268     134623   

    Deaths  Recovered  Active  Incident_Rate  
10    1319        NaN     NaN   19075.195289  


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().