<a href="https://colab.research.google.com/github/Arohrba5/Synthetic-Control-Causal-Inference-with-Covid-Data/blob/main/MITx_IDS_S24x_SyntheticInterventionRecitation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load Data

In [24]:
# Raw-content GitHub locations of the two CSV inputs
data_file_url_1 = "https://raw.githubusercontent.com/Arohrba5/Synthetic-Control-Causal-Inference-with-Covid-Data/main/data_global_data.csv"
data_file_url_2 = "https://raw.githubusercontent.com/Arohrba5/Synthetic-Control-Causal-Inference-with-Covid-Data/main/data_global_mobility_report.csv"

# Read both CSVs (deaths data first, mobility report second) into DataFrames
covid_df, mobility_df = (
    pd.read_csv(csv_url) for csv_url in (data_file_url_1, data_file_url_2)
)

# Preview the top rows of each frame, one after the other
print(covid_df.head(), mobility_df.head(), sep="\n")

       country      date  deaths  daily_deaths
0  Afghanistan  01/23/20       0           0.0
1  Afghanistan  01/24/20       0           0.0
2  Afghanistan  01/25/20       0           0.0
3  Afghanistan  01/26/20       0           0.0
4  Afghanistan  01/27/20       0           0.0
  country_region_code        country_region  sub_region_1  sub_region_2  \
0                  AE  United Arab Emirates           NaN           NaN   
1                  AE  United Arab Emirates           NaN           NaN   
2                  AE  United Arab Emirates           NaN           NaN   
3                  AE  United Arab Emirates           NaN           NaN   
4                  AE  United Arab Emirates           NaN           NaN   

  metro_area  iso_3166_2_code  census_fips_code                     place_id  \
0        NaN              NaN               NaN  ChIJvRKrsd9IXj4RpwoIwFYv0zM   
1        NaN              NaN               NaN  ChIJvRKrsd9IXj4RpwoIwFYv0zM   
2        NaN              N

Data Preprocessing - Covid Dataset

In [25]:
# Quick look at the first five rows of the covid deaths data
print(covid_df.head(n=5))

       country      date  deaths  daily_deaths
0  Afghanistan  01/23/20       0           0.0
1  Afghanistan  01/24/20       0           0.0
2  Afghanistan  01/25/20       0           0.0
3  Afghanistan  01/26/20       0           0.0
4  Afghanistan  01/27/20       0           0.0


In [26]:
# Show the first five rows whose country label is exactly 'US'
print(covid_df.loc[covid_df['country'].eq('US')].head())

       country      date  deaths  daily_deaths
201996      US  01/23/20       0           0.0
201997      US  01/24/20       0           0.0
201998      US  01/25/20       0           0.0
201999      US  01/26/20       0           0.0
202000      US  01/27/20       0           0.0


In [27]:
# Convert the date strings into datetime objects.
# The raw values look like "01/23/20" (month/day/two-digit year), so the format
# is given explicitly: this keeps parsing month-first for every row and avoids
# relying on pandas' format inference.
covid_df['date'] = pd.to_datetime(covid_df['date'], format='%m/%d/%y')
print(covid_df.head())

  covid_df['date'] = pd.to_datetime(covid_df['date'])


       country       date  deaths  daily_deaths
0  Afghanistan 2020-01-23       0           0.0
1  Afghanistan 2020-01-24       0           0.0
2  Afghanistan 2020-01-25       0           0.0
3  Afghanistan 2020-01-26       0           0.0
4  Afghanistan 2020-01-27       0           0.0


In [28]:
# Relabel 'US' as 'United States' so country names match the mobility dataset.
# Only the 'country' column is touched (exact-value replacement), instead of
# running a Python lambda over every cell of every column.
covid_df = covid_df.assign(
    country=covid_df['country'].replace({'US': 'United States'})
)

# Verify the relabel by looking the rows up by country rather than by
# hard-coded row labels.
print(covid_df[covid_df['country'] == 'United States'].head())

              country       date  deaths  daily_deaths
201996  United States 2020-01-23       0           0.0
201997  United States 2020-01-24       0           0.0
201998  United States 2020-01-25       0           0.0
201999  United States 2020-01-26       0           0.0
202000  United States 2020-01-27       0           0.0


In [29]:
# t = 0 for a country is the first day its cumulative deaths reach this threshold.
pre_intervention_cum_deaths = 80

# Flag every row at or above the threshold as post-intervention
covid_df['post-intervention'] = covid_df['deaths'].ge(pre_intervention_cum_deaths)
covid_df.head()

Unnamed: 0,country,date,deaths,daily_deaths,post-intervention
0,Afghanistan,2020-01-23,0,0.0,False
1,Afghanistan,2020-01-24,0,0.0,False
2,Afghanistan,2020-01-25,0,0.0,False
3,Afghanistan,2020-01-26,0,0.0,False
4,Afghanistan,2020-01-27,0,0.0,False


In [30]:
# t=0 per country: the earliest date among that country's post-intervention rows
t0_date = (
    covid_df.loc[covid_df['post-intervention']]
    .groupby('country')['date']
    .min()
)
t0_date

Unnamed: 0_level_0,date
country,Unnamed: 1_level_1
Afghanistan,2020-05-06
Albania,2020-07-07
Algeria,2020-04-02
Andorra,2020-12-18
Angola,2020-08-11
...,...
Vietnam,2021-06-29
West Bank and Gaza,2020-07-29
Yemen,2020-05-31
Zambia,2020-07-17


In [31]:
# Attach each country's t=0 date to all of its rows.
# Both sides carry a 'date' column, so the merge suffixes them (_x = row date,
# _y = t=0 date); the rename restores 'date' and names the new column 't0_date'.
covid_df = (
    covid_df
    .merge(t0_date.reset_index(), how='left', on='country')
    .rename(columns={'date_x': 'date', 'date_y': 't0_date'})
)

In [32]:
# Time step of each row: whole days between its date and the country's t=0 date
# (negative before t=0, e.g. t = -5 is five days earlier)
covid_df['t'] = covid_df['date'].sub(covid_df['t0_date']).dt.days

In [34]:
# Keep only the window -20 <= t <= 20 (both ends included)
covid_df = covid_df[covid_df['t'].between(-20, 20)]
covid_df.head()

Unnamed: 0,country,date,deaths,daily_deaths,post-intervention,t0_date,t
84,Afghanistan,2020-04-16,29,4.0,False,2020-05-06,-20.0
85,Afghanistan,2020-04-17,30,1.0,False,2020-05-06,-19.0
86,Afghanistan,2020-04-18,30,0.0,False,2020-05-06,-18.0
87,Afghanistan,2020-04-19,30,0.0,False,2020-05-06,-17.0
88,Afghanistan,2020-04-20,33,3.0,False,2020-05-06,-16.0


Data Preprocessing - Mobility Data

We need to assign mobility reductions to each country's time steps.
Assume that mobility reductions affect future mortality with a lag of 20 days.

In [35]:
# Quick look at the first five rows of the mobility report
mobility_df.head(n=5)


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0


In [38]:
# Keep country-level rows only: those where both sub-region fields and the
# metro area are all missing
mobility_df = mobility_df[
    mobility_df[['sub_region_1', 'sub_region_2', 'metro_area']].isna().all(axis=1)
]
mobility_df.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0


In [41]:
# Use 'country' as the column name, matching the covid dataset's key
mobility_df = mobility_df.rename({'country_region': 'country'}, axis='columns')

In [42]:
# Ensure the mobility dates are datetime values so they can be joined on
mobility_df = mobility_df.assign(date=pd.to_datetime(mobility_df['date']))

In [48]:
# Join mobility scores onto the pre-intervention covid rows (t < 0 only).
# how='right' keeps every such covid row; mobility columns are NaN where no
# mobility record exists for that country/date.
pre_intervention_rows = covid_df[covid_df['t'].lt(0)]
mobility_df = mobility_df.merge(
    pre_intervention_rows, how='right', on=['country', 'date']
)

In [51]:
# Spot-check that the merged rows carry the expected time steps
mobility_df.loc[
    :, ['country', 'date', 't', 'retail_and_recreation_percent_change_from_baseline']
].head()


Unnamed: 0,country,date,t,retail_and_recreation_percent_change_from_baseline
0,Afghanistan,2020-04-16,-20.0,-46.0
1,Afghanistan,2020-04-17,-19.0,-41.0
2,Afghanistan,2020-04-18,-18.0,-43.0
3,Afghanistan,2020-04-19,-17.0,-43.0
4,Afghanistan,2020-04-20,-16.0,-42.0


In [54]:
# Bucket countries by their mean retail/recreation mobility change:
#   low:      mean > -10
#   moderate: -40 < mean <= -10
#   severe:   mean <= -40
# Countries whose mean is NaN are left out of every bucket.
country_mobility_scores = (
    mobility_df
    .groupby('country')['retail_and_recreation_percent_change_from_baseline']
    .mean()
)

scored = country_mobility_scores.dropna()
low_countries = scored[scored > -10].index.tolist()
moderate_countries = scored[(scored <= -10) & (scored > -40)].index.tolist()
severe_countries = scored[scored <= -40].index.tolist()

len(low_countries), len(moderate_countries), len(severe_countries)

(23, 50, 50)

Shape data