# Combine streams and lockdown data
The data we have so far is split across two tables (streaming data and lockdown stringency), with the information for each country stored in columns. To facilitate further investigation, we want one table per country that combines the number of streams and the stringency factor per week.

In [1]:
# Import libraries
import os

import pandas as pd

In [2]:
# Load the two cleaned tables from the cleaning step's folder:
# `st` holds the streaming data, `ld` the lockdown stringency data.
f1 = os.path.join("..", "02.cleaning", "aggregated_country_streams.pkl")
f2 = os.path.join("..", "02.cleaning", "lockdown_clean.pkl")
st = pd.read_pickle(f1)
ld = pd.read_pickle(f2)

## Two functions to do the magic

In [3]:
def clean_index(df: pd.DataFrame, mode: str) -> pd.DataFrame:
    """Return a new frame with a flat header and the index as a "date" column.

    The input frame is left untouched; all changes are applied to the frame
    produced by ``reset_index()``.

    Args:
        df: Table whose index should become the first column.
            Presumably the index holds the dates -- verify against the data.
        mode: Name given to the last column (e.g. "streams" or "stringency").

    Returns:
        A new DataFrame whose first column is named "date", whose last column
        is named ``mode``, and whose other columns keep their (flattened)
        names.
    """
    # reset_index() without inplace returns a new object, so the caller's
    # frame is not modified as a side effect.
    cleaned = df.reset_index()

    if isinstance(cleaned.columns, pd.MultiIndex):
        # We need to flatten the multi-index column names: keep only the
        # second level of each column label.
        cleaned.columns = cleaned.columns.get_level_values(1)
    col_names = list(cleaned.columns)

    # The former index becomes "date"; depending on which df we are handling,
    # the last column gets the name passed in as `mode`.
    col_names[0] = "date"
    col_names[-1] = mode
    cleaned.columns = col_names

    return cleaned

def focus_country(df: pd.DataFrame, country: str, mode: str) -> pd.DataFrame:
    """Return a new frame with only the first column and one country's column.

    The input frame is not modified, so callers do not need to pass a copy.

    Args:
        df: Table whose first column is kept as-is (the date column in this
            notebook) and whose remaining columns are per-country values.
        country: Column to keep. The special value "EU" keeps the last
            column instead of looking up a column by name (presumably the
            EU-wide aggregate -- verify against the data).
        mode: Name given to the kept country column in the result.

    Returns:
        A new two-column DataFrame: the first column of ``df`` and the
        selected column, the latter renamed to ``mode``.

    Raises:
        ValueError: If ``country`` is not "EU" and is not among the columns
            after the first one.
    """
    col_names = list(df.columns)

    # We want to keep the date and specified country columns
    if country == "EU":
        # Keep the first and the last column, drop everything in between.
        to_drop = col_names[1:-1]
    else:
        to_drop = col_names[1:]
        try:
            to_drop.remove(country)
        except ValueError:
            # Same exception type as before, but with an actionable message.
            raise ValueError(
                f"country {country!r} is not a column of the given table"
            ) from None

    # drop() without inplace returns a new frame and leaves `df` intact.
    # For "EU" the rename is a no-op unless a column is literally named "EU".
    return df.drop(columns=to_drop).rename(columns={country: mode})

In [4]:
# Give both tables the same layout: the index becomes a "date" column and
# the last column is named after what the table measures.
st = clean_index(df=st, mode="streams")
ld = clean_index(df=ld, mode="stringency")

In [5]:
# The "date" column of ld lost its datetime dtype, which made the merge on
# "date" fail; convert it back to datetime before merging.
ld["date"] = ld["date"].pipe(pd.to_datetime)

## Merge and export

In [8]:
# Build the list of three-letter country codes to loop over shortly.
# HRV, MLT and SVN are filtered out (presumably no usable data for them --
# TODO confirm); "EU" is added last as an extra entry.
countries = [
    code
    for code in (
        "AUT", "BEL", "BGR", "HRV", "CYP", "CZE",
        "DNK", "EST", "FIN", "FRA", "DEU", "GRC",
        "HUN", "IRL", "ITA", "LVA", "LTU", "LUX",
        "MLT", "NLD", "POL", "PRT", "ROU", "SVK",
        "SVN", "ESP", "SWE", "GBR",
    )
    if code not in {"HRV", "MLT", "SVN"}
] + ["EU"]

In [9]:
# Merge and export tables for all the countries

# Make sure the output folder exists first: to_csv / to_pickle do not create
# missing directories and would fail on a fresh checkout.
os.makedirs("merged_data", exist_ok=True)

for country in countries:
    # First get rid of the countries we're not interested in. Copies are
    # passed so the full tables stay available for the next iteration.
    streams = focus_country(st.copy(), country, "streams")
    stringency = focus_country(ld.copy(), country, "stringency")

    # Then merge the two df's on "date"; the left join keeps every row of
    # the streams table even when no stringency value matches.
    merg = pd.merge(streams, stringency, how='left', on='date')

    # And export to csv and pickle, one pair of files per country
    path = os.path.join("merged_data", country)
    merg.to_csv(path + ".csv")
    merg.to_pickle(path + ".pkl")