# Import pre-processed datasets

This notebook imports the preprocessed datasets which come with this repository.

This notebook is now deprecated in favour of [`data_access.py`](brsflufight_nerc2/data_access.py), which was created by exporting this notebook.

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import os
import numpy as np

In [2]:
# Directory holding the pre-processed CSV files (relative to the working directory).
data_dir = "data/processed"

# os.listdir returns entries in arbitrary order; sort so the listing is reproducible.
data_files = sorted(os.listdir(data_dir))

In [3]:
# Show which files were found in `data_dir`.
print(data_files)

['historical_GHG_Sectors_GCP.csv', 'historical_GHG_Sectors_PIK.csv', 'historical_GHG_Sectors_UNFCCC.csv', 'mobility_apple.csv', 'mobility_citymapper.csv', 'mobility_google.csv', 'uk_energy_daily.csv']


## Define import processes

The idea is that every data set is read into a DataFrame whose index includes a datetime level, plus any other index levels that are helpful.


In [4]:
def read_multi_indexed_csv(file_in, first_data_column, dir_in=data_dir):
    ''' Read a preprocessed CSV file, restoring its dates and its index.

    A column ``c`` is treated as an index level when the file also contains
    a column named ``c.1`` (the name pandas gives to a repeated header).
    Those ``.1`` duplicates are dropped and the remaining originals are used
    to build a MultiIndex; they also stay available as ordinary columns.
    Columns named "date", "Date" or "timestamp" are parsed to datetimes.

    Arguments:
        file_in (str): Name of file
        first_data_column (str|int) : index or name of first column with data.
            An int is a position counted after the ``.1`` duplicate columns
            have been dropped.
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The data of the file loaded in a dataframe
        and the column labels, from `first_data_column` to the last column,
        that contain the data of interest

    Raises
        TypeError: if `first_data_column` is neither a str nor an int
    '''
    if not isinstance(first_data_column, (str, int)):
        raise TypeError("`first_data_column` must be a column name or an int")

    file_path = os.path.join(dir_in, file_in)
    df = pd.read_csv(file_path)
    # Index levels are the columns that appear twice in the file.
    index_cols = [c for c in df.columns if c + '.1' in df.columns]

    # Parse dates before building the index so the index level is a datetime.
    for col in ["date", "Date", "timestamp"]:
        if col in df:
            df[col] = pd.to_datetime(df[col])

    if index_cols:
        df = df.drop(columns=[f"{c}.1" for c in index_cols])
        # Passing a MultiIndex object (not column labels) keeps the columns.
        df = df.set_index(pd.MultiIndex.from_frame(df[index_cols]))

    # Resolve a column name to its position in the cleaned-up frame.
    if isinstance(first_data_column, str):
        first_data_column = df.columns.get_loc(first_data_column)

    data_fields = df.columns[first_data_column:]
    return (df, data_fields)

### Historical greenhouse gas emissions


define `read_historical_GHG`

In [5]:
def read_historical_GHG(file_in, dir_in=data_dir):
    ''' Read in preprocessed historical GHG data.

    Thin wrapper around `read_multi_indexed_csv` for the historical
    greenhouse gas files, whose data columns start at position 6.

    Arguments:
        file_in (str): Name of file
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The data of the file loaded in a dataframe
        and the column labels that contain the data of interest
    '''
    # Forward the caller's directory; passing `data_dir` here would silently
    # ignore the `dir_in` argument.
    return read_multi_indexed_csv(file_in, 6, dir_in=dir_in)

### Read mobility data


Define one reader per mobility source: `read_mobility_google`, `read_mobility_apple` and `read_mobility_citymapper`.

In [6]:
def read_mobility_google(file_in, dir_in=data_dir):
    ''' Read in preprocessed Google mobility data.

    Arguments:
        file_in (str): Name of file
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The loaded dataframe and the column labels from
        'retail_and_recreation_percent_change_from_baseline' onwards
    '''
    # Forward the caller's directory instead of always using `data_dir`.
    return read_multi_indexed_csv(
        file_in,
        'retail_and_recreation_percent_change_from_baseline',
        dir_in=dir_in
    )

def read_mobility_apple(file_in, dir_in=data_dir):
    ''' Read in preprocessed Apple mobility data.

    Arguments:
        file_in (str): Name of file
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The loaded dataframe and the column labels from
        'driving' onwards
    '''
    # Forward the caller's directory instead of always using `data_dir`.
    return read_multi_indexed_csv(
        file_in,
        'driving',
        dir_in=dir_in
    )

def read_mobility_citymapper(file_in, dir_in=data_dir):
    ''' Read in preprocessed Citymapper mobility data, indexed by 'Date'.

    Arguments:
        file_in (str): Name of file
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The loaded dataframe, indexed by its 'Date'
        column (which is also kept as a regular column), and the column
        labels from position 1 onwards
    '''
    # Forward the caller's directory instead of always using `data_dir`.
    df, col = read_multi_indexed_csv(
        file_in, 1, dir_in=dir_in
    )
    # drop=False keeps 'Date' as a column as well as the index.
    return df.set_index('Date', drop=False), col

### Load UK energy


define `read_uk_energy`

In [7]:
def read_uk_energy(file_in, dir_in=data_dir):
    ''' Read in preprocessed UK energy data, indexed by 'timestamp'.

    Arguments:
        file_in (str): Name of file
        dir_in (str): Directory of file

    Returns
        (DataFrame, Index): The loaded dataframe, indexed by its 'timestamp'
        column (which is also kept as a regular column), and the column
        labels from position 1 onwards
    '''
    # Forward the caller's directory instead of always using `data_dir`.
    df, col = read_multi_indexed_csv(
        file_in, 1, dir_in=dir_in
    )
    # drop=False keeps 'timestamp' as a column as well as the index.
    return df.set_index('timestamp', drop=False), col

## Read in the data

Now we map each file to the appropriate function

In [8]:
# Map files to a function that will read it properly
# Dispatch table: file name -> function that knows how to read that file.
# The three historical GHG exports share one reader.
file_read_functions = dict.fromkeys(
    (
        'historical_GHG_Sectors_GCP.csv',
        'historical_GHG_Sectors_PIK.csv',
        'historical_GHG_Sectors_UNFCCC.csv',
    ),
    read_historical_GHG,
)
# Every other file has its own dedicated reader.
file_read_functions.update({
    'mobility_apple.csv': read_mobility_apple,
    'mobility_citymapper.csv': read_mobility_citymapper,
    'mobility_google.csv': read_mobility_google,
    'uk_energy_daily.csv': read_uk_energy,
})

And with a single loop all the different data sets are loaded.

In [9]:
# Re-list the directory so this cell does not rely on an earlier listing.
# Sorted because os.listdir returns entries in arbitrary order.
data_files = sorted(os.listdir(data_dir))

# Both dicts are keyed by the file name without its extension.
data_sets = {}     # name -> loaded DataFrame
data_columns = {}  # name -> labels of the columns holding the data of interest
for data_file in data_files:
    reader = file_read_functions.get(data_file)
    if reader is None:
        # Stray files in the directory should not abort the whole load with a
        # bare KeyError; report them and carry on.
        print(f"Skipping '{data_file}': no read function registered")
        continue
    data_name, _ = os.path.splitext(data_file)
    data_sets[data_name], data_columns[data_name] = reader(data_file)

### Summary of available data

In [16]:
# Print, for every loaded data set: its name, its data columns and a
# structural summary of the DataFrame.
for data in data_sets:
    print("=========================================")
    print(data)
    print("--------------------------------------")
    print(data_columns[data])
    print("--------------------------------------")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line.
    data_sets[data].info()
    print("--------------------------------------")
    print()

historical_GHG_Sectors_GCP
--------------------------------------
Index(['Bunkers', 'Cement', 'Coal', 'Gas', 'Gas flaring', 'Oil',
       'Total fossil fuels and cement'],
      dtype='object')
--------------------------------------
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11502 entries, ('Afghanistan', 'CO2', Timestamp('1960-01-01 00:00:00')) to ('Zimbabwe', 'CO2', Timestamp('2018-01-01 00:00:00'))
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Country                        11502 non-null  object        
 1   GH_Gas                         11502 non-null  object        
 2   date                           11502 non-null  datetime64[ns]
 3   Data source                    11502 non-null  object        
 4   Unit                           11502 non-null  object        
 5   max_year                       11502 non-null  int64         
 6   Bunkers       