# Introduction


# Import libraries

In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the project-local `Class` package (imported in the next cell) resolvable.
sys.path.append( '..' )

## Input
If running on Colab, modify the paths below accordingly.

In [2]:
import json
import os
from dataclasses import dataclass
from typing import Optional

import pandas as pd

from Class.DataMerger import *

@dataclass
class args:
    """Input and output locations used throughout this notebook.

    The notebook reads these as class attributes (e.g. ``args.dataPath``).
    Each attribute is annotated so that ``@dataclass`` registers it as a real
    field; without annotations the decorator generates no fields at all.
    """
    # folder containing the cleaned feature files
    dataPath: str = '../../dataset_raw/CovidMay17-2022'
    # folder passed to DataMerger as the support path
    supportPath: str = '../../dataset_raw/Support files'
    # JSON configuration file handed to DataMerger
    configPath: str = '../configurations/industry_groups.json'
    # path to a previously written Total.csv to reuse; None recomputes it
    cachePath: Optional[str] = None # '../2022_May_industry_groups/Total.csv'

    # choose this carefully: the generated CSV files are written here
    outputPath: str = '../2022_May_industry_groups/'

In [3]:
# Create the output directory up front so the CSV exports further down have
# somewhere to write.
if not os.path.exists(args.outputPath):
    print(f'Creating output directory {args.outputPath}')
    os.makedirs(args.outputPath, exist_ok=True)

# Load the JSON configuration. The `with` block closes the file on exit, so no
# explicit close() is needed. JSON is read as UTF-8 regardless of the
# platform's default encoding.
with open(args.configPath, encoding='utf-8') as inputFile:
    config = json.load(inputFile)
    print(f'Config file loaded from {args.configPath}')

Creating output directory ../2022_May_industry_groups/
Config file loaded from ../configurations/industry_groups.json


# Data merger

## Total features

In [4]:
# Instantiate the merger with the loaded config plus the data and support folders.
dataMerger = DataMerger(
    config,
    args.dataPath,
    args.supportPath,
)

In [5]:
# Build the total dataframe from scratch unless a cache path is configured.
# Point args.cachePath at an existing Total.csv to skip the merge, e.g. when
# only a different population or rurality cut is needed.
if not args.cachePath:
    total_df = dataMerger.get_all_features()

    output_path_total = os.path.join(args.outputPath, 'Total.csv')
    print(f'Writing total data to {output_path_total}\n')

    # round to 4 decimal places to reduce the file size
    total_df.round(4).to_csv(output_path_total, index=False)
else:
    # NOTE(review): read_csv does not restore datetime dtypes; confirm that
    # downstream steps accept date columns as strings when using the cache.
    total_df = pd.read_csv(args.cachePath)

Merging feature Industry Groups.csv with length 3142

Merged static features have 3142 counties
Removing outliers from dynamic inputs.
Reading Vaccination.csv
Outliers found 266, percent 0.016
Min date 2020-12-13 00:00:00, max date 2022-05-17 00:00:00
Filtering out dynamic features outside range 2020-02-29 00:00:00 and 2022-05-17 00:00:00.
Length 1679704.

Total dynamic feature shape (1679704, 3)
Removing outliers from target.
Reading Cases.csv
Outliers found 88970, percent 3.312
Setting negative daily Cases counts to zero.
Min date 2020-01-22 00:00:00, max date 2022-05-25 00:00:00
Will filter out target data outside range 2020-02-29 00:00:00 and 2022-05-17 00:00:00.
Length 2541878.

Total target feature shape (2541878, 3)
Merging all features
Total merged data shape (2584600, 8)
Missing percentage in total data
VaccinationFull                      35.01
Health Care and Social Assistance     1.65
Manufacturing                         1.65
Retail Trade                          1.65
Acco

## Population cut

In [6]:
# 'Population cut' can be defined under 'data' -> 'support' in the config.
# Each entry is the number of most-populated counties to keep.

if dataMerger.need_population_cut():
    population_cuts = dataMerger.population_cut(total_df)
    for index, population_cut in enumerate(population_cuts):
        # the configured cut size at the same position names the output file
        top_counties = dataMerger.data_config.population_cut[index]
        filename = f"Top_{top_counties}.csv"

        output_path_population_cut = os.path.join(args.outputPath, filename)

        print(f'Writing top {top_counties} populated counties data to {output_path_population_cut}.')
        # use the loop variable directly instead of re-indexing population_cuts;
        # values are rounded to 4 decimal places before writing
        population_cut.round(4).to_csv(output_path_population_cut, index=False)

Slicing based on top 100 counties by population
Slicing based on top 500 counties by population
Writing top 100 populated counties data to ../2022_May_industry_groups/Top_100.csv.
Writing top 500 populated counties data to ../2022_May_industry_groups/Top_500.csv.
