# Introduction
This notebook normalizes the industry groups. In the preprocessed industry occupation CSV files in the [Industry](../../dataset_additional/Industry/) folder, each group's distribution has again been min-max scaled on its own before being dumped to its CSV file. As a result, the sum over all groups can exceed 1 for a single county, which may be confusing because these shares should sum to 1.00 for each county. This notebook therefore L1-normalizes each county's row so that its group values sum to 1, and dumps the combined file [Industry Groups.csv](../../dataset_raw/CovidMay17-2022/Industry%20Groups.csv).

# List files

In [1]:
import pandas as pd
import os
# Folder holding one min-max scaled CSV per industry group.
root_dir = '../../dataset_additional/Industry/CovidOct25-2022-Industry/'

# os.listdir returns entries in arbitrary order and also lists non-data
# entries (e.g. checkpoint folders or OS metadata files). Keep only the CSV
# files and sort them so the group/column order is the same on every run.
filenames = sorted(
    entry for entry in os.listdir(root_dir) if entry.endswith('.csv')
)
print(filenames)

['Accommodation and Food Services.csv', 'Administrative and Support and Waste Management and Remediation Services.csv', 'Agriculture, Forestry, Fishing and Hunting.csv', 'Arts, Entertainment, and Recreation.csv', 'Construction.csv', 'Educational Services.csv', 'Finance and Insurance.csv', 'Health Care and Social Assistance.csv', 'Information.csv', 'Management of Companies and Enterprises.csv', 'Manufacturing.csv', 'Mining, Quarrying, and Oil and Gas Extraction.csv', 'Other Services (except Public Administration).csv', 'Professional, Scientific, and Technical Services.csv', 'Retail Trade.csv', 'Transportation and Warehousing.csv', 'Utilities.csv', 'Wholesale Trade.csv']


# Combine

In [2]:
# One group label per file, taken from the file's base name.
# os.path.splitext strips only the final extension, so a dot inside an
# industry name cannot truncate the label.
industry_groups = [os.path.splitext(filename)[0] for filename in filenames]

results = {}
for group, filename in zip(industry_groups, filenames):
    group_df = pd.read_csv(
        os.path.join(root_dir, filename), usecols=['FIPS', '2020-02-28']
    )

    if 'FIPS' not in results:
        # The first file supplies the county identifiers of the combined frame.
        results['FIPS'] = group_df['FIPS']
    elif not group_df['FIPS'].equals(results['FIPS']):
        # The columns are combined row by row, so every file must list the
        # same counties in the same order. Fail loudly instead of silently
        # attaching values to the wrong county.
        raise ValueError(
            f'FIPS rows in {filename!r} do not match those of the first file'
        )

    # since this is static, all date column values are same for a county
    results[group] = group_df['2020-02-28']

df = pd.DataFrame(results)
df.describe().T[['mean', 'std', 'min', 'max']]

Unnamed: 0,mean,std,min,max
Accommodation and Food Services,0.118235,0.073311,0.0,1.0
FIPS,30383.649268,15162.508374,1001.0,56045.0
Administrative and Support and Waste Management and Remediation Services,0.039657,0.051361,0.0,1.0
"Agriculture, Forestry, Fishing and Hunting",0.012396,0.044188,0.0,1.0
"Arts, Entertainment, and Recreation",0.019038,0.037505,0.0,1.0
Construction,0.062849,0.05438,0.0,1.0
Educational Services,0.029785,0.064419,0.0,1.0
Finance and Insurance,0.07472,0.055008,0.0,1.0
Health Care and Social Assistance,0.181913,0.090542,0.0,1.0
Information,0.040185,0.049887,0.0,1.0


In [3]:
# Show the per-county values of every industry group (FIPS column left out).
df.loc[:, industry_groups]

Unnamed: 0,Accommodation and Food Services,Administrative and Support and Waste Management and Remediation Services,"Agriculture, Forestry, Fishing and Hunting","Arts, Entertainment, and Recreation",Construction,Educational Services,Finance and Insurance,Health Care and Social Assistance,Information,Management of Companies and Enterprises,Manufacturing,"Mining, Quarrying, and Oil and Gas Extraction",Other Services (except Public Administration),"Professional, Scientific, and Technical Services",Retail Trade,Transportation and Warehousing,Utilities,Wholesale Trade
0,0.221336,0.030956,0.014815,0.048067,0.050072,0.029919,0.065747,0.146453,0.029320,0.000000,0.090950,0.008157,0.104672,0.035930,0.218022,0.011929,0.021662,0.021587
1,0.213641,0.077692,0.000360,0.037370,0.064661,0.047668,0.052266,0.132252,0.041450,0.018380,0.074363,0.000578,0.069682,0.044561,0.210527,0.024441,0.006214,0.038051
2,0.093948,0.041523,0.029791,0.007874,0.012968,0.017894,0.041787,0.082997,0.019232,0.000000,0.414405,0.011424,0.038713,0.016542,0.109798,0.191282,0.018584,0.020461
3,0.071607,0.029286,0.018906,0.003859,0.237356,0.047711,0.031547,0.176014,0.020687,0.000000,0.162070,0.000000,0.042313,0.023671,0.119930,0.118679,0.005504,0.024787
4,0.112812,0.060848,0.002939,0.011245,0.080262,0.003501,0.063912,0.152497,0.022200,0.000000,0.170317,0.000000,0.120871,0.047977,0.173008,0.050019,0.000000,0.060642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,0.118881,0.032389,0.000000,0.005653,0.048951,0.007342,0.039403,0.096423,0.034617,0.020934,0.136115,0.199118,0.047883,0.046497,0.148870,0.086662,0.038829,0.042832
3138,0.265430,0.047726,0.002864,0.214399,0.122191,0.065163,0.047102,0.083907,0.054053,0.006509,0.013208,0.000000,0.057025,0.080901,0.110188,0.029653,0.000000,0.012913
3139,0.090381,0.007984,0.000000,0.017069,0.307409,0.000000,0.041082,0.187987,0.108497,0.022241,0.028798,0.027016,0.023327,0.056824,0.136705,0.073844,0.016818,0.021391
3140,0.092172,0.008503,0.000000,0.011204,0.086279,0.000000,0.084175,0.257576,0.097638,0.000000,0.143198,0.016233,0.065049,0.062527,0.140572,0.057257,0.017889,0.042088


In [4]:
from sklearn.preprocessing import Normalizer

# Rescale each row by its L1 norm (sum of absolute values) across the
# industry group columns.
l1_normalizer = Normalizer(norm='l1')
df[industry_groups] = l1_normalizer.fit_transform(df[industry_groups])

# Put the county identifier first, followed by the industry groups.
ordered_columns = ['FIPS'] + industry_groups
df = df[ordered_columns]

# Write the combined, normalized groups rounded to six decimals.
combined_csv_path = '../../dataset_raw/CovidMay17-2022/Industry Groups.csv'
df.round(6).to_csv(combined_csv_path, index=False)

In [12]:
# Per-industry summary statistics, ordered from the largest mean share to the
# smallest, with the industry name as a regular column.
summary_columns = ['mean', 'std', 'min', 'max']
group_summary = df[industry_groups].describe().T
stats = (
    group_summary[summary_columns]
    .sort_values(by='mean', ascending=False)
    .reset_index(names=['industry'])
)
stats

Unnamed: 0,industry,mean,std,min,max
0,Health Care and Social Assistance,0.156155,0.081821,0.0,1.0
1,Manufacturing,0.13976,0.121996,0.0,0.933656
2,Retail Trade,0.138668,0.063174,0.0,1.0
3,Accommodation and Food Services,0.10175,0.067439,0.0,1.0
4,Finance and Insurance,0.062769,0.042355,0.0,0.666667
5,Other Services (except Public Administration),0.057838,0.033256,0.0,0.689196
6,Construction,0.054091,0.049938,0.0,1.0
7,Transportation and Warehousing,0.052677,0.055295,0.0,0.746887
8,"Professional, Scientific, and Technical Services",0.039754,0.039548,0.0,0.766291
9,Wholesale Trade,0.03828,0.043333,0.0,1.0


In [13]:
# Keep the TOP_N industries with the highest mean share as their own columns
# and fold every remaining industry into a single 'Others' column.
TOP_N = 4
top_4_industries = stats['industry'].values[:TOP_N]
other_industries = stats['industry'].values[TOP_N:]

# Guard against re-running this cell: after the first run `df` no longer has
# the individual `other_industries` columns, so summing them again would raise
# a KeyError. `.assign` builds a new frame instead of writing a column into a
# frame that was itself produced by column selection.
if 'Others' not in df.columns:
    df = df.assign(Others=df[list(other_industries)].sum(axis=1))[
        ['FIPS'] + list(top_4_industries) + ['Others']
    ]
df.head(3)

In [20]:
# Write the reduced frame, rounded to six decimals and without the index
# column.
with_others_csv_path = '../../dataset_raw/CovidMay17-2022/Industry Groups with others.csv'
df_rounded = df.round(6)
df_rounded.to_csv(with_others_csv_path, index=False)