# Formatting CARB County-Level Emissions Data

### 0. Import Packages

In [1]:
import pandas as pd

**Functionalizing County Data Formatting:**

Function should accept a county name and perform the following:

- Load and format each csv into a df within a dictionary
- Merge all emissions into a single df
- Create grouping df by melting emission-specific series
- Add population and tons-per-day-per-capita (TpDpCap) columns to grouping df
- Save the county's all-emissions df and grouping df to CSV files

### 1. Variables for Data Formatting

In [2]:
# Emission-type codes. Each code is the filename suffix of a per-county CSV
# (`{county}_{code}.csv`) and becomes a column name in the merged table.
em_types = [
    'co',
    'nh3',
    'nox',
    'pm',
    'pm10',
    'pm2_5',
    'rog',
    'sox',
    'tog',
]

# Holds one DataFrame per emission-type code; filled by format_county_data.
em_df_dict = dict()

In [3]:
# Maps each county (underscore-separated name) to the header of the population
# column in that county's `../data/FRED/{county}_POP.csv` file.
# The values look like FRED series IDs -- TODO confirm.
popfile_col_dict = {
    'los_angeles': 'CALOSA7POP',
    'yuba': 'CAYUBA5POP',
    'kern': 'CAKERN0POP',
    'sacramento': 'CASACR5POP',
    'riverside': 'CARIVE5POP',
    'alpine': 'CAALPI3POP',
    'santa_clara': 'CASANT5POP',
    'marin': 'CAMARI5POP',
    'humboldt': 'CAHUMB0POP',
    'san_diego': 'CASAND5POP',
}

### 2. Data Formatting Function

In [4]:
def format_county_data(county): # use _ in place of space in county name
    """Format one county's CARB emissions CSVs and write two summary CSVs.

    Reads one CSV per entry in the module-level ``em_types`` list from
    ``../data/CARB Emissions Data/counties/{county}/``, melts each so that the
    year columns become rows, merges them into a single wide table (one column
    per emission type), then melts that table into a long "grouping" table to
    which population and tons-per-day-per-capita columns are added.

    Parameters
    ----------
    county : str
        County name with underscores in place of spaces (e.g. 'los_angeles').
        Must be a key of the module-level ``popfile_col_dict``.

    Returns
    -------
    None
        Output is written to ``{county}_all_ems.csv`` and
        ``{county}_grouping.csv`` inside the county's data directory.

    Raises
    ------
    KeyError
        If ``county`` is not a key of ``popfile_col_dict`` or an expected
        column is missing from an input file.

    Notes
    -----
    Overwrites the entries of the module-level ``em_df_dict`` with this
    county's per-emission DataFrames.
    """
    source_keys = ['src_type', 'category', 'subcategory']
    merge_keys = source_keys + ['year']
    county_dir = f'../data/CARB Emissions Data/counties/{county}'

    # LOAD AND FORMAT EACH EMISSION-SPECIFIC CSV INTO A DF WITHIN A DICTIONARY

    for em in em_types:
        # load data
        em_df = pd.read_csv(f'{county_dir}/{county}_{em}.csv')

        # lowercase columns; drop unnecessary columns
        em_df.columns = [col.lower() for col in em_df.columns]
        em_df = em_df.drop(columns=['area', 'pollutant', 'season', 'control_type', 'v1.03_rf3084'])

        # melt year columns into rows and cast year to integer
        em_df = pd.melt(em_df, id_vars=source_keys, var_name='year', value_name=em)
        em_df['year'] = em_df['year'].astype(int)

        # save df in dict
        em_df_dict[em] = em_df

    # MERGE ALL EMISSIONS INTO A SINGLE DF

    # seed the merge with the first emission type so the seed always matches
    # the em_types[1:] slice below (this is 'co' with the current em_types)
    all_em_df = em_df_dict[em_types[0]]

    # merge remaining df's in one at a time, matching on all keys.
    # NOTE(review): this is pandas' default inner join, so a source/year row
    # absent from any one emission file is dropped from the result -- confirm
    # that is intended given the "missing means no emissions" fill below.
    for em in em_types[1:]:
        all_em_df = pd.merge(all_em_df, em_df_dict[em], on=merge_keys)

    # CREATE GROUPING DF BY MELTING EMISSION-SPECIFIC SERIES

    # melt separate emissions columns into 'em_type' for grouping, plotting
    grouping_df = pd.melt(all_em_df, id_vars=merge_keys,
                          value_vars=em_types, var_name='em_type', value_name='tons_per_day')

    # ADD POPULATION, TPDPCAP COLUMNS TO GROUPING DF

    # read in population data
    pop_yearly = pd.read_csv(f'../data/FRED/{county}_POP.csv')

    # rename columns
    pop_yearly = pop_yearly.rename(columns={'DATE': 'year', popfile_col_dict[county]: 'population'})

    # convert each observation date to its calendar year. Reading the year from
    # the data (rather than assigning a fixed 2000-2021 range) keeps population
    # aligned with the right year even if the file covers a different span.
    # assumes the DATE column holds parseable date strings -- TODO confirm
    pop_yearly['year'] = pd.to_datetime(pop_yearly['year']).dt.year.astype(int)

    # convert population from thousands to ones
    pop_yearly['population'] = pop_yearly['population'] * 1000

    # merge population column into grouping df (inner join: years without a
    # population figure are dropped)
    grouping_df = pd.merge(grouping_df, pop_yearly, on='year')

    # add TpDpCap column (tons per day per capita)
    grouping_df['TpDpCap'] = grouping_df['tons_per_day'] / grouping_df['population']

    # FILL NA'S WITH 0 FOR BOTH DF'S: MISSING VALUES INDICATE NO EMISSIONS

    # NOTE(review): fillna(0) applies to every column, including the text key
    # columns -- confirm those never contain missing values.
    all_em_df = all_em_df.fillna(0)
    grouping_df = grouping_df.fillna(0)

    # SAVE COUNTY_ALL_EMS DF AND COUNTY_GROUPING DF TO CSV

    all_em_df.to_csv(f'{county_dir}/{county}_all_ems.csv', index=False)
    grouping_df.to_csv(f'{county_dir}/{county}_grouping.csv', index=False)

    return None

### 3. Format Data for Each County

In [5]:
# Format and save the data for every county that has a population-column
# mapping; iterating a dict yields its keys.
for county in popfile_col_dict:
    format_county_data(county)