## 02 Data Processing

#### Import relevant libraries

In [2]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

#### Load raw data

In [3]:
input_dir = os.path.join('..', 'data', 'original')
exp_df = pd.read_csv(os.path.join(input_dir, 'expenditure_bills_burden.csv'))
hui_df = pd.read_csv(os.path.join(input_dir, 'housing_units_income.csv'))
aei_df = pd.read_csv(os.path.join(input_dir, 'assets_earnings_investments.csv'))

In [4]:
exp_df.head()

Unnamed: 0,parent_name,utility_name,respondent_id,year,percent_AMI,ownership,electricity_gas_other,technology,expenditure,bill,burden
0,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,Electricity,adjustment,9276270.0,8.394506,0.009773
1,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,Electricity,distribution,30175700.0,27.307328,0.031792
2,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,Electricity,hydro,4312818.0,3.90286,0.004544
3,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,Electricity,nuclear,19129600.0,17.311221,0.020154
4,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,Electricity,other,15780840.0,14.280782,0.016626


In [5]:
hui_df.head()

Unnamed: 0,parent_name,utility_name,respondent_id,year,percent_AMI,ownership,housing_units,income
0,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,owner,92086.710664,949151400.0
1,Southern Co.,Alabama Power Co.,2.0,2020,0-30%,renter,126159.321739,1238083000.0
2,Southern Co.,Alabama Power Co.,2.0,2020,100%+,owner,521425.299562,60490730000.0
3,Southern Co.,Alabama Power Co.,2.0,2020,100%+,renter,111421.757248,9494001000.0
4,Southern Co.,Alabama Power Co.,2.0,2020,30-60%,owner,108469.889403,2541532000.0


In [6]:
aei_df.head()

Unnamed: 0,parent_name,utility_name,respondent_id,year,asset,sub_asset,asset_value,earnings_value,investment_value
0,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2020,other,AROs,-15405378.0,-0.0,
1,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2020,other,electric_plant_held_for_future_use,1034099.0,0.0,
2,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2020,other,electric_plant_leased_to_others,155616036.0,0.0,
3,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2020,other,general_plant,427318.0,0.0,1846.0
4,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2020,other,intangible_plant,724784.0,0.0,


#### Formatting data

(a) Rename columns

In [7]:
exp_df = exp_df.rename({'percent_AMI': 'percent_ami'}, axis='columns')
hui_df = hui_df.rename({'percent_AMI': 'percent_ami'}, axis='columns')

(b) Data type formatting

In [8]:
exp_df.dtypes

parent_name               object
utility_name              object
respondent_id            float64
year                       int64
percent_ami               object
ownership                 object
electricity_gas_other     object
technology                object
expenditure              float64
bill                     float64
burden                   float64
dtype: object

In [9]:
hui_df.dtypes

parent_name       object
utility_name      object
respondent_id    float64
year               int64
percent_ami       object
ownership         object
housing_units    float64
income           float64
dtype: object

In [10]:
aei_df.dtypes

parent_name          object
utility_name         object
respondent_id       float64
year                  int64
asset                object
sub_asset            object
asset_value         float64
earnings_value      float64
investment_value    float64
dtype: object

#### Join datasets

Prepare expenditure_bills_burden.csv data frame

In [11]:
# make a copy
exp_prepared_df = exp_df.copy()

In [12]:
# groupby utility x year
exp_prepared_df = exp_prepared_df.groupby(['respondent_id', 'year', 'technology', 'electricity_gas_other'])\
    .agg({'bill': ['sum'], 'expenditure': ['sum']})\
    .reset_index()

Prepare housing_units_income.csv data frame

In [13]:
# make a copy
hui_prepared_df = hui_df.copy()

In [14]:
# drop repeated columns
hui_prepared_df = hui_prepared_df.drop(columns=[
    'parent_name', 
    'utility_name', 
    'ownership',
])

In [15]:
# groupby utility x year
hui_prepared_df = hui_prepared_df.groupby(['respondent_id', 'year'])\
    .agg({'housing_units': ['sum'], 'income': ['sum']})\
    .reset_index()

Prepare assets_earnings_income.csv data frame

In [16]:
# make a copy
aei_prepared_df = aei_df.copy()

In [17]:
# drop repeated columns
aei_prepared_df = aei_prepared_df.drop(columns=[
    'parent_name', 
    'utility_name', 
])

In [18]:
# groupby utility x year
aei_prepared_df = aei_prepared_df.groupby(['respondent_id', 'year', 'asset', 'sub_asset'])\
    .agg({'asset_value': ['sum'], 'earnings_value': ['sum'], 'investment_value': ['sum']})\
    .reset_index()

Join datasets into one dataframe

In [19]:
raw_df = pd.merge(exp_prepared_df, hui_prepared_df, how='inner', on=['respondent_id', 'year'])
raw_df = pd.merge(raw_df, aei_prepared_df, how='inner', on=['respondent_id', 'year'])
raw_df.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0_level_0,respondent_id,year,technology,electricity_gas_other,bill,expenditure,housing_units,income,asset,sub_asset,asset_value,earnings_value,investment_value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,sum,sum,sum,sum,Unnamed: 9_level_1,Unnamed: 10_level_1,sum,sum,sum
0,2.0,2005,Gas,Gas,375.031464,580113200.0,1177707.0,60027050000.0,distribution,distribution,2662308000.0,135398100.0,256240572.0
1,2.0,2005,Gas,Gas,375.031464,580113200.0,1177707.0,60027050000.0,hydro,hydro,630636600.0,32072540.0,6223361.0
2,2.0,2005,Gas,Gas,375.031464,580113200.0,1177707.0,60027050000.0,nuclear,nuclear,883597600.0,44937480.0,46613178.0
3,2.0,2005,Gas,Gas,375.031464,580113200.0,1177707.0,60027050000.0,other,AROs,-446267800.0,-22696020.0,0.0
4,2.0,2005,Gas,Gas,375.031464,580113200.0,1177707.0,60027050000.0,other,construction_work_in_progress,-403559.4,-20523.98,0.0


In [20]:
raw_df = raw_df.drop(columns=['electricity_gas_other'])

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [21]:
pivoted = raw_df.pivot(index=['respondent_id', 'year'], columns="technology", values=['expenditure'])

ValueError: Index contains duplicate entries, cannot reshape

#### Data cleaning

(a) Impute values for missing data

In [None]:
# df.fillna(0)
# 

(b) Remove probable outliers

(c) Compute consumer energy burden

In [None]:
# burden = expenditure / income
# normalized expenditure = expenditure / sales

#### Save processed_data file

In [97]:
output_dir = os.path.join('..', 'data', 'processed')

In [98]:
raw_df.to_csv(os.path.join(output_dir, 'processed_data.csv'))