## 03 Feature Engineering

#### Import relevant libraries

In [1]:
import os
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# %matplotlib inline

#### Load processed dataset

In [2]:
# Read processed_data.csv from ../data/processed (relative to the working directory).
input_dir = os.path.join('..', 'data', 'processed')
processed_csv_path = os.path.join(input_dir, 'processed_data.csv')
data_df = pd.read_csv(processed_csv_path)
data_df

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/processed_data.csv'

In [None]:
# Keep only the rows whose 'year' is 2010 or later (2010 itself is included).
is_after_2010 = data_df['year'].ge(2010)
data_after_2010_df = data_df.loc[is_after_2010]
data_after_2010_df


#### Create new features

In [None]:
# Show the column labels of the loaded dataset before selecting feature inputs.
data_df.columns

In [None]:
# Inputs for the target variable: the expenditure value together with the
# year and respondent id it belongs to.
target_input_columns = ['normalized_expenditure', 'year', 'respondent_id']
normalized_expenditure = data_df[target_input_columns]
normalized_expenditure

In [None]:
# Reshape to one row per respondent_id and one column per year, holding the
# normalized expenditure. The arguments are passed by keyword because
# DataFrame.pivot no longer accepts them positionally in pandas 2.x.
pivot_df = normalized_expenditure.pivot(
    index='respondent_id',
    columns='year',
    values='normalized_expenditure',
)
pivot_df

In [None]:
# Compound annual growth rate of normalized expenditure between the 2010 and
# 2020 columns, with t = 10 periods: (value_2020 / value_2010) ** (1 / t) - 1
growth_ratio = pivot_df[2020] / pivot_df[2010]
pivot_df['normalized_expenditure_cagr'] = growth_ratio ** 0.1 - 1
pivot_df

In [None]:
# The model target is the per-respondent CAGR column added to pivot_df.
target = pivot_df['normalized_expenditure_cagr']
target

0. Transmission Investments (as % of investments)
1. Total Investments / sales
2. Renewable Investments (as % of investments)
3. 
4. 

In [None]:
# Preview the first rows of the 2010-and-later subset.
data_after_2010_df.head()

#### Create DataFrames for Investments

In [None]:
# Grouping key, the individual investment value columns, and sales.
investment_column_names = [
    'respondent_id',
    'investment_value_distribution',
    'investment_value_hydro',
    'investment_value_nuclear',
    'investment_value_other',
    'investment_value_other_fossil',
    'investment_value_renewables',
    'investment_value_steam',
    'investment_value_transmission',
    'sales',
]

In [None]:
# Create a DataFrame of investment and sales totals per respondent.
#
# 'respondent_id' is only the grouping key, so it is dropped from the column
# list by rebinding the name rather than calling list.remove() in place. The
# grouping key is read from the frame itself, which keeps this cell safe to
# re-run; the in-place version failed on a second execution.
investment_column_names = [
    name for name in investment_column_names if name != 'respondent_id'
]
sum_investments_sales_df = (
    data_after_2010_df
    .groupby('respondent_id')[investment_column_names]
    .sum()
)
sum_investments_sales_df

In [None]:
# Total investments per respondent.
# investment_column_names also lists 'sales', so only the 'investment_value_*'
# columns are summed here; otherwise sales would be counted as an investment
# and inflate every share that divides by 'sum_investments'.
investment_value_columns = [
    name for name in investment_column_names
    if name.startswith('investment_value_')
]
sum_investments_sales_df['sum_investments'] = (
    sum_investments_sales_df[investment_value_columns].sum(axis=1)
)

# Generation investments = total minus the two network categories
# (transmission and distribution).
sum_investments_sales_df['sum_investments_generation'] = (
    sum_investments_sales_df['sum_investments']
    - sum_investments_sales_df['investment_value_transmission']
    - sum_investments_sales_df['investment_value_distribution']
)
sum_investments_sales_df

#### Create DataFrames for Expenditures

In [None]:
# Grouping key followed by the individual expenditure columns.
expenditure_column_names = [
    'respondent_id',
    'expenditure_gas',
    'expenditure_other fuel',
    'expenditure_adjustment',
    'expenditure_distribution',
    'expenditure_hydro',
    'expenditure_nuclear',
    'expenditure_other',
    'expenditure_other_fossil',
    'expenditure_purchased_power',
    'expenditure_renewables',
    'expenditure_steam',
    'expenditure_transmission',
]

In [None]:
# Total each expenditure column per respondent.
#
# list.remove() returns None, so assigning its result back turned
# expenditure_column_names into None. The name is rebound to a filtered copy
# instead, and the grouping key is read from the frame itself, so the cell can
# be re-run safely.
expenditure_column_names = [
    name for name in expenditure_column_names if name != 'respondent_id'
]
sum_expenditures_df = (
    data_after_2010_df
    .groupby('respondent_id')[expenditure_column_names]
    .sum()
)
sum_expenditures_df

In [None]:
# Row-wise total of the expenditure columns.
# respondent_id is the groupby index of sum_expenditures_df, not a column, so
# it is not part of this sum. A 'sum_expenditures' column left over from an
# earlier run of this cell is excluded so that re-running does not add the old
# total into the new one.
expenditure_value_columns = sum_expenditures_df.columns.drop(
    'sum_expenditures', errors='ignore'
)
sum_expenditures_df['sum_expenditures'] = (
    sum_expenditures_df[expenditure_value_columns].sum(axis=1)
)
sum_expenditures_df

#### Create DataFrames for Bills

In [None]:
# Grouping key followed by the individual bill columns.
bill_column_names = [
    'respondent_id',
    'bill_gas',
    'bill_other fuel',
    'bill_adjustment',
    'bill_distribution',
    'bill_hydro',
    'bill_nuclear',
    'bill_other',
    'bill_other_fossil',
    'bill_purchased_power',
    'bill_renewables',
    'bill_steam',
    'bill_transmission',
]

In [None]:
# Total each bill column per respondent, then add a row-wise grand total.
#
# list.remove() returns None, so assigning its result back turned
# bill_column_names into None. The name is rebound to a filtered copy instead,
# and the grouping key is read from the frame itself, so the cell can be
# re-run safely.
bill_column_names = [
    name for name in bill_column_names if name != 'respondent_id'
]
sum_bills_df = (
    data_after_2010_df
    .groupby('respondent_id')[bill_column_names]
    .sum()
)

# respondent_id is the groupby index, not a column, so it is not part of this
# sum. Summing the named bill columns keeps the total correct on a re-run.
sum_bills_df['sum_bills'] = sum_bills_df[bill_column_names].sum(axis=1)
sum_bills_df

## Set Variables

### x1: % of investments that go to transmission and distribution

In [None]:
# x1: transmission investments as a fraction of 'sum_investments'.
transmission_investment = sum_investments_sales_df['investment_value_transmission']
x1_transmission = transmission_investment.div(sum_investments_sales_df['sum_investments'])
x1_transmission

In [None]:
# x1b: distribution investments as a fraction of 'sum_investments'.
# The column is named 'investment_value_distribution' (singular "investment");
# the previous 'investments_value_distribution' key raised a KeyError.
x1_distribution = sum_investments_sales_df['investment_value_distribution']/sum_investments_sales_df['sum_investments']

### x2: % of expenditures spent on clean energy

In [None]:
# x2: fraction of expenditures spent on "clean" energy, i.e. the sum of the
# renewables, nuclear and hydro expenditure columns over 'sum_expenditures'.
renewables_expenditure = sum_expenditures_df['expenditure_renewables']
nuclear_expenditure = sum_expenditures_df['expenditure_nuclear']
hydro_expenditure = sum_expenditures_df['expenditure_hydro']
expenditure_clean = renewables_expenditure + nuclear_expenditure + hydro_expenditure

x2_portfolio = expenditure_clean.div(sum_expenditures_df['sum_expenditures'])
x2_portfolio

### x3a: total investments over sales

In [None]:
# x3a: 'sum_investments' divided by 'sales', per respondent.
total_investment = sum_investments_sales_df['sum_investments']
x3a_total_investment = total_investment.div(sum_investments_sales_df['sales'])
x3a_total_investment

### x3b: Net Clean-minus-Fossil Investment Share

In [None]:
# x3b: (clean investments - fossil investments) / 'sum_investments'.
#   fossil = steam + other fossil
#   clean  = renewables + nuclear + hydro
# Transmission and distribution are in neither group.
# Two earlier variants (renewables / fossil and renewables / total) are no
# longer computed.
steam_investment = sum_investments_sales_df['investment_value_steam']
other_fossil_investment = sum_investments_sales_df['investment_value_other_fossil']
investments_fossil = steam_investment + other_fossil_investment

renewables_investment = sum_investments_sales_df['investment_value_renewables']
nuclear_investment = sum_investments_sales_df['investment_value_nuclear']
hydro_investment = sum_investments_sales_df['investment_value_hydro']
investments_clean = renewables_investment + nuclear_investment + hydro_investment

net_clean_investment = investments_clean - investments_fossil
x3b_renewable_fossil_investment = net_clean_investment / sum_investments_sales_df['sum_investments']
x3b_renewable_fossil_investment

### x4: Whether or not the utility is public or private (TBD)

In [None]:
# Placeholder for the public/private indicator (still TBD): a constant 0 for now.
x4_public_private = 0

### x5: Residential/Total Customers

This is currently computed from the 'housing_units' and 'customers' columns, but could be done better with the data from customers_sales.csv.

In [None]:
# Sum housing units and customers per respondent over the 2010-and-later rows.
customer_names = ['respondent_id', 'housing_units', 'customers']
sum_customers_df = data_after_2010_df[customer_names].groupby(by=["respondent_id"]).sum()

# A bare last expression gives the rich DataFrame display used by the other
# cells, instead of the plain-text print().
sum_customers_df

In [None]:
# x5: summed housing units divided by summed customers, per respondent.
housing_units_total = sum_customers_df['housing_units']
x5_residential_ratio = housing_units_total.div(sum_customers_df['customers'])
x5_residential_ratio

### x6: Nuclear/Clean Bills

In [None]:
# x6: nuclear bills as a fraction of "clean" bills (renewables + nuclear + hydro).

bills_clean = sum_bills_df['bill_renewables'] + sum_bills_df['bill_nuclear'] + sum_bills_df['bill_hydro']

# NaN results (e.g. 0/0 when a respondent has no clean bills) are set to 0.
# fillna does this in one vectorized step, replacing the per-element loop.
x6_nuclear_renewable_ratio = (sum_bills_df['bill_nuclear'] / bills_clean).fillna(0)

x6_nuclear_renewable_ratio

### x7: Hydro/Clean Bills

In [None]:
# x7: hydro bills as a fraction of "clean" bills (bills_clean is the
# renewables + nuclear + hydro total computed for x6).

# NaN results (e.g. 0/0 when a respondent has no clean bills) are set to 0.
# fillna does this in one vectorized step, replacing the per-element loop.
x7_hydro_renewable_ratio = (sum_bills_df['bill_hydro'] / bills_clean).fillna(0)

x7_hydro_renewable_ratio

### x8: Regulatory Environment

In [None]:
# Rows for the year 2020 only (not referenced again in the cells shown here).
data_from_2020 = data_after_2010_df.loc[data_after_2010_df['year'] == 2020].copy()

# x8: mean rps_score per respondent across the 2010-and-later rows.
data_year_mean = data_after_2010_df.groupby('respondent_id')[['rps_score']].mean()
x8_regulatory = data_year_mean['rps_score']

In [None]:
# Assemble the modelling table: the target plus one column per feature.
# The per-respondent Series are aligned on their index; the scalar
# x4_public_private is repeated on every row.
# Note: the column labels are offset from the variable names (for example
# 'x0' holds x1_transmission), and x1_distribution is not included.
model_columns = {
    'target': target,
    'x0': x1_transmission,
    'x1': x2_portfolio,
    'x2': x3a_total_investment,
    'x3': x3b_renewable_fossil_investment,
    'x4': x4_public_private,
    'x5': x5_residential_ratio,
    'x6': x6_nuclear_renewable_ratio,
    'x7': x7_hydro_renewable_ratio,
    'x8': x8_regulatory,
}
model_data = pd.DataFrame(model_columns)
model_data

#### Check for correlation between features

In [None]:
# Pairwise correlation matrix of the target and feature columns,
# displayed with a 'coolwarm' background gradient.
corr = model_data.corr()
corr.style.background_gradient(cmap='coolwarm')

#### Filter for training dataset

#### Save model_data file

In [None]:
# Output folder for the model table: ../data/processed (relative to the working directory).
output_dir = os.path.join('..', 'data', 'processed')

In [None]:
# Write the feature table to model_data.csv inside output_dir.
# NOTE(review): index=False leaves the DataFrame index out of the file. If that
# index is respondent_id, the identifier is not saved; confirm this is intended.
model_data.to_csv(os.path.join(output_dir, 'model_data.csv'), index=False)