# Cleaning and Preprocessing Variables

This notebook describes how variables were cleaned for data processing. The output files of this script will go in the clean_data/ folder.

In [1]:
# Importing libraries
import os
import requests
import pandas as pd
import numpy as np
import re

# Cleaning Non-Energy Variables

## Climate Vulnerability

In [None]:
# Read climate vulnerability with correct column types
climate_vulnerability = pd.read_csv("data/climate_vulnerability/NRI_Table_CensusTracts/NRI_Table_CensusTracts.csv", dtype = {"TRACTFIPS": str, "RISK_SCORE": float, "RISK_RATING": str})

# Subset and Rename column names
climate_vulnerability = climate_vulnerability[["TRACTFIPS", "RISK_SCORE", "RISK_RATNG"]]
climate_vulnerability.columns = ["GEOID", "climate_score", "climate_rating"]

# Format GEOID column to have 11 digits, pad with 0s as necessary
climate_vulnerability["GEOID"] = np.where(
   climate_vulnerability["GEOID"].str.len() == 10,
   "0" + climate_vulnerability["GEOID"],
   climate_vulnerability["GEOID"]
)

# Write results out
climate_vulnerability.to_csv("clean_data/non_energy/climate_vulnerability.csv", index = False)
climate_vulnerability.head()

## Social Vulnerability

In [None]:
# Read in social vulnerability
social_vulnerability = pd.read_csv("data/social_vulnerability/SVI2018_US.csv", dtype = {"FIPS": str, "F_TOTAL": float})

# Subset and Rename columns
social_vulnerability = social_vulnerability[["FIPS", "F_TOTAL"]]
social_vulnerability.columns = ["GEOID", "svi"]

# Format GEOID column to have 11 digits, pad with 0s as necessary
social_vulnerability["GEOID"] = np.where(
   social_vulnerability["GEOID"].str.len() == 10,
   "0" + social_vulnerability["GEOID"],
   social_vulnerability["GEOID"]
)

social_vulnerability.to_csv("clean_data/non_energy/social_vulnerability.csv", index = False)
social_vulnerability.head()

# Cleaning Energy Variables

In [3]:
# File and Directory setup
energy_dir = "data/energy"

e_burden_dir = "data/energy/doe_lead"
e_burden_outdir = "clean_data/energy/doe_lead"

aceee_outdir = "clean_data/energy"

## Energy Burden

Subsequent code blocks take the energy burden files downloaded from the DOE LEAD tool remove unnecesarily rows and compile them together into one CSV file.

In [None]:
e_burden_files = os.listdir(e_burden_dir)

# Remove any unwanted files
e_burden_files = [filename for filename in e_burden_files if re.match(r"(?:[A-Z])(?:[A-Z])\.csv", filename)]

In [None]:
# Re-write Energy Burden files
for e_burden_filename in e_burden_files:
   # Read in current file
   curr_fp = os.path.join(e_burden_dir, e_burden_filename)
   f = open(curr_fp, "r")
   lines = f.readlines()
   f.close()

   # Remove rows of DOE LEAD Energy burden files (metadata, description)
   lines = lines[8:]

   # Write file back without description rows
   out_fp = os.path.join(e_burden_outdir, e_burden_filename)
   f_out = open(out_fp, "w")
   f_out.writelines(lines)
   f_out.close()


In [7]:

# Compile all Energy Burden files together

# Energy Burden dataframe (collector)
e_burden = pd.DataFrame(data = None, columns = ["GEOID", "energy_burden"])

for e_burden_filename in e_burden_files:
   curr_fp = os.path.join(e_burden_outdir, e_burden_filename)
   curr_df = pd.read_csv(curr_fp, dtype={"Geography ID": str})

   # Format current dataframe
   curr_df = curr_df[["Geography ID", "Avg. Energy Burden (% income)"]]
   curr_df.columns = ["GEOID", "energy_burden"]
   curr_df["GEOID"] = np.where(
      curr_df["GEOID"].str.len() == 11,
      '0' + curr_df["GEOID"],
      curr_df["GEOID"]
   )

   # Collect current 
   e_burden = pd.concat([e_burden, curr_df])

# Writing Energy burden dataframe
e_burden.to_csv(os.path.join(e_burden_outdir, "energy_burden.csv", index = False))



## ACEEE State Scorecard Variables

**Note:** ACEEE State Scorecard Variables are reported at a state level, and so each the state level score was broadcasted and assigned to be the score for each census tract of their respective state.

In [None]:
aceee = pd.read_csv(os.path.join(energy_dir, "ACEEE State Scorecard Data, 2021.csv"))

# Capturing correct columns
aceee = aceee[
   ["STATE",
   "GAS \nSAVINGS - % \nof retail residential and commercial sales",
   "GAS SPENDING - $ \n/ residential customer",
   "ELECTRIC SAVINGS - % \nof retail sales",
   "ELECTRIC SPENDING - % \nrevenue"]]
aceee.columns = ["STATE", "gas_savings_percent", "gas_spending_usd", "electric_savings_percent", "electric_spending_percent"]

# Getting rid of excess empty rows
aceee = aceee.iloc[2:53]

# Add their state fips code
state_fips = pd.read_csv("data/support/state_fips.csv", dtype = {"STATE": str})
state_fips = state_fips[["STATE_NAME", "STATE"]]
state_fips.columns = ["STATE", "STATEFP"]
aceee = pd.merge(aceee, state_fips, how = "left", on = "STATE")

# Remove $ and % signs
aceee["gas_savings_percent"] = aceee["gas_savings_percent"].str[:-1]
aceee["gas_spending_usd"] = aceee["gas_spending_usd"].str[1:]
aceee["electric_savings_percent"] = aceee["electric_savings_percent"].str[:-1]
aceee["electric_spending_percent"] = aceee["electric_spending_percent"].str[:-1]

# Make sure all columns are correct types
aceee = aceee.astype({
   "STATE": str,
   "STATEFP": str,
   "gas_savings_percent": float,
   "gas_spending_usd": float,
   "electric_savings_percent": float,
   "electric_spending_percent": float
})

# Write ACEEE data
aceee.to_csv(os.path.join(aceee_outdir, "aceee.csv"), index = False)
aceee.head()

## Median income of solar installers by tract

In [None]:
solar_installers = pd.read_csv("data/energy/Median income of solar installers by tract.csv", dtype = {"CensusTract": str, "n": int, "median_income": float})
solar_installers.columns = ["GEOID", "num_solar_installers", "median_income_solar"]

# Make sure all tract numbers are 11 digits long pad with zeros accordingly
solar_installers["GEOID"] = np.where(
   solar_installers["GEOID"].str.len() == 10,
   "0" + solar_installers["GEOID"],
   solar_installers["GEOID"]
)

# Write file out
solar_installers.to_csv("clean_data/energy/median_income_solar_installer.csv", index = False)
solar_installers.head()


## Residential Rates as a % of Commercial and Industral rates

In [None]:
res_percent = pd.read_csv("data/energy/Residential cost per kWh as % of commercial + industrial rates.csv")
# Keep relevant columns and rows
res_percent = res_percent.iloc[3:54, [0, 8]]
# Rename columns
res_percent.columns = ["STATE", "res_rate_percent_commercial_industrial"]
# Make sure column types are correct
res_percent = res_percent.astype({
   "STATE": str,
   "res_rate_percent_commercial_industrial": str
})

# Remove % sign from residential rates and cast as float
res_percent["res_rate_percent_commercial_industrial"] = res_percent["res_rate_percent_commercial_industrial"].str[:-1]
res_percent = res_percent.astype({"res_rate_percent_commercial_industrial": float})

# Add their state fips code
state_fips = pd.read_csv("data/support/state_fips.csv", dtype = {"STATE": str})
state_fips = state_fips[["STATE_NAME", "STATE"]]
state_fips.columns = ["STATE", "STATEFP"]
res_percent = pd.merge(res_percent, state_fips, how = "left", on = "STATE")

# Write results out
res_percent.to_csv("clean_data/energy/res_rate_percent_commercial_industrial.csv", index = False)
res_percent.head()

## Shutoff Protection Scores

In [None]:
shutoff_scores = pd.read_csv("data/energy/Shutoff protection scores.csv")
# Keep relevant columns and rename them
shutoff_scores = shutoff_scores.iloc[:, 0:2]
shutoff_scores.columns = ["STATE", "shutoff_score"]

# Make sure columns are correct type
shutoff_scores = shutoff_scores.astype({
   "STATE": str,
   "shutoff_score": float
})

# Add their state fips code
state_fips = pd.read_csv("data/support/state_fips.csv", dtype = {"STATE": str})
state_fips = state_fips[["STATE_NAME", "STATE"]]
state_fips.columns = ["STATE", "STATEFP"]
shutoff_scores = pd.merge(shutoff_scores, state_fips, how = "left", on = "STATE")

# Write Shutoff scores out
shutoff_scores.to_csv("clean_data/energy/shutoff_scores.csv", index = False)
shutoff_scores.head()

# Combine all variables into one output file

In [102]:
# Use census tract metadata file as base dataframe for joins
eep = pd.read_csv("data/geography/tract_metadata.csv", dtype = {
   "STATEFP": str,
   "COUNTYFP": str,
   "TRACTCE": str,
   "GEOID": str
})

# Tables containing variables that will be joined
energy_burden = pd.read_csv("clean_data/energy/energy_burden.csv", dtype = {
   "GEOID": str,
   "energy_burden": float
})
aceee = pd.read_csv("clean_data/energy/aceee.csv", dtype = {
   "STATE": str,
   "gas_savings_percent": float,
   "gas_spending_usd": float,
   "electric_savings_percent": float,
   "electric_spending_percent": float,
   "STATEFP": str
})
res_percent = pd.read_csv("clean_data/energy/res_rate_percent_commercial_industrial.csv", dtype = {
   "STATE": str,
   "res_rate_percent_commercial_industrial": float,
   "STATEFP": str
})
res_percent = res_percent[["STATEFP", "res_rate_percent_commercial_industrial"]]
solar_installers = pd.read_csv("clean_data/energy/median_income_solar_installer.csv", dtype = {
   "GEOID": str,
   "num_solar_installers": int,
   "median_income_solar": float
})
shutoff_scores = pd.read_csv("clean_data/energy/shutoff_scores.csv", dtype = {
   "STATE": str,
   "shutoff_score": float,
   "STATEFP": str
})
shutoff_scores = shutoff_scores[["STATEFP", "shutoff_score"]]
climate_vulnerability = pd.read_csv("clean_data/non_energy/climate_vulnerability.csv", dtype = {
   "GEOID": str,
   "climate_score": float,
   "climate_rating": str
})
social_vulnerability = pd.read_csv("clean_data/non_energy/social_vulnerability.csv", dtype = {
   "GEOID": str,
   "svi": float
})
acs = pd.read_csv("data/acs/acs_data.csv", dtype = {
   "NAME": str,
   "total_pop": int,
   "state": str,
   "county": str,
   "tract": str,
   "GEOID": str,
   "white_no_hispanic": float,
   "year_built": int,
   "internet_access": float,
   "total_households": int,
   "labor_force_rate": float,
   "pop_16_older": float,
   "less_than_hs": float,
   "insured": float,
   "senior_living_alone": float,
   "median_income": float,
   "occupied_housing_units": float,
   "owner_occupied": float,
   "renter_occupied": float,
   "disabled": float,
   "pop_16_plus": float
})
acs = acs[[
   "GEOID",
   "total_pop",
   "white_no_hispanic",
   "year_built",
   "internet_access",
   "total_households",
   "labor_force_rate",
   "pop_16_older",
   "less_than_hs",
   "insured",
   "senior_living_alone",
   "median_income",
   "occupied_housing_units",
   "owner_occupied",
   "renter_occupied",
   "disabled",
   "pop_16_plus"
]]

# Join variables to the data
eep = pd.merge(eep, energy_burden, how = "left", on = "GEOID").fillna(-999)
eep = pd.merge(eep, aceee, how = "left", on = "STATEFP").fillna(-999)
eep = pd.merge(eep, res_percent, how = "left", on = "STATEFP").fillna(-999)
eep = pd.merge(eep, solar_installers, how = "left", on = "GEOID").fillna(-999)
eep = pd.merge(eep, shutoff_scores, how = "left", on = "STATEFP").fillna(-999)
eep = pd.merge(eep, climate_vulnerability, how = "left", on = "GEOID").fillna(-999)
eep = pd.merge(eep, social_vulnerability, how = "left", on = "GEOID").fillna(-999)
eep = pd.merge(eep, acs, how = "left", on = "GEOID").fillna(-999)

# Write overall EEP file out
eep.to_csv("clean_data/eep.csv", index = False)
eep.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,energy_burden,STATE,gas_savings_percent,gas_spending_usd,electric_savings_percent,electric_spending_percent,...,pop_16_older,less_than_hs,insured,senior_living_alone,median_income,occupied_housing_units,owner_occupied,renter_occupied,disabled,pop_16_plus
0,51,800,75703,51800075703,-999.0,Virginia,0.0,0.0,0.12,0.35,...,1153.0,70.0,1496.0,12.0,37762.0,540.0,464.0,76.0,9766.0,38939.0
1,51,800,75803,51800075803,-999.0,Virginia,0.0,0.0,0.12,0.35,...,1100.0,64.0,1325.0,10.0,43819.0,558.0,486.0,72.0,-666666666.0,46290.0
2,51,800,75701,51800075701,-999.0,Virginia,0.0,0.0,0.12,0.35,...,1434.0,48.0,1735.0,12.0,35379.0,840.0,459.0,381.0,25000.0,33233.0
3,51,800,75802,51800075802,-999.0,Virginia,0.0,0.0,0.12,0.35,...,1777.0,59.0,1926.0,19.0,29855.0,869.0,751.0,118.0,20000.0,38633.0
4,51,800,75404,51800075404,-999.0,Virginia,0.0,0.0,0.12,0.35,...,953.0,10.0,913.0,9.4,58824.0,479.0,432.0,47.0,47500.0,55185.0
