# Electricity Generation from api.electricitymap.org
---

SECTION 1 
collect data on carbon intensity, zones within SW , and power breakdown from the electricitymap api

In [1]:
# Dependencies and Setup

import pandas as pd
import requests


In [2]:

# zones for electrical utilities in US
zones = ["US-SW-PNM", "US-SW-EPE", "US-SW-WALC", "US-NW-PACE", "US-NW-PSCO", "US-CENT-SWPP", "US-TEX-ERCO", "US-MIDW-AECI","US-SW-AZPS","US-SW-AZPS",
         "US-NW-WACM", "US-SW-SRP", "US-SW-TEPC", "US-CENT-SPA", "US-CAL-IID", "US-CAL-CISO", "US-CAL-BANC","US-CAL-BANC", "US-CAL-TIDC", 
          "US-CAR-CPLE", "US-CAR-CPLW", "US-CAR-DUK", "US-CAR-SC", "US-CAR-SCEG", "US-CAR-YAD", "US-FLA-FMPP", "US-FLA-FPC" , "US-FLA-FPL",
          "US-FLA-GVL" , "US-FLA-HST", "US-FLA-JEA", "US-FLA-SEC", "US-FLA-TAL", "US-FLA-TEC", "US-MIDW-AECI" , "US-MIDW-LGEE", "US-MIDW-MISO",
          "US-NE-ISNE", "US-NW-BPAT", "US-NW-CHPD", "US-NW-DOPD", "US-NW-GCPD", "US-NW-GRID",  "US-NW-IPCO" , "US-NW-NWMT", "US-NW-NEVP", 
           "US-NW-PACW",  "US-NW-PGE", "US-NW-PSEI", "US-NW-SCL", "US-NW-TPWR", "US-NW-WAUW", "US-NY-NYIS", "US-SE-SEPA", "US-SE-SOCO" , 
           "US-TEN-TVA"]


In [3]:

# get carbon intensity history for the US utilities
urls = []
for index, url in enumerate(zones):
    url = f'https://api.electricitymap.org/v3/carbon-intensity/history?zone={zones[index]}'
    urls.append(url)

responses_dict = {}
for idx, url in enumerate(urls):
    response = requests.get(url)
    responses_dict[f"response_{idx+1}"] = response.json()

# Specify the file path where you want to save the JSON file
import json

file_path = "C_intensity_history_data.json"

# Write the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(responses_dict, json_file, indent=4)

print("Dictionary successfully exported to JSON file.")

df_carbon_intensity_history = pd.read_json(file_path)


Dictionary successfully exported to JSON file.


In [4]:
#request power breakdown
pburls = []
for index, url in enumerate(zones):
    pburl = f'https://api.electricitymap.org/v3/power-breakdown/history?zone={zones[index]}'
    pburls.append(pburl)

power_breakdown_responses_dict = {}
for idx, pburl in enumerate(pburls):
    response = requests.get(pburl)
    power_breakdown_responses_dict[f"response_{idx+1}"] = response.json()

# Specify the file path where you want to save the JSON file

file_path = "power_breakdown_history_data.json"

# Write the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(power_breakdown_responses_dict, json_file, indent=4)

df_power_breakdown_history = pd.read_json(file_path)

SECTION 2     
PowerBreakdown data transformation

In [5]:
# pull data from power breakdown json in dataframe
region = df_power_breakdown_history['response_1']['history'][0]['zone']
datetime = df_power_breakdown_history['response_1']['history'][0]['datetime']
nuclear = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['nuclear']
geothermal = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['geothermal']
biomass = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['biomass']
coal = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['coal']
wind = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['wind']
solar = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['solar']
hydro = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['hydro']
gas = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['gas']
oil = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['oil']
unknown = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['unknown']
hydro_discharge = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['hydro discharge']
battery_discharge = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionBreakdown"]['battery discharge']
renewable_percentage = df_power_breakdown_history['response_1']['history'][0]["renewablePercentage"]
total_consumption = df_power_breakdown_history['response_1']['history'][0]["powerConsumptionTotal"]
estimated = df_power_breakdown_history['response_1']['history'][0]["isEstimated"]

# create a dictionary with first values for this zone
us_pnm1 = {'region':region,'datetime':datetime,'nuclear':nuclear,'geothermal':geothermal,'biomass':biomass, 'coal':coal, 'wind':wind, 'solar':solar, 
           'hydro':hydro, 'gas':gas, 'oil':oil, 'unknown':unknown, 'hydro-discharge':hydro_discharge, 
           'battery_discharge':battery_discharge, 'renewable_percentage':renewable_percentage, 'total_consumption':total_consumption, 
           'estimated':estimated}

# Create a dataFrame with the first values
df_US = pd.DataFrame.from_dict(us_pnm1,orient='index')


In [6]:
df_US

Unnamed: 0,0
region,US-SW-PNM
datetime,2024-07-31T19:00:00.000Z
nuclear,0
geothermal,0
biomass,0
coal,140
wind,11
solar,1186
hydro,14
gas,576


In [7]:
# Data wrangling from the response to create a legible dataFrame
# outer for loop for regions/responses
for reg in range(len(zones)):
    #for each zone
    response = f"response_{reg+1}"
   
# pull data from json for each time in this file for this region and add to the existing dataframe
    for i in range(24):
        # 24 is for the 24 hours of data for each zone
        region = df_power_breakdown_history[f"{response}"]['history'][i]['zone']
        datetime = df_power_breakdown_history[f"{response}"]['history'][i]['datetime']
        nuclear = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['nuclear']
        geothermal = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['geothermal']
        biomass = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['biomass']
        coal = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['coal']
        wind = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['wind']
        solar = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['solar']
        hydro = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['hydro']
        gas = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['gas']
        oil = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['oil']
        unknown = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['unknown']
        hydro_discharge = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['hydro discharge']
        battery_discharge = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionBreakdown"]['battery discharge']
        renewable_percentage = df_power_breakdown_history[f"{response}"]['history'][i]["renewablePercentage"]
        total_consumption = df_power_breakdown_history[f"{response}"]['history'][i]["powerConsumptionTotal"]
        estimated = df_power_breakdown_history[f"{response}"]['history'][i]["isEstimated"]

        # this 24 is also for the 23 hours of data for each zone
        df_US[24*reg+i]= {'region':region, 'datetime':datetime,'nuclear':nuclear,'geothermal':geothermal,'biomass':biomass, 'coal':coal, 'wind':wind, 'solar':solar, 
           'hydro':hydro, 'gas':gas, 'oil':oil, 'unknown':unknown, 'hydro-discharge':hydro_discharge, 
           'battery_discharge':battery_discharge, 'renewable_percentage':renewable_percentage, 'total_consumption':total_consumption, 
           'estimated':estimated}
#set up the times as rows and measurements as columns
df_US_new = df_US.transpose()

# check data types
df_US_new.describe()

  df_US[24*reg+i]= {'region':region, 'datetime':datetime,'nuclear':nuclear,'geothermal':geothermal,'biomass':biomass, 'coal':coal, 'wind':wind, 'solar':solar,


Unnamed: 0,region,datetime,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,oil,unknown,hydro-discharge,battery_discharge,renewable_percentage,total_consumption,estimated
count,1344,1344,1337,1337,1337,1337,1337,1337,1337,1337,1337,1337,1337,1337,1341,1337,1344
unique,53,24,282,22,65,563,388,398,552,896,47,308,1,29,86,1113,2
top,US-CAL-BANC,2024-07-31T19:00:00.000Z,0,0,0,0,0,0,0,0,0,0,0,0,100,501,True
freq,48,56,823,1313,1198,605,749,682,307,198,1192,549,1337,1301,201,4,701


In [8]:
# fill NA values with zeroes for energy values
df_US_new = df_US_new.fillna({'nuclear': 0,'geothermal': 0,'biomass': 0, 'coal': 0, 'wind': 0, 'solar': 0, 
           'hydro': 0, 'gas': 0, 'oil': 0, 'unknown': 0, 'hydro-discharge':0, 'renewable_percentage':0,
           'battery_discharge':0, 'total_consumption':0})
df_US_new.describe()

Unnamed: 0,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,oil,unknown,hydro-discharge,battery_discharge,renewable_percentage,total_consumption
count,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0,1344.0
mean,1093.161458,13.297619,25.719494,1663.879464,916.250744,463.440476,593.238839,4228.567708,2.805804,115.352679,0.0,18.99628,32.53869,9134.728423
std,2384.521292,98.929177,150.201511,4427.820237,3514.061883,1920.686993,1221.423833,8464.781322,15.368793,258.150736,0.0,300.081869,33.874676,17778.320152
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,2.75,314.75,0.0,0.0,0.0,0.0,6.0,744.75
50%,0.0,0.0,0.0,34.0,0.0,0.0,74.5,1168.0,0.0,9.0,0.0,0.0,20.0,2608.0
75%,408.5,0.0,0.0,1198.0,239.0,165.5,633.5,2853.25,0.0,88.0,0.0,0.0,49.0,7908.5
max,12609.0,795.0,2416.0,35359.0,25406.0,20227.0,8734.0,79396.0,212.0,1730.0,0.0,6197.0,100.0,116532.0


In [9]:
# convert measured Energy values to integers in Giga Watts
#convert_dict = {'hydro': int}
convert_dict = {'nuclear': int, 'geothermal': int, 'biomass': int, 'coal': int, 'wind': int, 'solar': int, 'hydro': int, 'gas': int, 'oil': int, 
              'hydro-discharge': int, 'battery_discharge': int, 'renewable_percentage': int, 'total_consumption': int
               }
 # note - the unknown column only has values rarely - converting null values to integer doesn't work so this is left as an object
df_US_new = df_US_new.astype(convert_dict)

#check that data types are changed to int
df_US_new.dtypes

region                  object
datetime                object
nuclear                  int32
geothermal               int32
biomass                  int32
coal                     int32
wind                     int32
solar                    int32
hydro                    int32
gas                      int32
oil                      int32
unknown                  int64
hydro-discharge          int32
battery_discharge        int32
renewable_percentage     int32
total_consumption        int32
estimated               object
dtype: object

In [10]:
# Date Time work

# import datetime dependencies

from datetime import datetime

# set up lists to hold parsed data and DateTime as a datetime datetype
dates=[]
times = []
DateTime =[]

# convert date time strings
for i in range(len(df_US_new['datetime'])):

    # Parse the timestamp string to a datetime object
    dt_obj = datetime.strptime(df_US_new.iloc[i,1], '%Y-%m-%dT%H:%M:%S.%fZ')

    date = dt_obj.strftime('%Y-%m-%d')
    time = dt_obj.strftime('%H:%M:%S')

#add the new times and dates to lists

    dates.append(date)
    times.append(time)
    DateTime.append(dt_obj)

# add the times and dates to new columns in the data frame
df_US_new['UTC time'] = times
df_US_new['UTC date'] = dates
df_US_new['UTC DateTime'] = DateTime

In [11]:
#set the UTC DateTime as the index
df_US_new_reindex = df_US_new.set_index('UTC DateTime', inplace=True)

#drop the datetime column that contains a string
df_US_newer = df_US_new.drop('datetime', axis=1)
df_US_newer.head()

Unnamed: 0_level_0,region,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,oil,unknown,hydro-discharge,battery_discharge,renewable_percentage,total_consumption,estimated,UTC time,UTC date
UTC DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-07-31 19:00:00,US-SW-PNM,0,0,0,140,11,1186,14,576,0,0,0,0,63,1927,False,19:00:00,2024-07-31
2024-07-31 20:00:00,US-SW-PNM,0,0,0,229,52,1092,14,598,0,0,0,0,58,1985,False,20:00:00,2024-07-31
2024-07-31 21:00:00,US-SW-PNM,0,0,0,276,23,1101,14,607,0,0,0,0,56,2021,False,21:00:00,2024-07-31
2024-07-31 22:00:00,US-SW-PNM,0,0,0,317,31,1048,14,609,0,33,0,0,53,2052,False,22:00:00,2024-07-31
2024-07-31 23:00:00,US-SW-PNM,0,0,0,326,137,781,14,617,0,152,0,0,46,2027,False,23:00:00,2024-07-31


Section 3
Transform carbon intensity data

In [12]:
# pull data from C intensity json in dataframe
region = df_carbon_intensity_history['response_1']['history'][0]['zone']
datetime = df_carbon_intensity_history['response_1']['history'][0]['datetime']
carbon_Intensity = df_carbon_intensity_history['response_1']['history'][0]["carbonIntensity"]
estimated = df_carbon_intensity_history['response_1']['history'][0]["isEstimated"]

# create a dictionary with first values for this zone
us_pnm1C = {'region':region,'datetime':datetime,'Carbon_Intensity':carbon_Intensity, 'estimated':estimated}

# Create a dataFrame with the first values
df_US_C = pd.DataFrame.from_dict(us_pnm1C,orient='index')

In [13]:
# Data wrangling from the response to create a legible dataFrame for carbon intensity history

# outer for loop for regions/responses
for reg in range(len(zones)):
    response = f"response_{reg+1}"
    
# pull data from json for each time in this file for this region and add to the existing dataframe
    for i in range(24):
        # 24 is for the 24 hours of data for each zone
        region = df_carbon_intensity_history[f"{response}"]['history'][i]['zone']
        datetime = df_carbon_intensity_history[f"{response}"]['history'][i]['datetime']
        carbon_Intensity = df_carbon_intensity_history['response_1']['history'][i]["carbonIntensity"]
        estimated = df_carbon_intensity_history['response_1']['history'][i]["isEstimated"]
        
        # this 24 is also for the 24 hours of data in each zone
        df_US_C[24*reg+i]= {'region':region, 'datetime':datetime,'Carbon_Intensity':carbon_Intensity, 'estimated':estimated}

# make the datetime the rows and carbon_intensity a column
df_US_C_new = df_US_C.transpose()

# check data types
df_US_C_new.dtypes

  df_US_C[24*reg+i]= {'region':region, 'datetime':datetime,'Carbon_Intensity':carbon_Intensity, 'estimated':estimated}


region              object
datetime            object
Carbon_Intensity    object
estimated           object
dtype: object

In [14]:
# convert carbon intensity measurement to an integer in g CO2e/kWh
convert_dict_C= {'Carbon_Intensity': int}
 
df_US_C_new = df_US_C_new.astype(convert_dict_C)

#check that the datatype has been changed
df_US_C_new.dtypes

region              object
datetime            object
Carbon_Intensity     int32
estimated           object
dtype: object

In [15]:
# add the times and dates to new columns in the data frame   -    This assumes the data for carbon intensity is pulled at the same time as power breakdown
df_US_C_new['UTC time'] = times
df_US_C_new['UTC date'] = dates
df_US_C_new['UTC DateTime'] = DateTime

#set the UTC DateTime as the index
df_US_C_new_reindex = df_US_C_new.set_index('UTC DateTime', inplace=True)
#drop the datetime column that contains a string
df_US_C_newer = df_US_C_new.drop('datetime', axis=1)

Section 4
Merge dataframes

In [16]:
df_power_and_carbon= pd.merge(df_US_newer, df_US_C_newer,on=['UTC DateTime','region','UTC time','UTC date'])



df_power_and_carbon.rename(columns={'Carbon_Intensity':'Carbon_Intensity(gCO2eq/kWh)','total_consumption':'total_consumption(GW)', 'nuclear':'nuclear(GW)', 
                                    'geothermal':'geothermal(GW)', 'biomass':'biomass(GW)', 'coal':'coal(GW)', 'wind':'wind(GW)', 'solar':'solar(GW)', 
                                    'hydro':'hydro(GW)','gas':'gas(GW)', 'region_x': 'region', 'estimated_x': 'breakdown estimated?','estimated_y':'intensity estimated?'}, inplace=True)




df_power_and_carbon.head()

Unnamed: 0_level_0,region,nuclear(GW),geothermal(GW),biomass(GW),coal(GW),wind(GW),solar(GW),hydro(GW),gas(GW),oil,unknown,hydro-discharge,battery_discharge,renewable_percentage,total_consumption(GW),breakdown estimated?,UTC time,UTC date,Carbon_Intensity(gCO2eq/kWh),intensity estimated?
UTC DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-07-31 19:00:00,US-SW-PNM,0,0,0,140,11,1186,14,576,0,0,0,0,63,1927,False,19:00:00,2024-07-31,267,False
2024-07-31 20:00:00,US-SW-PNM,0,0,0,229,52,1092,14,598,0,0,0,0,58,1985,False,20:00:00,2024-07-31,320,False
2024-07-31 21:00:00,US-SW-PNM,0,0,0,276,23,1101,14,607,0,0,0,0,56,2021,False,21:00:00,2024-07-31,346,False
2024-07-31 22:00:00,US-SW-PNM,0,0,0,317,31,1048,14,609,0,33,0,0,53,2052,False,22:00:00,2024-07-31,377,False
2024-07-31 23:00:00,US-SW-PNM,0,0,0,326,137,781,14,617,0,152,0,0,46,2027,False,23:00:00,2024-07-31,427,False


In [17]:
# import previous cleaned file into a pandas dataframe
df_us_energy = pd.read_csv('data/runningUSenergy_data.csv')
df_us_energy_reindex=df_us_energy.set_index("UTC DateTime")
df_us_energy_reindex.describe()

  df_us_energy = pd.read_csv('data/runningUSenergy_data.csv')


Unnamed: 0,nuclear(GW),geothermal(GW),biomass(GW),coal(GW),wind(GW),solar(GW),hydro(GW),gas(GW),oil,unknown,...,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,total_consumption,carbon_Intensity
count,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31682.0,31682.0,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,1227.259784,14.326484,21.928337,1621.401467,593.079756,490.006259,616.478378,4134.150661,3.864276,105.082949,...,0.0,0.0,0.0,351.833333,196.125,423.166667,18.0,533.125,1552.104167,358.5
std,2567.920898,102.18498,105.184468,4267.121467,2190.356088,2011.638933,1357.329136,8075.178372,35.94219,506.352049,...,0.0,0.0,0.0,88.284437,336.617507,443.893668,0.0,127.405973,485.022888,148.050308
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,157.0,0.0,0.0,18.0,310.0,904.0,212.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,0.0,0.0,...,0.0,0.0,0.0,304.0,0.0,0.0,18.0,422.25,1191.5,212.0
50%,0.0,0.0,0.0,28.0,0.0,0.0,72.0,966.5,0.0,8.0,...,0.0,0.0,0.0,340.5,21.5,206.0,18.0,546.5,1534.5,358.5
75%,650.0,0.0,0.0,1281.0,170.0,139.0,675.75,3273.75,0.0,87.0,...,0.0,0.0,0.0,388.0,239.75,925.5,18.0,643.0,1761.25,505.0
max,14108.0,819.0,657.0,49357.0,22536.0,50487.0,65867.0,54713.0,2054.0,65561.0,...,0.0,0.0,0.0,502.0,1325.0,1055.0,18.0,718.0,2701.0,505.0


In [18]:
df_us_energy_reindex.drop_duplicates(inplace=True)
df_us_energy_reindex.describe()

Unnamed: 0,nuclear(GW),geothermal(GW),biomass(GW),coal(GW),wind(GW),solar(GW),hydro(GW),gas(GW),oil,unknown,...,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,total_consumption,carbon_Intensity
count,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31634.0,31682.0,31682.0,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,1227.259784,14.326484,21.928337,1621.401467,593.079756,490.006259,616.478378,4134.150661,3.864276,105.082949,...,0.0,0.0,0.0,351.833333,196.125,423.166667,18.0,533.125,1552.104167,358.5
std,2567.920898,102.18498,105.184468,4267.121467,2190.356088,2011.638933,1357.329136,8075.178372,35.94219,506.352049,...,0.0,0.0,0.0,88.284437,336.617507,443.893668,0.0,127.405973,485.022888,148.050308
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,157.0,0.0,0.0,18.0,310.0,904.0,212.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,0.0,0.0,...,0.0,0.0,0.0,304.0,0.0,0.0,18.0,422.25,1191.5,212.0
50%,0.0,0.0,0.0,28.0,0.0,0.0,72.0,966.5,0.0,8.0,...,0.0,0.0,0.0,340.5,21.5,206.0,18.0,546.5,1534.5,358.5
75%,650.0,0.0,0.0,1281.0,170.0,139.0,675.75,3273.75,0.0,87.0,...,0.0,0.0,0.0,388.0,239.75,925.5,18.0,643.0,1761.25,505.0
max,14108.0,819.0,657.0,49357.0,22536.0,50487.0,65867.0,54713.0,2054.0,65561.0,...,0.0,0.0,0.0,502.0,1325.0,1055.0,18.0,718.0,2701.0,505.0


In [19]:
# concatentate current data with existing file
df_both = pd.concat([df_us_energy_reindex,df_power_and_carbon])

# drop duplicate rows
df_both.drop_duplicates(inplace=True)
df_both.describe()

Unnamed: 0,nuclear(GW),geothermal(GW),biomass(GW),coal(GW),wind(GW),solar(GW),hydro(GW),gas(GW),oil,unknown,...,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,total_consumption,carbon_Intensity
count,32906.0,32906.0,32906.0,32906.0,32906.0,32906.0,32906.0,32906.0,32954.0,32954.0,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,1224.447639,14.315809,22.131161,1624.615997,606.925059,489.677688,616.033824,4143.578739,3.82955,105.638435,...,0.0,0.0,0.0,351.833333,196.125,423.166667,18.0,533.125,1552.104167,358.5
std,2562.97791,102.162468,107.500563,4278.10961,2262.720536,2010.0876,1353.242105,8099.266861,35.378151,499.185596,...,0.0,0.0,0.0,88.284437,336.617507,443.893668,0.0,127.405973,485.022888,148.050308
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,157.0,0.0,0.0,18.0,310.0,904.0,212.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,248.0,0.0,0.0,...,0.0,0.0,0.0,304.0,0.0,0.0,18.0,422.25,1191.5,212.0
50%,0.0,0.0,0.0,28.0,0.0,0.0,73.0,969.0,0.0,8.0,...,0.0,0.0,0.0,340.5,21.5,206.0,18.0,546.5,1534.5,358.5
75%,650.0,0.0,0.0,1275.0,171.0,139.0,673.0,3266.5,0.0,87.0,...,0.0,0.0,0.0,388.0,239.75,925.5,18.0,643.0,1761.25,505.0
max,14108.0,819.0,2416.0,49357.0,25406.0,50487.0,65867.0,79396.0,2054.0,65561.0,...,0.0,0.0,0.0,502.0,1325.0,1055.0,18.0,718.0,2701.0,505.0


In [20]:
df_both.to_csv(f'data/runningUSenergy_data.csv')

In [21]:
df_both_cleaned = df_both.loc[df_both['breakdown estimated?']==False,:]
df_both_cleaned.describe()

Unnamed: 0,nuclear(GW),geothermal(GW),biomass(GW),coal(GW),wind(GW),solar(GW),hydro(GW),gas(GW),oil,unknown,...,nuclear,geothermal,biomass,coal,wind,solar,hydro,gas,total_consumption,carbon_Intensity
count,9363.0,9363.0,9363.0,9363.0,9363.0,9363.0,9363.0,9363.0,9363.0,9363.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1451.079996,48.364306,73.045178,1881.11994,1112.283563,725.633558,1054.183809,5558.829328,8.807754,124.953007,...,,,,,,,,,,
std,2304.504434,183.399685,184.272707,4594.451263,3122.022771,2776.466029,1738.720858,8208.563496,59.817079,741.864913,...,,,,,,,,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
25%,0.0,0.0,0.0,0.0,0.0,0.0,15.0,276.5,0.0,0.0,...,,,,,,,,,,
50%,3.0,0.0,0.0,27.0,32.0,1.0,220.0,1544.0,0.0,12.0,...,,,,,,,,,,
75%,2255.0,0.0,7.0,992.0,433.0,188.0,1342.0,9041.5,0.0,116.0,...,,,,,,,,,,
max,13355.0,819.0,2416.0,48997.0,25406.0,50487.0,65867.0,79396.0,2054.0,65561.0,...,,,,,,,,,,


In [22]:
df_both_cleaned.to_csv(f'data/runningUSenergy_data_filtered.csv')