In [91]:
import warnings

import pandas as pd

## Creation of the EIA dataframe

In [98]:
path = "../../data/_raw/eia/eia_api_intl_2020_2021.csv"

# Columns kept from the raw extract, in output order.
# "data_source" is not read from the file: it is added in the chain below.
kept_columns = [
    "data_source", "period", "productName", "activityName",
    "countryRegionId", "countryRegionName", "unitName", "value", "unit",
]

# A row is identified by (period, product, activity, country).
# Duplicates that were found on this key in the raw file:
#   - identical rows that differ only by idProduct
#   - 1 duplicate for USA, 2020, Coal production with values that are almost exactly the same
#   - the same data repeated in different units
# drop_duplicates keeps the first row of each key, so the unit that survives is
# whichever one comes first in the file.
dedup_key = ["period", "productName", "activityName", "countryRegionId"]

# Raw API column names -> snake_case names used in the rest of the notebook.
snake_case_names = {
    'period': 'year',
    'productName': 'product_name',
    'activityName': 'activity_name',
    'countryRegionId': 'country_iso3',
    'countryRegionName': 'country_name',
    'unitName': 'unit_name'
}

df_eia = (
    pd.read_csv(path)
    .assign(data_source="eia")        # tag every row with the name of the source
    .loc[:, kept_columns]             # column selection
    .drop_duplicates(dedup_key)       # one row per key (first occurrence wins)
    .rename(columns=snake_case_names)
)

## Creation of the Coal dataframe

In [111]:
# Coal dataframe creation: keep the coal rows, without the "Reserves" activity.
# Currently, the SDP does not have a "Reserves" indicator for Coal. The indicator is present
# in this dataset, but only in the MST unit, so for now it is removed in order to update
# the SDP with the same scope of values.
is_coal = df_eia["product_name"] == 'Coal'
is_reserves = df_eia["activity_name"] == "Reserves"

# .copy() makes df_eia_coal an independent frame: column assignments made on it later
# no longer trigger SettingWithCopyWarning or depend on view-versus-copy behaviour.
df_eia_coal = df_eia.loc[is_coal & ~is_reserves].copy()



## Conversion of energy units into Mtoe

In [112]:
# Energy unit conversion table: a value expressed in `unit` is divided by
# `to_Mtoe_divider` to obtain Mtoe.
# NOTE(review): the origin of these factors is not recorded in the notebook - TODO add the source.
df_conversion = pd.DataFrame({
    'unit': ["MTOE", "MT", 'QBTU', "TJ", "TST"],
    'to_Mtoe_divider': [1, 1568.08988833186, 0.0396832072107753, 41868.0000184606, 1728.52322045437],
})

# Non-numeric placeholders found in the raw "value" column. They are counted as 0
# (not as missing) before the column is cast to float.
placeholder_tokens = ["--", "ie"]

# Build a new frame instead of assigning into df_eia_coal, which is a filtered
# selection of df_eia: this avoids SettingWithCopyWarning.
coal_numeric = df_eia_coal.assign(
    value=df_eia_coal["value"].replace(placeholder_tokens, 0).astype(float)
)

# Left join so that every coal row is kept; validate guards against a unit being
# listed twice in the conversion table, which would duplicate data rows.
coal_with_factor = coal_numeric.merge(
    df_conversion, on="unit", how="left", validate="many_to_one"
)

# Units without a conversion factor end up with a NaN divider, hence a NaN value_MTOE.
# Report them explicitly rather than letting them disappear silently downstream.
unmapped_units = (
    coal_with_factor.loc[coal_with_factor["to_Mtoe_divider"].isna(), "unit"]
    .dropna()
    .unique()
    .tolist()
)
if unmapped_units:
    warnings.warn(
        f"No Mtoe conversion factor for unit(s) {unmapped_units}; "
        "value_MTOE is NaN for the corresponding rows.",
        stacklevel=2,
    )

# Conversion of all values to Mtoe, then removal of the columns that are no longer needed.
df_eia_coal = (
    coal_with_factor
    .assign(value_MTOE=coal_with_factor["value"] / coal_with_factor["to_Mtoe_divider"])
    .drop(columns=["value", "to_Mtoe_divider", 'unit', 'unit_name'])
)

## Pivot activity_name columns

In [120]:
# Pivot activity_name into columns to fit the target data structure:
# one row per (source, year, product, country), one column per activity.
# No aggfunc is passed, so pivot_table applies its default aggregation (the mean) when a
# key has several rows; combinations without data are filled with 0.
row_keys = ['data_source', 'year', 'product_name', 'country_iso3', 'country_name']

# Activity labels -> final column names.
activity_to_column = {
    'Consumption': 'consumption_Mtoe',
    'Exports': 'export_Mtoe',
    'Imports': 'import_Mtoe',
    'Production': 'production_Mtoe'
}

pivot_df = (
    pd.pivot_table(
        df_eia_coal,
        index=row_keys,
        columns='activity_name',
        values='value_MTOE',
        fill_value=0,
    )
    .reset_index()                        # row keys become ordinary columns again
    .rename(columns=activity_to_column)
    # NOTE(review): this second reset_index adds an extra "index" column holding the row
    # numbers. It is kept because the saved output shows it - confirm whether it is wanted.
    .reset_index()
)

# The columns axis still carries the name "activity_name" inherited from the pivot.
pivot_df.columns

Index(['index', 'data_source', 'year', 'product_name', 'country_iso3',
       'country_name', 'consumption_Mtoe', 'export_Mtoe', 'import_Mtoe',
       'production_Mtoe'],
      dtype='object', name='activity_name')

In [82]:
# Disabled scratch checks on the size of the "value" column.
# NOTE(review): the unit-conversion cell drops "value", so these only work on the
# pre-conversion frame (this cell's execution count predates the cells above).
#len(df_eia_coal["value"])
#df_eia_coal["value"].shape

# Replace the current row labels with a fresh 0..n-1 index; drop=True discards the old labels.
df_eia_coal = df_eia_coal.reset_index(drop=True)

In [72]:
# Diagnostic: number of rows per unit code, largest first.
# NOTE(review): needs the "unit" column, which the unit-conversion cell drops, so this
# has to run on the pre-conversion frame (the execution count predates the cells above).
rows_per_unit = df_eia_coal.groupby("unit")["data_source"].count()
rows_per_unit.sort_values(ascending=False)

unit
MST     457
QBTU    426
MT      392
MTOE    390
TST     390
TJ      389
Name: data_source, dtype: int64

In [68]:
# Diagnostic: show the rows for USA / 2020 / Consumption expressed in QBTU, used to
# inspect duplicated entries for a single key.
# NOTE(review): relies on the "unit" column, so it must run before the unit conversion.
is_usa = df_eia_coal["country_iso3"].eq("USA")
is_2020 = df_eia_coal["year"].eq(2020)
is_consumption = df_eia_coal["activity_name"].eq("Consumption")
is_qbtu = df_eia_coal["unit"].eq("QBTU")

df_eia_coal.loc[is_usa & is_2020 & is_consumption & is_qbtu]

Unnamed: 0,data_source,year,product_name,activity_name,country_iso3,country_name,unit_name,value,unit
31202,eia,2020,Coal,Consumption,USA,United States,quadrillion Btu,9.18110900000002,QBTU
33149,eia,2020,Coal,Consumption,USA,United States,quadrillion Btu,9.181109147,QBTU
