In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import math
import os

In [86]:
# function to read specific csv files from the output folder and merge them
def merge_owid_worldbank_data(country):
    
    country = country.lower() # just in case, lol
    
    df_owid = pd.read_csv(f'output/owid_{country}_impute.csv')
    df_worldbank = pd.read_csv(f'output/worldbank_{country}_impute.csv')
    
    df_country = df_owid.merge(df_worldbank, how='left', on='year')
    df_country.drop(['iso_code_x', 'country_x'], axis=1, inplace=True)
    df_country.rename(columns={'iso_code_y': 'iso_code',
                               'country_y': 'country'}, inplace=True)
    
    # save merged dfs
    df_country.to_csv(f'output/merged_{country}.csv', index=False)
    
    return df_country

In [87]:
# df_china = pd.read_csv('output/merged_china.csv')
# df_us = pd.read_csv('output/merged_us.csv')
# df_india = pd.read_csv('output/merged_india.csv')

df_sector = pd.read_csv('dataset\owid\ghg-emissions-by-sector.csv')
df_china =  merge_owid_worldbank_data('china')
df_us =  merge_owid_worldbank_data('us')
df_india = merge_owid_worldbank_data('india')

In [88]:
df_sector.head()

Unnamed: 0,Entity,Code,Year,"Agriculture (GHG Emissions, CAIT)","Land-Use Change and Forestry (GHG Emissions, CAIT)","Waste (GHG Emissions, CAIT)","Industry (GHG Emissions, CAIT)","Manufacturing/Construction energy (GHG Emissions, CAIT)","Transport (GHG Emissions, CAIT)","Electricity & Heat (GHG Emissions, CAIT)","Buildings (GHG Emissions, CAIT)","Fugitive from energy production (GHG Emissions, CAIT)","Other Fuel Combustion (GHG Emissions, CAIT)","Bunker Fuels (GHG Emissions, CAIT)"
0,Afghanistan,AFG,1990,8090000.0,0.0,1230000.0,50000.0,,,,,610000.0,2630000.0,
1,Afghanistan,AFG,1991,8410000.0,0.0,1320000.0,50000.0,,,,,520000.0,2400000.0,
2,Afghanistan,AFG,1992,8420000.0,0.0,1400000.0,60000.0,,,,,220000.0,2180000.0,
3,Afghanistan,AFG,1993,8500000.0,0.0,1490000.0,60000.0,,,,,160000.0,1950000.0,
4,Afghanistan,AFG,1994,8540000.0,0.0,1580000.0,70000.0,,,,,120000.0,1720000.0,


In [89]:
df_sector.columns = df_sector.columns.str.replace("(GHG Emissions, CAIT)", "")
df_sector.columns = df_sector.columns.str.replace(r"\(.*\)","")
df_sector.columns

  df_sector.columns = df_sector.columns.str.replace("(GHG Emissions, CAIT)", "")
  df_sector.columns = df_sector.columns.str.replace(r"\(.*\)","")


Index(['Entity', 'Code', 'Year', 'Agriculture ',
       'Land-Use Change and Forestry ', 'Waste ', 'Industry ',
       'Manufacturing/Construction energy ', 'Transport ',
       'Electricity & Heat ', 'Buildings ', 'Fugitive from energy production ',
       'Other Fuel Combustion ', 'Bunker Fuels '],
      dtype='object')

In [83]:
df_sector_china = df_sector[df_sector['Code'] == "CHN"]
df_sector_india = df_sector[df_sector['Code'] == "IND"]
df_sector_usa = df_sector[df_sector['Code'] == "USA"]

In [77]:
df_sector_merged_china = df_sector.merge(df_sector_china, left_on=['Code','Year','Entity'], right_on=['Code','Year','Entity'],suffixes=('', '_y'))
df_sector_merged_china.columns = df_sector_merged_china.columns.str.replace("_y", "")
df_sector_merged_china.head()


Unnamed: 0,Entity,Code,Year,Agriculture,Land-Use Change and Forestry,Waste,Industry,Manufacturing/Construction energy,Transport,Electricity & Heat,...,Land-Use Change and Forestry.1,Waste.1,Industry.1,Manufacturing/Construction energy.1,Transport.1,Electricity & Heat.1,Buildings,Fugitive from energy production,Other Fuel Combustion,Bunker Fuels
0,China,CHN,1990,588430000.0,-318410000.0,194710000.0,94350000.0,745200000.0,94200000.0,725400000.0,...,-318410000.0,194710000.0,94350000.0,745200000.0,94200000.0,725400000.0,384700000.0,156910000.0,236300000.0,5600000.0
1,China,CHN,1991,598580000.0,-318410000.0,199460000.0,112600000.0,778800000.0,100700000.0,794300000.0,...,-318410000.0,199460000.0,112600000.0,778800000.0,100700000.0,794300000.0,383300000.0,162830000.0,239060000.0,7300000.0
2,China,CHN,1992,603320000.0,-318410000.0,204210000.0,135160000.0,807800000.0,111200000.0,877700000.0,...,-318410000.0,204210000.0,135160000.0,807800000.0,111200000.0,877700000.0,360000000.0,168760000.0,232830000.0,11300000.0
3,China,CHN,1993,590480000.0,-318410000.0,208960000.0,157020000.0,861500000.0,125800000.0,997700000.0,...,-318410000.0,208960000.0,157020000.0,861500000.0,125800000.0,997700000.0,370400000.0,174680000.0,237600000.0,9900000.0
4,China,CHN,1994,610470000.0,-318410000.0,213710000.0,180080000.0,903400000.0,116400000.0,1082900000.0,...,-318410000.0,213710000.0,180080000.0,903400000.0,116400000.0,1082900000.0,341300000.0,180600000.0,246260000.0,10900000.0


In [90]:
df_sector_merged_india = df_sector.merge(df_sector_india, left_on=['Code','Year','Entity'], right_on=['Code','Year','Entity'],suffixes=('', '_y'))
df_sector_merged_india.columns = df_sector_merged_india.columns.str.replace("_y", "")
df_sector_merged_india.head()

Unnamed: 0,Entity,Code,Year,Agriculture,Land-Use Change and Forestry,Waste,Industry,Manufacturing/Construction energy,Transport,Electricity & Heat,...,Land-Use Change and Forestry.1,Waste.1,Industry.1,Manufacturing/Construction energy.1,Transport.1,Electricity & Heat.1,Buildings,Fugitive from energy production,Other Fuel Combustion,Bunker Fuels
0,India,IND,1990,566640000.0,-42470000.0,24540000.0,26450000.0,148000000.0,64400000.0,232000000.0,...,-42470000.0,24540000.0,26450000.0,148000000.0,64400000.0,232000000.0,56400000.0,36520000.0,70590000.0,5100000.0
1,India,IND,1991,574020000.0,-42470000.0,25190000.0,29000000.0,152100000.0,66900000.0,258900000.0,...,-42470000.0,25190000.0,29000000.0,152100000.0,66900000.0,258900000.0,59300000.0,39080000.0,75850000.0,4900000.0
2,India,IND,1992,580520000.0,-42470000.0,25850000.0,30060000.0,155100000.0,68500000.0,278600000.0,...,-42470000.0,25850000.0,30060000.0,155100000.0,68500000.0,278600000.0,58600000.0,38630000.0,76410000.0,5000000.0
3,India,IND,1993,587230000.0,-42470000.0,26500000.0,31900000.0,152700000.0,68600000.0,306700000.0,...,-42470000.0,26500000.0,31900000.0,152700000.0,68600000.0,306700000.0,58900000.0,35060000.0,76060000.0,5400000.0
4,India,IND,1994,595230000.0,-42470000.0,27160000.0,34290000.0,162400000.0,70800000.0,319800000.0,...,-42470000.0,27160000.0,34290000.0,162400000.0,70800000.0,319800000.0,63500000.0,35270000.0,81420000.0,5800000.0


In [91]:
df_sector_merged_usa = df_sector.merge(df_sector_usa, left_on=['Code','Year','Entity'], right_on=['Code','Year','Entity'],suffixes=('', '_y'))
df_sector_merged_usa.columns = df_sector_merged_usa.columns.str.replace("_y", "")
df_sector_merged_usa.head()

Unnamed: 0,Entity,Code,Year,Agriculture,Land-Use Change and Forestry,Waste,Industry,Manufacturing/Construction energy,Transport,Electricity & Heat,...,Land-Use Change and Forestry.1,Waste.1,Industry.1,Manufacturing/Construction energy.1,Transport.1,Electricity & Heat.1,Buildings,Fugitive from energy production,Other Fuel Combustion,Bunker Fuels
0,United States,USA,1990,358270000.0,-291160000.0,199340000.0,164710000.0,604500000.0,1427000000.0,2159800000.0,...,-291160000.0,199340000.0,164710000.0,604500000.0,1427000000.0,2159800000.0,545500000.0,360760000.0,140960000.0,130800000.0
1,United States,USA,1991,359280000.0,-291160000.0,201900000.0,154310000.0,566900000.0,1399700000.0,2179900000.0,...,-291160000.0,201900000.0,154310000.0,566900000.0,1399700000.0,2179900000.0,555200000.0,361500000.0,140410000.0,137100000.0
2,United States,USA,1992,366210000.0,-291160000.0,202080000.0,157310000.0,517700000.0,1430700000.0,2276900000.0,...,-291160000.0,202080000.0,157310000.0,517700000.0,1430700000.0,2276900000.0,565000000.0,357230000.0,127200000.0,144000000.0
3,United States,USA,1993,369970000.0,-291160000.0,200220000.0,160570000.0,532500000.0,1454200000.0,2335500000.0,...,-291160000.0,200220000.0,160570000.0,532500000.0,1454200000.0,2335500000.0,585500000.0,345680000.0,126240000.0,132400000.0
4,United States,USA,1994,375870000.0,-291160000.0,200520000.0,165510000.0,534900000.0,1506900000.0,2360800000.0,...,-291160000.0,200520000.0,165510000.0,534900000.0,1506900000.0,2360800000.0,575900000.0,348300000.0,128980000.0,130100000.0


In [4]:
## TO DO
# come up with 2 EDA questions + 1 ML question
# ...
# Covid impact to CO2 ( Mar 2020 - Mar 2021)