<h1>Dataset cleaning</h1>

In [1]:
import pandas as pd

In [2]:
# Load the raw per-capita CO₂ emissions table (path is relative to the notebook's working directory).
emissions = pd.read_csv('co-emissions-per-capita.csv')
# Print column names, dtypes and non-null counts; `Code` is the only column with missing values.
emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26600 entries, 0 to 26599
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Entity                             26600 non-null  object 
 1   Code                               23046 non-null  object 
 2   Year                               26600 non-null  int64  
 3   Annual CO₂ emissions (per capita)  26600 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 831.4+ KB


In [3]:
# Give the verbose value column a short name ("CO2") that is easier to reference later on.
emissions = emissions.rename(
    {"Annual CO₂ emissions (per capita)": "CO2"},
    axis="columns",
)
emissions.head()

Unnamed: 0,Entity,Code,Year,CO2
0,Afghanistan,AFG,1949,0.001992
1,Afghanistan,AFG,1950,0.011266
2,Afghanistan,AFG,1951,0.012098
3,Afghanistan,AFG,1952,0.011946
4,Afghanistan,AFG,1953,0.013685


In [4]:
# Normalise OWID-specific labels in place so they match the names/codes used by later cells
# ('Micronesia' in the continent lookup, 'WRL' in the country/aggregate split).
#
# The codes are compared as whole values with `.eq` rather than as regex substrings via
# `str.contains`, so only exact 'OWID_WRL' / 'OWID_KOS' codes are rewritten. `.eq` yields
# False for missing codes, so no extra NaN handling is required.
micro_idx = emissions['Entity'].eq('Micronesia (country)')
wrl_inx = emissions['Code'].eq('OWID_WRL')
kos_inx = emissions['Code'].eq('OWID_KOS')
emissions.loc[micro_idx, 'Entity'] = 'Micronesia'
emissions.loc[wrl_inx, 'Code'] = 'WRL'
emissions.loc[kos_inx, 'Code'] = 'KOS'

In [5]:
# Aggregate entities: rows without a code (regions, income groups) plus the relabelled world row.
emissions_not_countries = emissions.loc[
    emissions['Code'].isna() | emissions['Code'].eq('WRL')
]
emissions_not_countries['Entity'].unique()

array(['Africa', 'Asia', 'Asia (excl. China and India)', 'Europe',
       'Europe (excl. EU-27)', 'Europe (excl. EU-28)',
       'European Union (27)', 'European Union (28)',
       'High-income countries', 'Low-income countries',
       'Lower-middle-income countries', 'North America',
       'North America (excl. USA)', 'Oceania', 'South America',
       'Upper-middle-income countries', 'World'], dtype=object)

In [6]:
# Keep only real countries: rows that have a code and are not the world aggregate,
# then rename `Entity` to `Country` to reflect what the column now holds.
emissions_countries = (
    emissions
    .loc[emissions['Code'].notna() & emissions['Code'].ne('WRL')]
    .rename(columns={"Entity": "Country"})
)
emissions_countries.head()

Unnamed: 0,Country,Code,Year,CO2
0,Afghanistan,AFG,1949,0.001992
1,Afghanistan,AFG,1950,0.011266
2,Afghanistan,AFG,1951,0.012098
3,Afghanistan,AFG,1952,0.011946
4,Afghanistan,AFG,1953,0.013685


In [7]:
# Hand-maintained (country name, continent) pairs used to tag each country row with a continent.
# The names are matched by exact equality against the `Country` column, so spelling must agree
# with the emissions table (e.g. 'Micronesia', not 'Micronesia (country)').
# Transcontinental countries are assigned to a single continent here (e.g. Russia -> Europe,
# Turkey -> Asia).
countries_continents = [
    ('Afghanistan', 'Asia'), ('Albania', 'Europe'), ('Algeria', 'Africa'), ('Andorra', 'Europe'), 
    ('Angola', 'Africa'), ('Anguilla', 'North America'), ('Antigua and Barbuda', 'North America'), 
    ('Argentina', 'South America'), ('Armenia', 'Asia'), ('Aruba', 'North America'), 
    ('Australia', 'Oceania'), ('Austria', 'Europe'), ('Azerbaijan', 'Asia'), ('Bahamas', 'North America'), 
    ('Bahrain', 'Asia'), ('Bangladesh', 'Asia'), ('Barbados', 'North America'), ('Belarus', 'Europe'), 
    ('Belgium', 'Europe'), ('Belize', 'North America'), ('Benin', 'Africa'), ('Bermuda', 'North America'), 
    ('Bhutan', 'Asia'), ('Bolivia', 'South America'), ('Bonaire Sint Eustatius and Saba', 'North America'), 
    ('Bosnia and Herzegovina', 'Europe'), ('Botswana', 'Africa'), ('Brazil', 'South America'), 
    ('British Virgin Islands', 'North America'), ('Brunei', 'Asia'), ('Bulgaria', 'Europe'), 
    ('Burkina Faso', 'Africa'), ('Burundi', 'Africa'), ('Cambodia', 'Asia'), ('Cameroon', 'Africa'), 
    ('Canada', 'North America'), ('Cape Verde', 'Africa'), ('Central African Republic', 'Africa'), 
    ('Chad', 'Africa'), ('Chile', 'South America'), ('China', 'Asia'), ('Colombia', 'South America'), 
    ('Comoros', 'Africa'), ('Congo', 'Africa'), ('Cook Islands', 'Oceania'), ('Costa Rica', 'North America'), 
    ("Cote d'Ivoire", 'Africa'), ('Croatia', 'Europe'), ('Cuba', 'North America'), ('Curacao', 'North America'), 
    ('Cyprus', 'Europe'), ('Czechia', 'Europe'), ('Democratic Republic of Congo', 'Africa'), ('Denmark', 'Europe'), 
    ('Djibouti', 'Africa'), ('Dominica', 'North America'), ('Dominican Republic', 'North America'), 
    ('East Timor', 'Asia'), ('Ecuador', 'South America'), ('Egypt', 'Africa'), ('El Salvador', 'North America'), 
    ('Equatorial Guinea', 'Africa'), ('Eritrea', 'Africa'), ('Estonia', 'Europe'), ('Eswatini', 'Africa'), 
    ('Ethiopia', 'Africa'), ('Faroe Islands', 'Europe'), ('Fiji', 'Oceania'), ('Finland', 'Europe'), 
    ('France', 'Europe'), ('French Polynesia', 'Oceania'), ('Gabon', 'Africa'), ('Gambia', 'Africa'), 
    ('Georgia', 'Asia'), ('Germany', 'Europe'), ('Ghana', 'Africa'), ('Greece', 'Europe'), ('Greenland', 'North America'), 
    ('Grenada', 'North America'), ('Guatemala', 'North America'), ('Guinea', 'Africa'), ('Guinea-Bissau', 'Africa'), 
    ('Guyana', 'South America'), ('Haiti', 'North America'), ('Honduras', 'North America'), ('Hong Kong', 'Asia'), 
    ('Hungary', 'Europe'), ('Iceland', 'Europe'), ('India', 'Asia'), ('Indonesia', 'Asia'), ('Iran', 'Asia'), 
    ('Iraq', 'Asia'), ('Ireland', 'Europe'), ('Israel', 'Asia'), ('Italy', 'Europe'), ('Jamaica', 'North America'), 
    ('Japan', 'Asia'), ('Jordan', 'Asia'), ('Kazakhstan', 'Asia'), ('Kenya', 'Africa'), ('Kiribati', 'Oceania'), 
    ('Kosovo', 'Europe'), ('Kuwait', 'Asia'), ('Kyrgyzstan', 'Asia'), ('Laos', 'Asia'), ('Latvia', 'Europe'), 
    ('Lebanon', 'Asia'), ('Lesotho', 'Africa'), ('Liberia', 'Africa'), ('Libya', 'Africa'), ('Liechtenstein', 'Europe'), 
    ('Lithuania', 'Europe'), ('Luxembourg', 'Europe'), ('Macao', 'Asia'), ('Madagascar', 'Africa'), 
    ('Malawi', 'Africa'), ('Malaysia', 'Asia'), ('Maldives', 'Asia'), ('Mali', 'Africa'), ('Malta', 'Europe'), 
    ('Marshall Islands', 'Oceania'), ('Mauritania', 'Africa'), ('Mauritius', 'Africa'), ('Mexico', 'North America'), 
    ('Micronesia', 'Oceania'), ('Moldova', 'Europe'), ('Mongolia', 'Asia'), ('Montenegro', 'Europe'), 
    ('Montserrat', 'North America'), ('Morocco', 'Africa'), ('Mozambique', 'Africa'), ('Myanmar', 'Asia'), 
    ('Namibia', 'Africa'), ('Nauru', 'Oceania'), ('Nepal', 'Asia'), ('Netherlands', 'Europe'), 
    ('New Caledonia', 'Oceania'), ('New Zealand', 'Oceania'), ('Nicaragua', 'North America'), ('Niger', 'Africa'), 
    ('Nigeria', 'Africa'), ('Niue', 'Oceania'), ('North Korea', 'Asia'), ('North Macedonia', 'Europe'), 
    ('Norway', 'Europe'), ('Oman', 'Asia'), ('Pakistan', 'Asia'), ('Palau', 'Oceania'), ('Palestine', 'Asia'), 
    ('Panama', 'North America'), ('Papua New Guinea', 'Oceania'), ('Paraguay', 'South America'), 
    ('Peru', 'South America'), ('Philippines', 'Asia'), ('Poland', 'Europe'), ('Portugal', 'Europe'), 
    ('Qatar', 'Asia'), ('Romania', 'Europe'), ('Russia', 'Europe'), ('Rwanda', 'Africa'), ('Saint Helena', 'Africa'), 
    ('Saint Kitts and Nevis', 'North America'), ('Saint Lucia', 'North America'), 
    ('Saint Pierre and Miquelon', 'North America'), ('Saint Vincent and the Grenadines', 'North America'), 
    ('Samoa', 'Oceania'), ('Sao Tome and Principe', 'Africa'), ('Saudi Arabia', 'Asia'), ('Senegal', 'Africa'), 
    ('Serbia', 'Europe'), ('Seychelles', 'Africa'), ('Sierra Leone', 'Africa'), ('Singapore', 'Asia'), 
    ('Sint Maarten (Dutch part)', 'North America'), ('Slovakia', 'Europe'), ('Slovenia', 'Europe'), 
    ('Solomon Islands', 'Oceania'), ('Somalia', 'Africa'), ('South Africa', 'Africa'), ('South Korea', 'Asia'), 
    ('South Sudan', 'Africa'), ('Spain', 'Europe'), ('Sri Lanka', 'Asia'), ('Sudan', 'Africa'), 
    ('Suriname', 'South America'), ('Sweden', 'Europe'), ('Switzerland', 'Europe'), ('Syria', 'Asia'), 
    ('Taiwan', 'Asia'), ('Tajikistan', 'Asia'), ('Tanzania', 'Africa'), ('Thailand', 'Asia'), ('Togo', 'Africa'), 
    ('Tonga', 'Oceania'), ('Trinidad and Tobago', 'North America'), ('Tunisia', 'Africa'), ('Turkey', 'Asia'), 
    ('Turkmenistan', 'Asia'), ('Turks and Caicos Islands', 'North America'), ('Tuvalu', 'Oceania'), 
    ('Uganda', 'Africa'), ('Ukraine', 'Europe'), ('United Arab Emirates', 'Asia'), ('United Kingdom', 'Europe'), 
    ('United States', 'North America'), ('Uruguay', 'South America'), ('Uzbekistan', 'Asia'), ('Vanuatu', 'Oceania'),
    ('Venezuela', 'South America'), ('Vietnam', 'Asia'), ('Wallis and Futuna', 'Oceania'), 
    ('Yemen', 'Asia'), ('Zambia', 'Africa'), ('Zimbabwe', 'Africa')
]


In [8]:
# Tag every row with its continent in one vectorised lookup instead of running a full-column
# comparison and `.loc` write once per country. `dict(...)` keeps the last entry if a country
# were ever listed twice (same outcome as the sequential loop), and countries missing from
# `countries_continents` end up with NaN in `Continent`.
emissions_countries['Continent'] = emissions_countries['Country'].map(dict(countries_continents))

In [9]:
# Sanity check: only the six continent labels should appear; a NaN here would mean some
# country name has no entry in `countries_continents`.
emissions_countries['Continent'].unique()
# emissions_countries.to_csv('emissions_countries.csv', index=False)

array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania'], dtype=object)

In [10]:
# 2022 snapshot: one row per country, ranked from highest to lowest per-capita emissions.
# `Year` is dropped because it is constant after the filter.
emissions_2022_countries = (
    emissions_countries
    .loc[emissions_countries['Year'].eq(2022)]
    .drop(columns=['Year'])
    .sort_values(by='CO2', ascending=False)
    .reset_index(drop=True)
)
emissions_2022_countries.shape
# emissions_2022_countries.to_csv('emissions_2022_countries.csv', index=False)

(214, 4)

In [11]:
# Unweighted mean of the per-capita values: every country counts equally, regardless of population.
emissions_2022_countries['CO2'].mean()

4.581112053560747

BONUS: "Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions."

In [12]:
# Rows are already sorted by CO2 (descending), so this shows the five highest per-capita emitters.
emissions_2022_countries.head()

Unnamed: 0,Country,Code,CO2,Continent
0,Qatar,QAT,37.601273,Asia
1,United Arab Emirates,ARE,25.833244,Asia
2,Bahrain,BHR,25.672274,Asia
3,Kuwait,KWT,25.578102,Asia
4,Brunei,BRN,23.950201,Asia


# Join population

In [13]:
# Load the population table (one row per country, population given in one column per year).
population_all = pd.read_csv('world_population_data.csv')
# Print the schema; the year columns are named with strings such as '2022'.
population_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       234 non-null    int64  
 1   Rank             234 non-null    int64  
 2   country          234 non-null    object 
 3   country_code     234 non-null    object 
 4   1980             234 non-null    float64
 5   2000             234 non-null    int64  
 6   2010             234 non-null    int64  
 7   2021             234 non-null    int64  
 8   2022             234 non-null    int64  
 9   2030             234 non-null    int64  
 10  2050             234 non-null    int64  
 11  area             234 non-null    float64
 12  landAreaKm       234 non-null    float64
 13  growthRate       234 non-null    float64
 14  worldPercentage  234 non-null    float64
 15  density          234 non-null    float64
dtypes: float64(6), int64(8), object(2)
memory usage: 29.4+ KB


In [14]:
# Keep only the join key and the 2022 head-count, renamed to match the emissions frame.
population_2022 = (
    population_all
    .loc[:, ['country_code', '2022']]
    .rename(columns={'country_code': 'Code', '2022': 'Population'})
)
population_2022.head()

Unnamed: 0,Code,Population
0,CHN,1425887337
1,IND,1417173173
2,USA,338289857
3,IDN,275501339
4,PAK,235824862


In [15]:
# Left join on `Code`: every emissions row is kept, and countries whose code is absent from the
# population table get NaN in `Population`.
# NOTE: this overwrites its own input; re-running the cell on the already-merged frame would
# merge again and produce suffixed `Population_x` / `Population_y` columns.
emissions_2022_countries = pd.merge(
    emissions_2022_countries,
    population_2022,
    how='left',
    on='Code',
)

In [16]:
# List the countries that found no match in the population table during the merge.
null_idx = emissions_2022_countries['Population'].isna()
missing_countries = emissions_2022_countries.loc[null_idx, 'Country'].tolist()
missing_countries

['Kosovo', 'Bonaire Sint Eustatius and Saba', 'Saint Helena']

In [17]:
# Schema after the merge: `Population` is float64 because the unmatched rows hold NaN.
emissions_2022_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     214 non-null    object 
 1   Code        214 non-null    object 
 2   CO2         214 non-null    float64
 3   Continent   214 non-null    object 
 4   Population  211 non-null    float64
dtypes: float64(2), object(3)
memory usage: 8.5+ KB


In [18]:
# Drop the countries without a population figure, since a total cannot be computed for them.
# The index is not reset here, so it keeps gaps where rows were removed.
emissions_2022_countries = emissions_2022_countries.dropna(subset=['Population'])
emissions_2022_countries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211 entries, 0 to 213
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     211 non-null    object 
 1   Code        211 non-null    object 
 2   CO2         211 non-null    float64
 3   Continent   211 non-null    object 
 4   Population  211 non-null    float64
dtypes: float64(2), object(3)
memory usage: 9.9+ KB


In [19]:
# Country-level total = per-capita emissions × population (same unit as the per-capita figure).
emissions_2022_countries['Total'] = emissions_2022_countries['CO2'].mul(
    emissions_2022_countries['Population']
)
emissions_2022_countries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 211 entries, 0 to 213
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     211 non-null    object 
 1   Code        211 non-null    object 
 2   CO2         211 non-null    float64
 3   Continent   211 non-null    object 
 4   Population  211 non-null    float64
 5   Total       211 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.5+ KB


In [20]:
# emissions_2022_countries.to_csv('emissions_2022_total.csv', index=False)

In [36]:
# Largest total emitters per continent, in the order the continents first appear in the frame.
#
# NOTE(review): the result is called `top3` (name kept because later cells use it), but only
# the top 2 countries per continent are kept. TOP_N makes that explicit; confirm which number
# was intended.
TOP_N = 2

continents = emissions_2022_countries.Continent.unique()

# Rank once, then slice per continent. The pieces are concatenated in a single call instead of
# growing a frame inside the loop from an empty, column-only DataFrame (pandas 2.x warns about
# concatenating empty frames, and doing so can degrade the numeric column dtypes).
ranked = emissions_2022_countries.sort_values(by='Total', ascending=False)
per_continent_top = [
    ranked[ranked['Continent'] == cont].head(TOP_N)
    for cont in continents
]
# Fall back to an empty slice so an empty input still yields an empty frame with the right columns.
top3 = pd.concat(per_continent_top or [ranked.iloc[:0]], ignore_index=True)

top3 = top3.drop(columns=['Code', 'CO2', 'Population'])
top3.head()

Unnamed: 0,Country,Continent,Total
0,China,Asia,11396780000.0
1,India,Asia,2829644000.0
2,United States,North America,5057303000.0
3,Canada,North America,547943900.0
4,Australia,Oceania,392279300.0


In [37]:
# Every country that is not one of the per-continent top emitters, reduced to the plot columns.
others = (
    emissions_2022_countries
    .loc[~emissions_2022_countries['Country'].isin(top3['Country'])]
    .drop(columns=['Code', 'CO2', 'Population'])
)
others.head()

Unnamed: 0,Country,Continent,Total
0,Qatar,Asia,101340000.0
1,United Arab Emirates,Asia,243895000.0
2,Bahrain,Asia,37795570.0
3,Kuwait,Asia,109189700.0
4,Brunei,Asia,10753690.0


In [38]:
# Sum the emissions of all remaining ("other") countries per continent.
others_totals = others.groupby('Continent')['Total'].sum().reset_index()
print(others_totals)

# Build all "Others (<continent>)" rows as one frame and append them in a single concat,
# rather than concatenating one single-row frame per loop iteration.
others_rows = pd.DataFrame({
    'Country': "Others (" + others_totals['Continent'] + ")",
    'Continent': others_totals['Continent'],
    'Total': others_totals['Total'],
})
plot4 = pd.concat([top3, others_rows], ignore_index=True)

       Continent         Total
0         Africa  7.536028e+08
1           Asia  7.545926e+09
2         Europe  2.779507e+09
3  North America  6.743294e+08
4        Oceania  1.658555e+07
5  South America  4.090506e+08


In [39]:
# Renumber the rows 0..n-1 so the appended rows do not carry duplicate index labels.
plot4 = plot4.reset_index(drop=True)
plot4

Unnamed: 0,Country,Continent,Total
0,China,Asia,11396780000.0
1,India,Asia,2829644000.0
2,United States,North America,5057303000.0
3,Canada,North America,547943900.0
4,Australia,Oceania,392279300.0
5,New Zealand,Oceania,32211810.0
6,Russia,Europe,1652177000.0
7,Germany,Europe,665604700.0
8,South Africa,Africa,404054000.0
9,Egypt,Africa,258951700.0


In [40]:
# Per-continent totals over all countries (top emitters and "others" together).
totals = emissions_2022_countries.groupby('Continent')['Total'].sum().reset_index()
print(totals)

# Build all "Total (<continent>)" rows as one frame and append them in a single concat,
# rather than concatenating one single-row frame per loop iteration.
# NOTE: this appends to `plot4` in place of its previous value, so re-running the cell would
# add the total rows a second time.
total_rows = pd.DataFrame({
    'Country': "Total (" + totals['Continent'] + ")",
    'Continent': totals['Continent'],
    'Total': totals['Total'],
})
plot4 = pd.concat([plot4, total_rows]).reset_index(drop=True)
plot4

       Continent         Total
0         Africa  1.416608e+09
1           Asia  2.177235e+10
2         Europe  5.097289e+09
3  North America  6.279577e+09
4        Oceania  4.410767e+08
5  South America  1.085392e+09


Unnamed: 0,Country,Continent,Total
0,China,Asia,11396780000.0
1,India,Asia,2829644000.0
2,United States,North America,5057303000.0
3,Canada,North America,547943900.0
4,Australia,Oceania,392279300.0
5,New Zealand,Oceania,32211810.0
6,Russia,Europe,1652177000.0
7,Germany,Europe,665604700.0
8,South Africa,Africa,404054000.0
9,Egypt,Africa,258951700.0


In [41]:
# Persist the plot dataset (country rows plus the per-continent summary rows) without the index column.
plot4.to_csv('plot4_dataset.csv', index=False)

In [42]:
# Load the fossil + land-use CO₂ table (path is relative to the notebook's working directory).
heatmap = pd.read_csv('co2-fossil-plus-land-use/co2-fossil-plus-land-use.csv')

In [43]:
# Inspect the schema; `Code`, `Total`, `Land` and `Fossil` all contain missing values.
heatmap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44010 entries, 0 to 44009
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  44010 non-null  object 
 1   Code     37825 non-null  object 
 2   Year     44010 non-null  int64  
 3   Total    23320 non-null  float64
 4   Land     37022 non-null  float64
 5   Fossil   30308 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 2.0+ MB


In [44]:
# Which entities have no code? (These turn out to be regions, income groups and special
# aggregates rather than countries.)
null_idx = heatmap['Code'].isna()
heatmap.loc[null_idx, 'Country'].unique()

array(['Africa', 'Africa (GCP)', 'Asia', 'Asia (GCP)',
       'Asia (excl. China and India)', 'Central America (GCP)', 'Europe',
       'Europe (GCP)', 'Europe (excl. EU-27)', 'Europe (excl. EU-28)',
       'European Union (27)', 'European Union (28)',
       'High-income countries', 'International aviation',
       'International shipping', 'Kuwaiti Oil Fires (GCP)',
       'Leeward Islands (GCP)', 'Low-income countries',
       'Lower-middle-income countries', 'Middle East (GCP)',
       'Non-OECD (GCP)', 'North America', 'North America (GCP)',
       'North America (excl. USA)', 'OECD (GCP)', 'Oceania',
       'Oceania (GCP)', 'Panama Canal Zone (GCP)', 'Ryukyu Islands (GCP)',
       'South America', 'South America (GCP)',
       'St. Kitts-Nevis-Anguilla (GCP)', 'Upper-middle-income countries'],
      dtype=object)

In [45]:
# Keep only rows that carry a code and are not the world aggregate.
heatmap = heatmap.loc[heatmap['Code'].notna() & heatmap['Code'].ne('OWID_WRL')]
heatmap.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37552 entries, 0 to 44009
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  37552 non-null  object 
 1   Code     37552 non-null  object 
 2   Year     37552 non-null  int64  
 3   Total    20413 non-null  float64
 4   Land     34081 non-null  float64
 5   Fossil   23884 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 2.0+ MB


In [46]:
# Discard rows without a `Total` value; the index labels of the remaining rows are preserved.
heatmap = heatmap[heatmap['Total'].notna()]
heatmap.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20413 entries, 0 to 43956
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  20413 non-null  object 
 1   Code     20413 non-null  object 
 2   Year     20413 non-null  int64  
 3   Total    20413 non-null  float64
 4   Land     20413 non-null  float64
 5   Fossil   20413 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 1.1+ MB


In [47]:
# Sanity check: the world aggregate should be gone, so this is expected to be empty.
heatmap.loc[heatmap['Country'].eq('World')]

Unnamed: 0,Country,Code,Year,Total,Land,Fossil


In [48]:
# heatmap.to_csv('fossil_land.csv', index=False)

In [49]:
# Spot check of a single row: China in 2022.
heatmap.loc[heatmap['Year'].eq(2022) & heatmap['Country'].eq('China')]

Unnamed: 0,Country,Code,Year,Total,Land,Fossil
8416,China,CHN,2022,12378280000.0,981503740.0,11396780000.0
