### Setup notebook and import files

In [1]:
import pandas as pd
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
#import Co2 emissions data
co2_df = pd.read_csv("./Resources/owid-co2-data.csv")
co2_df

Unnamed: 0,iso_code,country,year,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
0,AFG,Afghanistan,1949,0.015,,,,,,0.002,...,,,,,,,,,7663783.0,
1,AFG,Afghanistan,1950,0.084,475.000,0.070,,,,0.011,...,,,,,,,,,7752000.0,1.949480e+10
2,AFG,Afghanistan,1951,0.092,8.696,0.007,,,,0.012,...,,,,,,,,,7840000.0,2.006385e+10
3,AFG,Afghanistan,1952,0.092,0.000,0.000,,,,0.012,...,,,,,,,,,7936000.0,2.074235e+10
4,AFG,Afghanistan,1953,0.106,16.000,0.015,,,,0.013,...,,,,,,,,,8040000.0,2.201546e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24011,ZWE,Zimbabwe,2014,11.962,2.838,0.330,12.760,0.798,6.671,0.880,...,4.865,11.24,0.827,6.27,0.461,,,,13587000.0,2.474828e+10
24012,ZWE,Zimbabwe,2015,12.163,1.685,0.202,13.010,0.847,6.966,0.880,...,4.885,11.87,0.859,6.68,0.484,,,,13815000.0,2.503057e+10
24013,ZWE,Zimbabwe,2016,10.807,-11.146,-1.356,11.809,1.001,9.264,0.770,...,4.703,11.92,0.850,6.55,0.467,,,,14030000.0,2.515176e+10
24014,ZWE,Zimbabwe,2017,12.026,11.274,1.218,12.531,0.505,4.198,0.845,...,,,,,,,,,14237000.0,


In [3]:
#import temps by country data
temps_df = pd.read_csv("./Resources/GlobalLandTemperaturesByCountry.csv")
temps_df

Unnamed: 0,year,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Ã…land
1,1743-12-01,,,Ã…land
2,1744-01-01,,,Ã…land
3,1744-02-01,,,Ã…land
4,1744-03-01,,,Ã…land
...,...,...,...,...
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe


### Cleaning the CO2 data

Removing NAs and assessing data.  For the initial analysis, we chose to drop the entries for 'country' that obviously didn't represent an individual country or a region of the world.  We then created a new dataframe with the most pertinent columns for an exploratory analysis to try to discover a correlation between carbon output and a country's GDP.

In [4]:
# exploring data to see which countries are represented
co2_df['country'].unique().tolist()

['Afghanistan',
 'Africa',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctic Fisheries',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Asia',
 'Asia (excl. China & India)',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czechia',
 'Czechoslovakia',
 'Democratic Republic of Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'EU-27',
 'EU-28',
 'Ecuador',
 'Egypt',
 'E

In [5]:
# remove entries from 'country' column that don't represent individual countries or regions
co2_countries = co2_df.loc[(co2_df['country'] != 'World') & (co2_df['country'] != 'Kuwaiti Oil Fires') & (co2_df['country'] != 'Antarctic Fisheries') & (co2_df['country'] != 'Statistical Difference')]
co2_countries

Unnamed: 0,iso_code,country,year,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
0,AFG,Afghanistan,1949,0.015,,,,,,0.002,...,,,,,,,,,7663783.0,
1,AFG,Afghanistan,1950,0.084,475.000,0.070,,,,0.011,...,,,,,,,,,7752000.0,1.949480e+10
2,AFG,Afghanistan,1951,0.092,8.696,0.007,,,,0.012,...,,,,,,,,,7840000.0,2.006385e+10
3,AFG,Afghanistan,1952,0.092,0.000,0.000,,,,0.012,...,,,,,,,,,7936000.0,2.074235e+10
4,AFG,Afghanistan,1953,0.106,16.000,0.015,,,,0.013,...,,,,,,,,,8040000.0,2.201546e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24011,ZWE,Zimbabwe,2014,11.962,2.838,0.330,12.760,0.798,6.671,0.880,...,4.865,11.24,0.827,6.27,0.461,,,,13587000.0,2.474828e+10
24012,ZWE,Zimbabwe,2015,12.163,1.685,0.202,13.010,0.847,6.966,0.880,...,4.885,11.87,0.859,6.68,0.484,,,,13815000.0,2.503057e+10
24013,ZWE,Zimbabwe,2016,10.807,-11.146,-1.356,11.809,1.001,9.264,0.770,...,4.703,11.92,0.850,6.55,0.467,,,,14030000.0,2.515176e+10
24014,ZWE,Zimbabwe,2017,12.026,11.274,1.218,12.531,0.505,4.198,0.845,...,,,,,,,,,14237000.0,


In [6]:
# create new dataframe with most pertinent columns for initial analysis
country_data = co2_countries[['country','year','co2','population','gdp']]
country_data

Unnamed: 0,country,year,co2,population,gdp
0,Afghanistan,1949,0.015,7663783.0,
1,Afghanistan,1950,0.084,7752000.0,1.949480e+10
2,Afghanistan,1951,0.092,7840000.0,2.006385e+10
3,Afghanistan,1952,0.092,7936000.0,2.074235e+10
4,Afghanistan,1953,0.106,8040000.0,2.201546e+10
...,...,...,...,...,...
24011,Zimbabwe,2014,11.962,13587000.0,2.474828e+10
24012,Zimbabwe,2015,12.163,13815000.0,2.503057e+10
24013,Zimbabwe,2016,10.807,14030000.0,2.515176e+10
24014,Zimbabwe,2017,12.026,14237000.0,


In [7]:
#drop NAs
clean_co2_df = country_data.dropna()
clean_co2_df

Unnamed: 0,country,year,co2,population,gdp
1,Afghanistan,1950,0.084,7752000.0,1.949480e+10
2,Afghanistan,1951,0.092,7840000.0,2.006385e+10
3,Afghanistan,1952,0.092,7936000.0,2.074235e+10
4,Afghanistan,1953,0.106,8040000.0,2.201546e+10
5,Afghanistan,1954,0.106,8151000.0,2.248333e+10
...,...,...,...,...,...
24009,Zimbabwe,2012,7.695,13115000.0,2.048226e+10
24010,Zimbabwe,2013,11.632,13350000.0,2.374258e+10
24011,Zimbabwe,2014,11.962,13587000.0,2.474828e+10
24012,Zimbabwe,2015,12.163,13815000.0,2.503057e+10


In [8]:
#how many countries in temps dataset?
temp_countrylist = temps_df['Country'].unique().tolist()
len(temp_countrylist)

243

In [9]:
# how many countries in co2 dataset?
co2_countrylist = clean_co2_df['country'].unique().tolist()
len(co2_countrylist)

165

### Cleaning Temperature Data

Removing NAs and preparing for merge with Co2 data.  This dataset has temperature data for each country for each year.  However, the temperatures are recorded monthly for each year, whereas the Co2 data only has one entry for each year.  We cleaned this dataset, and found the yearly average temperature so that there would only be one temperature entry for each year.

In [10]:
# drop uncertainty column from temps
clean_temps = temps_df.drop(['AverageTemperatureUncertainty'], axis=1)
clean_temps

Unnamed: 0,year,AverageTemperature,Country
0,1743-11-01,4.384,Ã…land
1,1743-12-01,,Ã…land
2,1744-01-01,,Ã…land
3,1744-02-01,,Ã…land
4,1744-03-01,,Ã…land
...,...,...,...
577457,2013-05-01,19.059,Zimbabwe
577458,2013-06-01,17.613,Zimbabwe
577459,2013-07-01,17.000,Zimbabwe
577460,2013-08-01,19.759,Zimbabwe


In [11]:
# drop nas
clean_temps = clean_temps.dropna()

# reorder columns
clean_temps = clean_temps[['Country', 'AverageTemperature', 'year']]

# change Country to country
clean_temps = clean_temps.rename(columns={'Country':'country'})
clean_temps

Unnamed: 0,country,AverageTemperature,year
0,Ã…land,4.384,1743-11-01
5,Ã…land,1.530,1744-04-01
6,Ã…land,6.702,1744-05-01
7,Ã…land,11.609,1744-06-01
8,Ã…land,15.342,1744-07-01
...,...,...,...
577456,Zimbabwe,21.142,2013-04-01
577457,Zimbabwe,19.059,2013-05-01
577458,Zimbabwe,17.613,2013-06-01
577459,Zimbabwe,17.000,2013-07-01


In [12]:
# convert year to datetime format
clean_temps['year'] = pd.to_datetime(clean_temps['year'])
clean_temps.dtypes

country                       object
AverageTemperature           float64
year                  datetime64[ns]
dtype: object

In [13]:
# convert date to just year
clean_temps['year'] = clean_temps['year'].dt.strftime('%Y')
clean_temps.dtypes

country                object
AverageTemperature    float64
year                   object
dtype: object

In [14]:
# find average temp for each year in each country

avg_temps = clean_temps.groupby(['country','year']).mean()
clean_temps = avg_temps.reset_index()
clean_temps

Unnamed: 0,country,year,AverageTemperature
0,Afghanistan,1838,18.379571
1,Afghanistan,1840,13.413455
2,Afghanistan,1841,13.997600
3,Afghanistan,1842,15.154667
4,Afghanistan,1843,13.756250
...,...,...,...
45910,Ã…land,2009,6.489083
45911,Ã…land,2010,4.861917
45912,Ã…land,2011,7.170750
45913,Ã…land,2012,6.063917


In [15]:
clean_co2_df.dtypes

country        object
year            int64
co2           float64
population    float64
gdp           float64
dtype: object

In [16]:
clean_temps.dtypes

country                object
year                   object
AverageTemperature    float64
dtype: object

In [17]:
# convert year to int for better merging
clean_temps['year'] = clean_temps['year'].astype(int)

### Merge the two datasets together!

In [18]:
#merge temp and co2 dataframes
co2_temp = pd.merge(clean_co2_df, clean_temps, how='left',on = ['country', 'year'])
co2_temp

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature
0,Afghanistan,1950,0.084,7752000.0,1.949480e+10,13.043500
1,Afghanistan,1951,0.092,7840000.0,2.006385e+10,13.967750
2,Afghanistan,1952,0.092,7936000.0,2.074235e+10,14.175417
3,Afghanistan,1953,0.106,8040000.0,2.201546e+10,14.650750
4,Afghanistan,1954,0.106,8151000.0,2.248333e+10,13.691333
...,...,...,...,...,...,...
12843,Zimbabwe,2012,7.695,13115000.0,2.048226e+10,21.521333
12844,Zimbabwe,2013,11.632,13350000.0,2.374258e+10,20.710750
12845,Zimbabwe,2014,11.962,13587000.0,2.474828e+10,
12846,Zimbabwe,2015,12.163,13815000.0,2.503057e+10,


### Cleaning and exploring the merged data

In [19]:
# Explore countries represented
co2_temp['country'].unique().tolist()

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Democratic Republic of Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',

In [20]:
#drop NAs from merged data
co2_temp = co2_temp.dropna()
co2_temp

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature
0,Afghanistan,1950,0.084,7752000.0,1.949480e+10,13.043500
1,Afghanistan,1951,0.092,7840000.0,2.006385e+10,13.967750
2,Afghanistan,1952,0.092,7936000.0,2.074235e+10,14.175417
3,Afghanistan,1953,0.106,8040000.0,2.201546e+10,14.650750
4,Afghanistan,1954,0.106,8151000.0,2.248333e+10,13.691333
...,...,...,...,...,...,...
12840,Zimbabwe,2009,5.519,12527000.0,1.514130e+10,21.377250
12841,Zimbabwe,2010,7.707,12698000.0,1.660410e+10,21.986250
12842,Zimbabwe,2011,9.498,12894000.0,1.830726e+10,21.602417
12843,Zimbabwe,2012,7.695,13115000.0,2.048226e+10,21.521333


In [21]:
# how many countries in co2 dataset?
co2_countrylist = co2_temp['country'].unique().tolist()
len(co2_countrylist)

154

### Begin to explore correlation
We added columns for to show 'gdp per capita' based on each country's population and 'co2 per capita' to show how much CO2 is created for person in the country.  Then we can assess if there is a correlation between the gdp of a country and the co2 output.

In [22]:
# create new column for 'gdp per capita'
co2_temp['gdp_per_capita'] = co2_temp['gdp'] / co2_temp['population']
co2_temp

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita
0,Afghanistan,1950,0.084,7752000.0,1.949480e+10,13.043500,2514.808999
1,Afghanistan,1951,0.092,7840000.0,2.006385e+10,13.967750,2559.164343
2,Afghanistan,1952,0.092,7936000.0,2.074235e+10,14.175417,2613.703484
3,Afghanistan,1953,0.106,8040000.0,2.201546e+10,14.650750,2738.241719
4,Afghanistan,1954,0.106,8151000.0,2.248333e+10,13.691333,2758.352230
...,...,...,...,...,...,...,...
12840,Zimbabwe,2009,5.519,12527000.0,1.514130e+10,21.377250,1208.692995
12841,Zimbabwe,2010,7.707,12698000.0,1.660410e+10,21.986250,1307.615340
12842,Zimbabwe,2011,9.498,12894000.0,1.830726e+10,21.602417,1419.827859
12843,Zimbabwe,2012,7.695,13115000.0,2.048226e+10,21.521333,1561.743118


In [23]:
# create new column for 'co2 per capita'
co2_temp['co2_per_capita'] = (co2_temp['co2'] / co2_temp['population']) * 10000000
co2_temp

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita,co2_per_capita
0,Afghanistan,1950,0.084,7752000.0,1.949480e+10,13.043500,2514.808999,0.108359
1,Afghanistan,1951,0.092,7840000.0,2.006385e+10,13.967750,2559.164343,0.117347
2,Afghanistan,1952,0.092,7936000.0,2.074235e+10,14.175417,2613.703484,0.115927
3,Afghanistan,1953,0.106,8040000.0,2.201546e+10,14.650750,2738.241719,0.131841
4,Afghanistan,1954,0.106,8151000.0,2.248333e+10,13.691333,2758.352230,0.130045
...,...,...,...,...,...,...,...,...
12840,Zimbabwe,2009,5.519,12527000.0,1.514130e+10,21.377250,1208.692995,4.405684
12841,Zimbabwe,2010,7.707,12698000.0,1.660410e+10,21.986250,1307.615340,6.069460
12842,Zimbabwe,2011,9.498,12894000.0,1.830726e+10,21.602417,1419.827859,7.366217
12843,Zimbabwe,2012,7.695,13115000.0,2.048226e+10,21.521333,1561.743118,5.867327


To determine a baseline for whether a country is a disproportionate polluter, we broke out the data for 2016, the most recent year in the dataset.  We exported it as a csv to get a full look at the data and pick a baseline for "bad" ratio of gdp to co2 per capita.

In [25]:
co2_temp.loc[co2_temp['year'] == 2016]

Unnamed: 0,country,year,co2,population,gdp,AverageTemperature,gdp_per_capita,co2_per_capita


In [41]:
#export new combined dataframe as csv
# co2_temp.to_csv("GlobalCo2TempByCountry.csv", index=False)