In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
pd.set_option('display.max_rows', 80)

In [2]:
init_notebook_mode(connected=True)

In [3]:
df = pd.read_csv('output/dataset_worldbank_imputed.csv')
df

Unnamed: 0,Country Name,Year,CO2 emissions (metric tons per capita),CO2 emissions (kt),Urban population (% of total population),Population growth (annual %),Total greenhouse gas emissions (kt of CO2 equivalent),"Other greenhouse gas emissions, HFC, PFC and SF6 (thousand metric tons of CO2 equivalent)",Methane emissions (kt of CO2 equivalent),CO2 intensity (kg per kg of oil equivalent energy use),Energy use (kg of oil equivalent per capita)
0,Aruba,1980,174.694594,10498.621000,50.472,0.208214,424.783771,114.675152,12.675500,9.887823,67210.520275
1,Aruba,1981,165.121266,9999.909000,50.456,0.769120,519.001013,191.589042,12.937000,9.381611,63546.834639
2,Aruba,1982,182.270961,11180.683000,50.441,1.279734,564.023574,204.145519,13.182000,10.050591,70173.108043
3,Aruba,1983,92.363156,5746.189000,50.426,1.411552,530.385687,239.228605,13.362200,6.096364,35566.572603
4,Aruba,1984,228.392242,14348.971000,50.411,0.980502,571.038292,246.726780,13.648800,12.098059,87957.121196
...,...,...,...,...,...,...,...,...,...,...,...
10450,Zimbabwe,2016,0.783303,10990.000000,32.296,1.549294,29120.000000,1587.372070,11380.000000,1.758855,638.573764
10451,Zimbabwe,2017,0.718570,10230.000000,32.237,1.459406,28800.000000,1266.889815,11560.000000,1.763322,626.070626
10452,Zimbabwe,2018,0.849793,12270.000000,32.209,1.410382,31380.000000,1394.553692,11850.000000,1.770196,689.716154
10453,Zimbabwe,2019,1.044350,22202.653262,32.210,1.421142,83902.692361,23047.297739,25674.773596,1.750212,785.180953


# Data Preprocessing

In [14]:
redundant_year = [2019,2020]
df = df[df['Year'].isin(redundant_year) == False].copy()

In [15]:
df['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018], dtype=int64)

In [16]:
countryList = list(df['Country Name'].unique())
countryList.sort()

In [17]:
countryList

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Republic',
 'Central Europe and the Baltics',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Early-demographic dividend',
 'East Asia & Pacific',
 'East 

In [18]:
redundant_country = [ 
'Africa Eastern and Southern',
'Africa Western and Central',
'Arab World',
'East Asia & Pacific',
'East Asia & Pacific (IDA & IBRD countries)',
'East Asia & Pacific (excluding high income)',
'Euro area',
'Europe & Central Asia',
'Europe & Central Asia (IDA & IBRD countries)',
'Europe & Central Asia (excluding high income)',
'European Union',
'Heavily indebted poor countries (HIPC)',
'High income',
'IBRD only',
'IDA & IBRD total',
'IDA blend',
'IDA only',
'IDA total',
'Late-demographic dividend',
'Latin America & Caribbean',
'Latin America & Caribbean (excluding high income)',
'Latin America & the Caribbean (IDA & IBRD countries)'
'Low & middle income',
'Low income',
'Lower middle income',
'Middle East & North Africa',
'Middle East & North Africa (IDA & IBRD countries)',
'Middle East & North Africa (excluding high income)',
'Middle income',
'OECD members',
'Other small states',
'Pacific island small states',
'Small states',
'South Africa',
'South Asia',
'South Asia (IDA & IBRD)',
'Sub-Saharan Africa (IDA & IBRD countries)',
'Sub-Saharan Africa (excluding high income)',
'World',
'Upper middle income',
'Post-demographic dividend',
'Early-demographic dividend',
'Latin America & the Caribbean (IDA & IBRD countries)',
'Low & middle income',
'Central Europe and the Baltics',
'Sub-Saharan Africa',
'Fragile and conflict affected situations',
'North America',
'Central African Republic']

In [19]:
len(redundant_country)

48

In [20]:
df2 = df.copy()

In [21]:
df2 = df[df['Country Name'].isin(redundant_country) == False].copy()

In [22]:
df2['Country Name'].nunique()

208

In [23]:
df3 = df.copy()

In [24]:
important_country = ['High income','Middle income','Low income']

In [25]:
df3 = df3[df3['Country Name'].isin(important_country)==True]

# Exploratory Data Analysis

### What is the trends of CO2 emissions (metric tons per capita) of world?

In [26]:
fig = px.line(df[df['Country Name']=='World'], x="Year", y="CO2 emissions (metric tons per capita)", title='Trend of co2 emissions (metric tons per capital) of world')
fig.show()

### Top 10 Countries in contribution of co2 emmision (metric tons per capita)

In [27]:
temp = df2.groupby('Country Name')['CO2 emissions (metric tons per capita)'].sum()

In [28]:
temp.sort_values().tail(10)

Country Name
Brunei Darussalam        607.212679
Canada                   628.923482
Australia                642.583739
United States            723.737853
Kuwait                   801.615060
Luxembourg               882.480000
Bahrain                  892.873727
United Arab Emirates    1058.946169
Aruba                   1217.097694
Qatar                   1412.502327
Name: CO2 emissions (metric tons per capita), dtype: float64

In [29]:
temp.sort_values().head(10)

Country Name
Burundi             1.474659
Congo, Dem. Rep.    2.246694
Chad                2.369183
Ethiopia            2.707517
Uganda              2.884469
Somalia             3.021113
Rwanda              3.029400
Malawi              3.120794
Niger               3.390579
Burkina Faso        3.991881
Name: CO2 emissions (metric tons per capita), dtype: float64

In [30]:
fig = px.bar(temp.sort_values().head(10),title='Least 10 country in co2 emmision (metric tons per capital)')
fig.show()

In [31]:
fig = px.bar(temp.sort_values().tail(10),title='Top 10 country in co2 emmision (metric tons per capital)')
fig.show()

### Does country income influence the co2 emmision? (metric tons per capital)

In [32]:
df3['Country Name'].unique()

array(['High income', 'Low income', 'Middle income'], dtype=object)

In [67]:
fig = px.bar(df3.sort_values('CO2 emissions (metric tons per capita)'),x='Country Name',y='CO2 emissions (metric tons per capita)',title='Total CO2 emissions (metric tons per capita) based on countries income ')
fig.show()

### What is the correlation between population and co2 emmision (metric tons per capita)

In [56]:
redundant_country.append('Ariba')

In [57]:
fig = px.scatter(df[df['Country Name'].isin(redundant_country) == False], x='Population growth (annual %)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='Correlation between Population growth and co2 emissions (metric tons per capita)',color='Country Name')
fig.show()
redundant_country.remove('Ariba')

In [61]:
#redundant_country.append('Ariba')
fig = px.scatter(df[df['Country Name'].isin(redundant_country) == False], x='Urban population (% of total population)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='Correlation between urban population and co2 emissions (metric tons per capita)',color='Country Name')
fig.show()
#redundant_country.remove('Ariba')

In [38]:
fig = px.scatter(df[df['Country Name']=='World'], x='Urban population (% of total population)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='[World only]Correlation between urban population and co2 emissions (metric tons per capita)')
fig.show()

In [39]:
fig = px.scatter(df[df['Country Name']=='World'], x='Population growth (annual %)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='[World only]Correlation between population growth and co2 emissions (metric tons per capita)')
fig.show()

In [40]:
fig = px.scatter(df[df['Country Name']=='China'], x='Population growth (annual %)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='[China only]Correlation between population growth and co2 emissions (metric tons per capita)')
fig.show()

### What is the correlation between Energy use and co2 emmision (metric tons per capita)

In [62]:
fig = px.scatter(df[df['Country Name'].isin(redundant_country) == False], x='Energy use (kg of oil equivalent per capita)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='Correlation between energy use and co2 emissions (metric tons per capita)',color='Country Name')
fig.show()

In [63]:
redundant_country.append('Aruba')
redundant_country.append('China')
redundant_country.append('United States')
redundant_country.append('India')
redundant_country.append('Russian Federation')
fig = px.scatter(df[df['Country Name'].isin(redundant_country) == False], x='Energy use (kg of oil equivalent per capita)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='Correlation between energy use and co2 emissions (metric tons per capita)',color='Country Name')
fig.show()
redundant_country.remove('Aruba')
redundant_country.remove('China')
redundant_country.remove('United States')
redundant_country.remove('India')
redundant_country.remove('Russian Federation')

In [53]:
fig = px.scatter(df[df['Country Name']=='World'], x='Energy use (kg of oil equivalent per capita)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='[World only]Correlation between energy use and co2 emissions (metric tons per capita)')
fig.show()

In [54]:
fig = px.scatter(df[df['Country Name']=='China'], x='Energy use (kg of oil equivalent per capita)', y='CO2 emissions (metric tons per capita)',trendline="ols",title='[China only]Correlation between energy use and co2 emissions (metric tons per capita)')
fig.show()