In [1]:
import pandas as pd 
import scipy.stats as st
import matplotlib.pyplot as plt 
import numpy as np
from matplotlib.pyplot import figure 
from scipy.stats import linregress
from scipy.optimize import curve_fit
import math

## Question 1 (Heesu) 
### Is there a correlation between yearly global average temperature and yearly average global CO2 level ?
* Heesu's data cleanup

### Findings: 
* From 1959 to 2015, both CO2 levels and the average global temperature have increased.
* There is a strong positive correlation between CO2 levels and average global temperatures: as CO2 levels increase, average global temperatures also increase.
* There is an outlier of 16.06 degrees recorded in 2015.

In [2]:
global_temps = pd.read_csv("data_sets/GlobalTemperatures_kaggle.csv")
global_temps.head()

FileNotFoundError: [Errno 2] File data_sets/GlobalTemperatures_kaggle.csv does not exist: 'data_sets/GlobalTemperatures_kaggle.csv'

In [None]:
#finding out data type
np.dtype(global_temps["dt"])

In [None]:
#removing rows with NaN values
global_temps.dropna(how = "any", inplace = True)
global_temps.head()

In [None]:
# retrieving first value in list of values separated by hyphen 
splited_series = global_temps['dt'].str.split('-').str[0]
splited_series

In [None]:
#adding as new series to original dataframe 
global_temps["year"] = splited_series.astype("int64")

In [None]:
#checking if new column is added
global_temps.head()

In [None]:
#checking data type
# (statement must start at column 0; a stray leading space raises IndentationError)
print(np.dtype(global_temps["year"]))

In [None]:
# Yearly average of the monthly land-and-ocean temperature, for every year present in the data.
# mean() divides by the number of months actually present in each year, so a year that has
# fewer than 12 rows is not under-reported the way a fixed sum()/12 would under-report it.
average_temperatures = global_temps.groupby("year")["LandAndOceanAverageTemperature"].mean()
average_temperatures

In [None]:
#creating dataframe
ave_temps_year = pd.DataFrame(average_temperatures)
ave_temps_year.head()

In [None]:
#reading in excel of year and mean co2 emissions
mean_co2_emission = pd.read_excel("data_sets/global_mean_CO2_emissions_year.xlsx")
mean_co2_emission.head()

In [None]:
# Join yearly temperatures with yearly CO2 on "year" (outer join), then discard
# every row that has a missing value in any column.
merged_df = (
    pd.merge(ave_temps_year, mean_co2_emission, how = "outer", on = "year")
    .dropna(how = "any")
)
merged_df.head()

## Scatter plot and Line of regression

### Is there a relationship between average global temperature and mean global CO2 levels?
* There is a strong positive correlation between CO2 levels and average global temperatures: as CO2 levels increase, average global temperatures also increase.
* There is an outlier of 16.06 degrees recorded in 2015.

In [None]:
# Is there a relationship between co2 levels/average temp and time?
#mean CO2 emissions vs year
plt.figure(figsize=(20, 10))
plt.xlabel("Year",fontsize=15)
plt.ylabel("Mean Global CO2 Level by Year (parts per million)",fontsize=15)
plt.title("Mean Global CO2 Level by Year (parts per million) vs Year (1959 to 2015)",fontsize=20)
plt.plot(merged_df["year"], merged_df["yearly mean co2 emission (ppm)"])

plt.savefig("boxplot_scatter_gmaps_images/plot_co2_vs_year_heesu.png")

plt.show()

* Mean CO2 levels have increased over time between 1959 to 2015

In [None]:
#ave temp vs year
plt.figure(figsize=(20, 10))
plt.xlabel("Year",fontsize=15)
plt.ylabel("Average Global Land and Ocean Temperature (C)",fontsize=15)
plt.title("Average Global Land and Ocean Temperature (C) vs Year (1959 to 2015)",fontsize=20)
plt.plot(merged_df["year"], merged_df["LandAndOceanAverageTemperature"])

plt.savefig("boxplot_scatter_gmaps_images/plot_avetemp_vs_year_heesu.png")

plt.show()

* Average global temperature has increased from 1959 to 2015 

In [None]:
#creating x and y values
x_values = merged_df["yearly mean co2 emission (ppm)"]
y_values = merged_df["LandAndOceanAverageTemperature"]


#scatter plot configuration
plt.figure(figsize=(20, 10))
plt.scatter(x_values, y_values)
plt.xlabel("Mean Global CO2 Level by Year (parts per million)",fontsize=15)
plt.ylabel("Average Global Land and Ocean Temperature (C)",fontsize=15)
plt.title("Average Global Land and Ocean Temperatures (C)  vs  Mean Global CO2 Level by Year (parts per million)",fontsize=20)

#plotting line of regression 
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
y_values_line = slope*x_values + intercept

line_eq = f"linear regression = {round(slope,2)}x + {round(intercept,2)}"

plt.annotate(line_eq,(360,15.2),fontsize=20,color="red")

plt.plot(x_values, y_values_line, color = "red")

plt.savefig("boxplot_scatter_gmaps_images/average_temp_co2_scatter_heesu.png")

plt.show()


In [None]:
#viewing correlation coefficients and p value
print(f"correlation coefficient {rvalue}")
print(f"p value {pvalue}")

* Strong positive correlation with a very small p-value. This suggests that the correlation is statistically significant.
* as co2 levels increase, global average temperatures also increase

### Are there any temperature outliers?
* There is an outlier of 16.06 degrees recorded in 2015.

In [None]:
#creating groupby object by year
avetemp_gb = global_temps.groupby("year")

#finding mean temperature for each of the years
max_ave_temp_series = avetemp_gb["LandAndOceanAverageTemperature"].mean()
#finding maximum mean temperature
max_average_mean = round(max_ave_temp_series.max(),2)
# year (group label) in which that maximum occurs, taken from the data rather than hard-coded
max_average_year = max_ave_temp_series.idxmax()


# quartile calculations for average land and ocean temperatures by year
quartiles = max_ave_temp_series.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of temperatures is: {lowerq}")
print("")
print(f"The upper quartile of temperatures is: {upperq}")
print("")
print(f"The interquartile range of temperatures is: {iqr}")
print("")
print(f"The the median of temperatures is: {quartiles[0.5]} ")
print("")

# 1.5 * IQR fences
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print("")
print(f"Values above {upper_bound} could be outliers.")
print("")
# Only report the maximum as an outlier when it really lies above the upper fence.
if max_ave_temp_series.max() > upper_bound:
    print(f"Average mean temperature of {max_average_mean} degrees celsius is an outlier and was recorded in {max_average_year}")
else:
    print(f"Highest average mean temperature of {max_average_mean} degrees celsius (recorded in {max_average_year}) is not an outlier")

In [None]:
#turning into list
mean_ave_temp_list = list(max_ave_temp_series)

#boxplot of land and ocean mean temperatures for each year
plt.figure(figsize=(20, 10))
plt.boxplot(mean_ave_temp_list)
plt.title("Mean land and ocean temperatures for each year (1850 to 2015) ",fontsize=20)
plt.ylabel("Temperature (C)",fontsize=15)

plt.savefig("boxplot_scatter_gmaps_images/mean_temp_boxplot_heesu.png")

plt.show()

In [None]:
#finding which year the outlier temp occured
merged_df.loc[merged_df["LandAndOceanAverageTemperature"] == (merged_df["LandAndOceanAverageTemperature"].max()),:]

## Question 2 (Anne)
### Is there a relationship between a Region’s average temperature and its CO2 level?



In [None]:
# Read in the World Bank climate-change workbook
file_basic_path="data_sets/"
# Open the workbook once and parse the "Data" sheet from that handle.
newFile = pd.ExcelFile(file_basic_path + "climate_change_data.xls")
climchang_df = pd.read_excel(newFile, sheet_name="Data")

In [None]:
# Create df
# Keep only the per-capita CO2 series, then remove the metadata columns and the 2009-2011 columns.
is_co2_series = climchang_df['Series name'] == 'CO2 emissions per capita (metric tons)'
unwanted_columns = ['Country code', 'Series code', 'Series name', 'SCALE', 'Decimals', 2009, 2010, 2011]
co2_df = climchang_df[is_co2_series].drop(unwanted_columns, axis=1)
co2_df

In [None]:
#change column name
co2_df.rename(columns = {"Country name":"Country"}, inplace=True)
co2_df

In [None]:
for x in range(1990,2009):
    co2_df[x]=pd.to_numeric(co2_df[x], errors='coerce')
co2_df.dropna(inplace=True)
co2_df

In [None]:
# SET UP TEMPERATURE DATAFRAME

# Import data from dataset temperatures by country
df_temp = pd.read_csv("data_sets/GlobalLandTemperaturesByCountry.csv")
df_temp.head()

In [None]:
np.dtype(df_temp["dt"])

In [None]:
# Drop NaN values
df_temp.dropna(how = "any", inplace = True)
df_temp

In [None]:
# Retrieving first value in list of values separated by hyphen
splited_series = df_temp['dt'].str.split('-').str[0]
splited_series

In [None]:
# Adding as new series and check whether new column for year is added
df_temp["Year"] = splited_series.astype("int64")
df_temp.head()

In [None]:
#check data type
print(np.dtype(df_temp["Year"]))

In [None]:
# Drop dt column
df_temp.drop(["dt"], axis=1, inplace=True)
df_temp.head()

In [None]:
# Drop Uncertainty column
df_temp.drop(["AverageTemperatureUncertainty"], axis=1, inplace=True)
df_temp.head()

In [None]:
df_temp=df_temp[(df_temp["Year"]>=1990) & (df_temp["Year"]<2009)]
df_temp

In [None]:
# Print df for 1990-2008
# One row per country, one column per year, each cell the mean of that year's temperature readings.
# The string "mean" selects pandas' own aggregation (passing np.mean is deprecated in recent pandas).
pivot = df_temp.pivot_table(index="Country", columns="Year", values="AverageTemperature", aggfunc="mean")
pivot

In [None]:
# merge co2 and temp DFs
df = pivot.merge(co2_df, how="inner", on="Country", suffixes=("_temp","_CO2"))
df.head()

In [None]:
# Add column for region and check whether column has been added
df["Region"] = ""

# Create DF with data regions
region_df = pd.read_excel("data_sets/climate_change_data.xls", sheet_name="Country")

# rename column to make suitable for merge
region_df.rename(columns = {"Country name": "Country"}, inplace=True)

In [None]:
# Populate Region column through a merge and rename column
# A single inner merge on "Country"; because both frames carry a "Region" column the result
# has "Region_x" (from df) and "Region_y" (from region_df). Only the latter is kept.
df_final = df.merge(region_df, how="inner", on="Country")
df_final = (
    df_final
    .drop(["Region_x", "Country code", "Capital city", "Income group", "Lending category"], axis=1)
    .rename(columns = {"Region_y": "Region"})
)
df_final.head()

In [None]:
# Clean NAN value from merged dataframe 
df_final.dropna(inplace=True)

# Clean druplicated coutnry from merged dataframe 
df_final.drop_duplicates(subset=['Country'],inplace=True)
df_final.dtypes

In [None]:
# Transfer "Region" column to a list of regions
region=df_final["Region"].unique()
# Transfer "Coutnry" column to a list of regions
country=df_final["Country"].unique()
# Create a year list
year = [x for x in range(1990,2009)]
#print(region)
#print(country)
#print(year)

In [None]:
# Create two dictionaries of CO2 and Life expectancy.
# Keys are country names and values are list of CO2/Life expectancy data from different years
co2_dict={}
temp_dict={}

In [None]:
# Walk the merged table row by row and store, per country, the list of
# yearly CO2 values and the list of yearly temperature values (1990 to 2008).
for index, row in df.iterrows():
    country_name = row["Country"]
    co2_dict[country_name] = [row[f"{y}_CO2"] for y in year]
    temp_dict[country_name] = [row[f"{y}_temp"] for y in year]

#co2_dict

In [None]:
# Flatten the per-country lists into two long lists of CO2 and temperature values
co2=[]
temp=[]
for name in co2_dict:
    co2.extend(co2_dict[name])
    temp.extend(temp_dict[name])

# One scatter call per country, so each country gets its own colour
for key in co2_dict:
    plt.scatter(co2_dict[key], temp_dict[key], alpha=0.5)

#plt.scatter(co2, lf, alpha=0.5, facecolors='none', edgecolors='blue')

# Set up the figure
plt.xlim(-5,80)
plt.ylim(-25,35)
plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel("Average temperature in celcius")
plt.title("CO2 levels vs average temperature")
plt.savefig("Question 2 plots/CO2_vs_avg_temp_all_regions.png")
plt.show()

In [None]:
# print list of regions
list(df_final["Region"].unique())

In [None]:
# Create DFs by region
sa = df_final[df_final["Region"]=="South Asia"]
eca = df_final[df_final["Region"]=="Europe & Central Asia"]
mena = df_final[df_final["Region"]=="Middle East & North Africa"]
eap = df_final[df_final["Region"]=="East Asia & Pacific"]
ssa = df_final[df_final["Region"]=="Sub-Saharan Africa"]
lac = df_final[df_final["Region"]=="Latin America & Caribbean"]
na = df_final[df_final["Region"]=="North America"]

In [None]:
# Charts South Asia
sa

In [None]:
sa_sum = sa.describe()
sa_sum

In [None]:
# Yearly regional means for South Asia, read from the "mean" row of the describe() summary.
# The row is selected by label: positional row 5 of describe() is the median ("50%"), not the mean.
co2_mean = [sa_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [sa_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- South Asia")
# keep the original 0-1.5 window, widening it only if the mean series would otherwise be clipped
ax1.set_ylim([0, max(1.5, 1.1 * max(co2_mean))])
ax1.legend(loc="best", fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - South Asia")
ax2.tick_params(axis='y')

ax2.legend(loc="best",fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_SA.png")
plt.show()

# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the SA region, both CO2 emissions and temperature have increased over the years.\nThe increase in temperature has had dips and peaks but trending upwards, simultaneously with increase in CO2 emissions.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(sa["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in sa.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # Create scatter plot (colours represent country within region)
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)

# Set up the figure
#plt.xlim(-5,80)
plt.ylim(10,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - South Asia')
plt.savefig('Question 2 plots/temp_co2_SA.png')


print(f"Each of the colours represents a country within the region. There is a slight positive correlation between CO2 emissions and average temperature for the majority of the countries." )

In [None]:
# Europe & Central Asia
eca

In [None]:
eca_sum = eca.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [eca_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [eca_sum.loc["mean", f"{y}_temp"] for y in year]


fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission per capita (metric tonnes)")
# keep the original 0-6 window, widening it only if the mean series would otherwise be clipped
ax1.set_ylim([0, max(6, 1.1 * max(co2_mean))])
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- Europe & Central Asia")
ax1.legend(loc="upper left", fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature in celcius')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - Europe & Central Asia")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right", fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_ECA.png")
plt.show()

# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the ECA region, both temperature and CO2 emissions have decreased over the years.\nTemperature increases/decreases when CO2 emissions increase/decrease. This is as expected, as it will take time for a change in CO2 emissions to show its effects in change in temperature.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(eca["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in eca.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # create scatterplot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)

# Set up the figure
#plt.xlim(-5,80)
plt.ylim(-25,25)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - Europe & Central Asia')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_ECA.png')

print(f"Each of the colours represents a country within the region. There is a varying correlation (from positive to negative or no correlation at all) between CO2 emissions and average temperature for the countries within this region." )

In [None]:
#print df MENA
mena

In [None]:

# Middle East & North Africa
mena_sum = mena.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [mena_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [mena_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- Middle East & North Africa")
ax1.legend(loc="upper left", fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - Middle East & North Africa")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right",fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_MENA.png")
plt.show()
# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the MENA region, both temperature and CO2 levels have increased over the years (with dips but trending upwards).\nTemperature increases (and decreases) when CO2 emissions increase (decrease). This is as expected, as it will take time for a change in CO2 emissions to show its effects in change in temperature.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(mena["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in mena.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # create scatter plot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)

# Set up the figure
#plt.xlim(-5,80)
plt.ylim(0,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - Middle East & North Africa')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_MENA.png')

print(f"Each of the colours represents a country within the region. There is a varying correlation (from positive to negative or no correlation at all) between CO2 emissions and average temperature for the countries within this region." )

In [None]:
# chart EAP
eap

In [None]:
# East Asia & Pacific
eap_sum = eap.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [eap_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [eap_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- East Asia & Pacific")
ax1.legend(loc="upper left", fontsize='small')

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - East Asia & Pacific")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right",fontsize='small')

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_EAP.png")
plt.show()

# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the EAP region, with dips and peaks, temperature and CO2 emissions tend to have increased over the years.\nTemperature increases when CO2 emissions increase. This is as expected, as it will take time for a change in CO2 emissions to show its effects in change in temperature.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(eap["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in eap.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # Create scatter plot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)
# Set up the figure
#plt.xlim(-5,80)
plt.ylim(-25,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - East Asia & Pacific')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_EAP.png')

print(f"Each of the colours represents a country within the region. There is a varying correlation (mostly from no to light to moderate positive) between CO2 emissions and average temperature for the countries within this region." )

In [None]:
# df ssa
ssa

In [None]:
#Sub-Saharan Africa
ssa_sum = ssa.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [ssa_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [ssa_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- Sub-Saharan Africa")
ax1.legend(loc="upper left",fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - Sub-Saharan Africa")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right", fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_SSA.png")
plt.show()
# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nBoth CO2 emissions and temperature have increased over the years.\nIncrease in CO2 emissions seem to result in high peaks of increase in temperature.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(ssa["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in ssa.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # create scatter plot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)

# Set up the figure
#plt.xlim(-5,80)
plt.ylim(-25,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - Sub-Saharan Africa')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_SSA.png')

print(f"Each of the colours represents a country within the region. There is a varying correlation (from positive to negative or no correlation at all) between CO2 emissions and average temperature for the countries within this region." )

In [None]:
# df lac
lac

In [None]:
#Latin America & Caribbean
# The summary goes into its own variable; `lac` itself must stay the per-country table,
# because the following cells read lac["Country"] and iterate over its rows.
lac_sum = lac.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [lac_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [lac_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- Latin America & Caribbean")
ax1.legend(loc="upper left", fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - Latin America & Caribbean")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right", fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_LAC.png")
plt.show()
# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the LAC region, temperature has increased over the years whereas CO2 emissions have decreased.\nThere does not seem to be a relationship between these two variables.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(lac["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per row of `lac`, so each row gets its own colour.
# NOTE(review): this assumes `lac` is still the per-country table when the cell runs - confirm
# that no earlier cell has replaced it with a describe() summary.
for index, row in lac.iterrows():
    # yearly values (1990 to 2008) for this row
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # create scatter plot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is the CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)

# Set up the figure
#plt.xlim(-5,80)
plt.ylim(-25,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - Latin America & Caribbean')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_lac.png')

print(f"Each of the colours represents a country within the region. There is a very moderate positive or negative correlation for some of the countries, or no correlation at all between CO2 emissions and average temperature for the countries within this region." )

In [None]:
# df na
na

In [None]:
#North America
na_sum = na.describe(include='all')
# Yearly regional means, read from the "mean" row of the summary.
# The row is selected by label: with include='all', positional row 5 is the standard deviation.
co2_mean = [na_sum.loc["mean", f"{y}_CO2"] for y in year]
temp_mean = [na_sum.loc["mean", f"{y}_temp"] for y in year]

fig, ax1 = plt.subplots()

ax1.set_xlabel("Year")
ax1.set_ylabel("CO2 emission")
ax1.plot(year,co2_mean,label="CO2 emission per capita (metric tons)- North America")
ax1.legend(loc="upper left", fontsize="small")

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Average temperature')  # we already handled the x-label with ax1
ax2.plot(year, temp_mean,color="r",label="Average temperature in celcius - North America")
ax2.tick_params(axis='y')

ax2.legend(loc="lower right", fontsize="small")

fig.tight_layout()  # otherwise the right y-label is slightly clipped

plt.savefig("Question 2 plots/CO2_vs_avg_temp_NA.png")
plt.show()
# NOTE(review): re-check this narrative against the plot now that the true means are drawn.
print(f"Trend Analysis\nIn the NA region, there is no clear upward or downward trend in increase in temperature and CO2 emissions.\nIncrease in CO2 emissions seems to be followed by increase in temperature. This is as expected. From 2000 onwards, temperature has increased whereas CO2 emissions have decreased.")

In [None]:
# create list for unique countries in region
def unique_countries(country):
    """Return the distinct items of ``country`` in first-seen order.

    country: an iterable of country names (list, pandas Series, ...).
    """
    unique = []
    for name in country:
        if name not in unique:
            unique.append(name)
    # return only after the whole input has been scanned
    return unique

countries = list(na["Country"]) #adding countries to list
print(unique_countries(countries))

In [None]:
# scatter plot
# One scatter call per country row, so each country in the region gets its own colour.
for index, row in na.iterrows():
    # yearly values (1990 to 2008) for this country
    co2_tmp = [row[f"{y}_CO2"] for y in year]
    temp_tmp = [row[f"{y}_temp"] for y in year]
    # create scatter plot
    plt.scatter(co2_tmp, temp_tmp, alpha=0.5)
    # 2x2 correlation matrix; the off-diagonal entry is this country's CO2/temperature correlation
    r=np.corrcoef(co2_tmp, temp_tmp)
    print(r)


# Set up the figure
#plt.xlim(-5,80)
plt.ylim(-25,35)

plt.xlabel("CO2 emissions per capita (metric tons)")
plt.ylabel('Average temperature in celcius')
# the title has to be set before saving, otherwise the exported PNG has no title
plt.title('Average temperature in celsius vs CO2 emissions - North America')
plt.savefig('Question 2 plots/Scatterplot_temp_co2_na.png')

print(f"Each of the colours represents a country within the region. There is a light positive and light negative correlation between CO2 emissions and average temperature for the countries within this region." )

## Question 5 (Zheng)
### Is there a correlation between a country’s yearly average CO2 levels and life expectancy? Does a country’s CO2 level affect the life expectancy of a population?

### Findings: 
* Globally, life expectancy increases as CO2 emission per capita increases.
* The relationship between life expectancy and CO2 is exponential.
* The R squared value of the proposed predicting model is about 0.59.
* This means that the relationship between CO2 and life expectancy may not be as close as we expected.

In [None]:
# Read Life Expectancy data from csv file
lf_df = pd.read_csv("data_sets/Lifeexpectancy.csv")

# Remove the 2016-2020 columns in one call so the table ends at 2015 for later comparison
years_after_2015 = [f"{yr}" for yr in range(2016,2021)]
lf_df = lf_df.drop(columns=years_after_2015)
lf_df.head()

In [None]:
# Read CO2 data from csv file
co2_df = pd.read_csv("data_sets/CO2emission_percapita.csv")

# Remove the 2016-2020 columns in one call so the table ends at 2015 for later comparison
years_after_2015 = [f"{yr}" for yr in range(2016,2021)]
co2_df = co2_df.drop(columns=years_after_2015)

# Preview CO2 emission dataframe
co2_df.head()

In [None]:
# Merge life expectancy and CO2 dataframes with the key "Country Name"
co2_lf_df=lf_df.merge(co2_df,how="inner",on="Country Name", suffixes=('_lf', '_co2'))
# Preview data
co2_lf_df.head()

In [None]:
# Remove rows with missing values, then keep one row per country
co2_lf_df = co2_lf_df.dropna().drop_duplicates(subset=['Country Name'])

# Keep only the rows whose CO2 and life-expectancy values are non-negative in every year 1960-2015
value_columns = [f"{y}{suffix}" for y in range(1960,2016) for suffix in ("_co2", "_lf")]
all_non_negative = (co2_lf_df[value_columns] >= 0).all(axis=1)
co2_lf_df = co2_lf_df[all_non_negative]

co2_lf_df.reset_index(drop=True).head()

In [None]:
# Collect the "Country Name" column of the cleaned frame into a Python list.
country = list(co2_lf_df["Country Name"])

In [None]:
# Two lookup tables, one for CO2 and one for life expectancy.
# Each maps a country name to that country's list of yearly values.
co2_dict, lf_dict = {}, {}

In [None]:
# Year columns to read for every country, in chronological order (1960-2015).
co2_cols = [f"{yr}_co2" for yr in range(1960, 2016)]
lf_cols = [f"{yr}_lf" for yr in range(1960, 2016)]

# Fill both dictionaries: country name -> list of that country's yearly values.
for name in country:
    # Build the row selector once per country instead of once per year and
    # per column, and select all year columns in a single .loc call.
    is_country = co2_lf_df["Country Name"] == name
    # [0] takes the first matching row for the country.
    co2_dict[name] = list(co2_lf_df.loc[is_country, co2_cols].to_numpy()[0])
    lf_dict[name] = list(co2_lf_df.loc[is_country, lf_cols].to_numpy()[0])

In [None]:
# Pool every country's yearly values into two flat lists (used later for
# curve fitting) and draw one scatter series per country along the way.
co2 = []
lf = []
for name in country:
    co2.extend(co2_dict[name])
    lf.extend(lf_dict[name])
    # One scatter call per country so each country gets its own colour.
    plt.scatter(co2_dict[name], lf_dict[name], alpha=0.5)

# Set up the figure
plt.xlim(-1, 31)
plt.xlabel("CO2 emission per capita")
plt.ylabel("Life expectancy")
plt.title("CO2 level vs life expectancy")
# Forward slash as separator: "\G" is not a valid escape sequence, and a
# backslash is only treated as a path separator on Windows.
plt.savefig("CO2_vs_LifeExpectancy_Zheng_Qi/Global_CO2_vs_LifeExpectancy.png")
plt.show()

* Globally, life expectancy increases as CO2 emissions per capita increase.
* The relationship between life expectancy and CO2 emissions per capita is exponential.

In [None]:
# Model used for curve fitting.
def func(x, a, b, c):
    """Evaluate the exponential model a * exp(-b * x) + c.

    Parameters:
        x: value(s) at which to evaluate the model.
        a: multiplier of the exponential term.
        b: rate inside the exponent (applied as -b * x).
        c: constant offset added to the exponential term.

    Returns:
        The model value(s) at x.
    """
    exponential_term = np.exp(-b * x)
    return a * exponential_term + c

# Work on float arrays so the residual arithmetic below is vectorised.
co2_obs = np.asarray(co2, dtype=float)
lf_obs = np.asarray(lf, dtype=float)

# Fit func to the pooled (CO2, life expectancy) observations.
popt, pcov = curve_fit(func, co2_obs, lf_obs)

# Sample the fitted curve on a regular grid for plotting.
x_values = np.arange(0.0, 50.0, 0.1)
y_values = func(x_values, *popt)

# Text of the fitted equation; "exp" is spelled out so the label matches func.
line_eq = f"y = {popt[0]:.2f}*exp({-popt[1]:.2f}x)+{popt[2]:.2f}"

# R squared = 1 - SS_res / SS_tot, each sum computed once over all points.
residuals = lf_obs - func(co2_obs, *popt)
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((lf_obs - np.mean(lf_obs)) ** 2)
r_squared = 1 - (ss_res / ss_tot)
print(f"The r squared value is {round(r_squared, 3)}")

# Observations as hollow blue markers, fitted curve in red, equation as text.
plt.scatter(co2, lf, alpha=0.5, facecolors='none', edgecolors='blue')
plt.plot(x_values, y_values, "r-")
plt.annotate(line_eq, (5, 40), fontsize=15, color="red")
plt.xlim(-1, 31)
plt.xlabel("CO2 emission per capita")
plt.ylabel("Life expectancy")
plt.title("CO2 level vs life expectancy")

# Forward slash as separator: "\C" is not a valid escape sequence, and a
# backslash is only treated as a path separator on Windows.
plt.savefig("CO2_vs_LifeExpectancy_Zheng_Qi/Curve_fitting_Global_CO2_vs_LifeExpectancy.png")
plt.show()

* The R squared value of the proposed prediction model is about 0.59.
* This means that the relationship between CO2 and life expectancy may not be as close as we expected.

### Is there a correlation between average CO2 emission vs average Life expectancy?
* The average values of CO2 and life expectancy show no obvious correlation.
* The relationship between CO2 and life expectancy should be examined regionally.

In [None]:
# Summary statistics of the cleaned frame; only its "mean" row is used below.
co2_lf_sum = co2_lf_df.describe()

# Create year list
year = list(range(1960, 2016))

# Mean across countries for each year, selected by the "mean" row label
# rather than by its position in the describe() output.
co2_mean = [co2_lf_sum.loc["mean", f"{yr}_co2"] for yr in year]
lf_mean = [co2_lf_sum.loc["mean", f"{yr}_lf"] for yr in year]

fig, ax1 = plt.subplots()

# Plot average CO2 emission
ax1.set_xlabel("year")
ax1.set_ylabel("CO2 emission per capita")
ax1.plot(year, co2_mean, label="CO2 emission per capita")
ax1.legend(loc="best")

# Plot average life expectancy
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Life expectancy')  # we already handled the x-label with ax1
ax2.plot(year, lf_mean, color="r", label="Life expectancy")
ax2.tick_params(axis='y')

ax2.legend(loc="best")
plt.title("Average CO2 level vs  life expectancy")
fig.tight_layout()  # otherwise the right y-label is slightly clipped
# Forward slash as separator: "\G" is not a valid escape sequence, and a
# backslash is only treated as a path separator on Windows.
plt.savefig("CO2_vs_LifeExpectancy_Zheng_Qi/Global_CO2_vs_LifeExpectancy_average_value.png")
plt.show()

* The average values of CO2 and life expectancy show no obvious correlation.
* The relationship between CO2 and life expectancy should be examined regionally.

### Is there an effect of CO2 emission on Life expectancy in different countries?
* Developed countries: The CO2 emissions in developed countries are higher. CO2 emission change has little effect on life expectancy.
* Developing countries: The CO2 emissions in developing countries are lower. Life expectancy increases as CO2 emission increases in developing countries.

### Developed countries

In [None]:
# Developed countries list
Developed_Country = ["Norway", "Australia", "Denmark"]
fig, ax1 = plt.subplots()

ax1.set_xlabel("year")
ax1.set_ylabel("CO2 emission per capita")
# Left axis: one solid CO2 line per country, drawn in list order so the
# legend entries below line up with the plotted lines.
for name in Developed_Country:
    ax1.plot(year, co2_dict[name])
ax1.tick_params(axis='y')
ax1.legend(Developed_Country, title='CO2 emission', loc='upper left')

# Right axis: one dashed life expectancy line per country.
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Life expectancy')  # we already handled the x-label with ax1
for name in Developed_Country:
    ax2.plot(year, lf_dict[name], linestyle='dashed')
ax2.tick_params(axis='y')
ax2.legend(Developed_Country, title='Life expectancy', loc="lower right")

plt.title("CO2 level vs  life expectancy in developed countries")
fig.tight_layout()  # otherwise the right y-label is slightly clipped
# Forward slash as separator: "\C" is not a valid escape sequence, and a
# backslash is only treated as a path separator on Windows.
plt.savefig("CO2_vs_LifeExpectancy_Zheng_Qi/CO2_vs_LifeExpectancy_developed_countries.png")
plt.show()

* The CO2 emissions in developed countries are higher.
* CO2 emission change has little effect on life expectancy.

### Developing countries

In [None]:
# Developing countries list
Developing_Country = ["Brazil", "China", "Iraq"]
fig, ax1 = plt.subplots()

ax1.set_xlabel("year")
ax1.set_ylabel("CO2 emission per capita")
# Left axis: one solid CO2 line per country, drawn in list order so the
# legend entries below line up with the plotted lines.
for name in Developing_Country:
    ax1.plot(year, co2_dict[name])

ax1.legend(Developing_Country, title='CO2 emission', loc='upper left')

# Right axis: one dashed life expectancy line per country.
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
ax2.set_ylabel('Life expectancy')  # we already handled the x-label with ax1
for name in Developing_Country:
    ax2.plot(year, lf_dict[name], linestyle='dashed')
ax2.tick_params(axis='y')
ax2.legend(Developing_Country, title='Life expectancy', loc="lower right")

plt.title("CO2 level vs  life expectancy in developing countries")
fig.tight_layout()  # otherwise the right y-label is slightly clipped
# Forward slash as separator: "\C" is not a valid escape sequence, and a
# backslash is only treated as a path separator on Windows.
plt.savefig("CO2_vs_LifeExpectancy_Zheng_Qi/CO2_vs_LifeExpectancy_developing_countries.png")
plt.show()

* The CO2 emissions in developing countries are lower.
* Life expectancy increases as CO2 emission increases in developing countries.