In [10]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [11]:
df = pd.read_csv("GlobalLandTemperaturesByCountry+GHG.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,year,country_code,continent,Latitude,Longitude,GHG
0,0,1838-04-01,13.008,2.586,AFGHANISTAN,1838,AF,AS,33.0,65.0,0.0
1,1,1838-05-01,,,AFGHANISTAN,1838,AF,AS,33.0,65.0,0.0
2,2,1838-06-01,23.95,2.51,AFGHANISTAN,1838,AF,AS,33.0,65.0,0.0
3,3,1838-07-01,26.877,2.883,AFGHANISTAN,1838,AF,AS,33.0,65.0,0.0
4,4,1838-08-01,24.938,2.992,AFGHANISTAN,1838,AF,AS,33.0,65.0,0.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505976 entries, 0 to 505975
Data columns (total 11 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     505976 non-null  int64  
 1   dt                             505976 non-null  object 
 2   AverageTemperature             477890 non-null  float64
 3   AverageTemperatureUncertainty  477890 non-null  float64
 4   Country                        505976 non-null  object 
 5   year                           505976 non-null  int64  
 6   country_code                   505976 non-null  object 
 7   continent                      423813 non-null  object 
 8   Latitude                       505976 non-null  float64
 9   Longitude                      505976 non-null  float64
 10  GHG                            466425 non-null  float64
dtypes: float64(5), int64(2), object(4)
memory usage: 42.5+ MB


In [13]:
df = df.dropna()
df= df[df['GHG'] != 0]
df

Unnamed: 0.1,Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,year,country_code,continent,Latitude,Longitude,GHG
1329,1329,1949-01-01,-1.374,0.698,AFGHANISTAN,1949,AF,AS,33.0,65.0,14656.0
1330,1330,1949-02-01,3.089,0.370,AFGHANISTAN,1949,AF,AS,33.0,65.0,14656.0
1331,1331,1949-03-01,7.789,0.422,AFGHANISTAN,1949,AF,AS,33.0,65.0,14656.0
1332,1332,1949-04-01,15.152,0.456,AFGHANISTAN,1949,AF,AS,33.0,65.0,14656.0
1333,1333,1949-05-01,20.991,0.262,AFGHANISTAN,1949,AF,AS,33.0,65.0,14656.0
...,...,...,...,...,...,...,...,...,...,...,...
505970,505970,2013-04-01,21.142,0.495,ZIMBABWE,2013,ZW,AF,-20.0,30.0,658883535.0
505971,505971,2013-05-01,19.059,1.022,ZIMBABWE,2013,ZW,AF,-20.0,30.0,658883535.0
505972,505972,2013-06-01,17.613,0.473,ZIMBABWE,2013,ZW,AF,-20.0,30.0,658883535.0
505973,505973,2013-07-01,17.000,0.453,ZIMBABWE,2013,ZW,AF,-20.0,30.0,658883535.0


In [14]:
df = df.loc[(df['year'] == 1949)].groupby('Country')[['AverageTemperature', 'GHG']].mean().reset_index()

In [15]:
fig = px.scatter(df, x="AverageTemperature", y="GHG", color="Country", marginal_y="violin",
           marginal_x="box", trendline="ols", template="simple_white")
fig.show()

In [16]:
import plotly.graph_objs as go
import statsmodels.api as sm

col_name = str("AverageTemperature") + "(above Average)"
df[col_name] = (df[("AverageTemperature")] > df[("AverageTemperature")].mean())#.astype(int)

fig = px.scatter(df, x="AverageTemperature", y ="GHG", size=np.log(df["GHG"]), color=col_name, hover_name=df.index, log_x=False,marginal_x = "box",marginal_y = "box", template="simple_white", color_discrete_sequence=["#0d0887", "#9c179e"])

# linear regression
regline = sm.OLS(df["AverageTemperature"],sm.add_constant(np.log(df["GHG"]))).fit().fittedvalues

# add linear regression line for whole sample
fig.add_traces(go.Scatter(x=np.log(df["GHG"]), y=regline,
                          mode = 'lines',
                          marker_color='#fb9f3a',
                          name='OLS Trendline')
                          )

fig.show()

In [17]:
col_name = str("AverageTemperature") + ": above Average"
df[col_name] = (df[("AverageTemperature")] > df[("AverageTemperature")].mean())#.astype(int)

    
size = df["AverageTemperature"]
    
fig = px.scatter(df, x="AverageTemperature", y ="GHG", size=size, color=col_name,hover_name=df.index, log_x=False, trendline = "ols",  trendline_color_override="#bd3786",marginal_x = "box",marginal_y = "box", template="simple_white", color_discrete_sequence=px.colors.qualitative.G10)
fig.show()

In [18]:
def scatter_plot(df, x, y, size):
    '''insert x and y as string while x entails color coding (abv avg...)'''
    
    # feature engineering
    col_name = str(y) + " above avg"
    df[col_name] = (df[y] > df[y].mean()).astype(int)
    
    size = df[size] * 1/4
    
    # plotting
    fig = px.scatter(df, x=x, y =y, size=size, color_continuous_scale=px.colors.sequential.Plasma, color=col_name,hover_name=df.index, log_x=False, trendline = "ols",  trendline_color_override="#bd3786",marginal_x = "box",marginal_y = "box", template="simple_white")
    fig.show()

In [19]:
scatter_plot(df , "AverageTemperature", "GHG", "AverageTemperature")

In [20]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=df["Country"],
    y=np.log(df["GHG"]),
    name="GHG EMission",
    marker_color='#0d0887'
))
fig.add_trace(go.Bar(
    x=df["Country"],
    y=df["AverageTemperature"],
    name="Average Temperature",
    marker_color='#fdca26'
))