# Examine alternative data sources (findings may change the proposal)

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import statsmodels.formula.api as smf 

### Read in data

In [2]:
# Load the six raw CSV tables in one pass. The order of the paths below must
# stay aligned with the order of the names being assigned.
(
    happiness,
    fish,
    forest_area,
    planted_forest_area,
    surface_area,
    oil_per_cap,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/hapiscore_whr.csv',
        '../data/fisfod_cons_pc.csv',
        '../data/forest_area_sq_km.csv',
        '../data/planted_forest_area_ha.csv',
        '../data/surface_area_sq_km.csv',
        '../data/oil_consumption_per_cap.csv',
    )
)


## Happiness

In [3]:
# Reshape happiness from wide to long: `country` stays as the identifier,
# every other column header is stored under `year`, and the cell values go
# into `happiness_score`.
happiness_long = pd.melt(
    happiness,
    id_vars=['country'],
    var_name='year',
    value_name='happiness_score',
)


In [4]:

# Happiness score by year, coloured by country.
fig = px.scatter(
    happiness_long,
    x='year',
    y='happiness_score',
    color='country',
    template='plotly_dark',
)

# Start with every trace hidden; a trace is shown once its legend entry is
# clicked.
fig.update_traces(visible='legendonly')

fig.show()

## Fish consumption

In [5]:
# Reshape fish from wide to long: `country` stays as the identifier, every
# other column header is stored under `year`, and the cell values go into
# `pct_fish_consumption`.
fish_long = pd.melt(
    fish,
    id_vars=['country'],
    var_name='year',
    value_name='pct_fish_consumption',
)

# Fish consumption by year, coloured by country.
fig = px.scatter(
    fish_long,
    x='year',
    y='pct_fish_consumption',
    color='country',
    template='plotly_dark',
)

# Start with every trace hidden; a trace is shown once its legend entry is
# clicked.
fig.update_traces(visible='legendonly')

fig.show()

In [6]:
# Compare country coverage between fish_long and happiness_long before merging.
#
# The f-strings are double-quoted on the outside and single-quoted inside the
# replacement fields. Reusing the same quote character inside an f-string
# expression is only accepted from Python 3.12 on; on older interpreters the
# previous form was a SyntaxError. The printed text is unchanged.
fish_countries = fish_long['country'].unique()
happiness_countries = happiness_long['country'].unique()

# number of distinct countries in each table, printed as a (fish, happiness) tuple
print(f"Count of countries in each:\n\n {fish_long['country'].nunique(), happiness_long['country'].nunique()}\n\n")

# countries present in fish_long but missing from happiness_long
print(f"in fish_long, not in happiness_long:\n\n {np.setdiff1d(fish_countries, happiness_countries)}\n\n")

# countries present in happiness_long but missing from fish_long
print(f"in happiness_long, not in fish_long:\n\n {np.setdiff1d(happiness_countries, fish_countries)}\n\n")



Count of countries in each:

 (170, 164)


in fish_long, not in happiness_long:

 ['Antigua and Barbuda' 'Bahamas' 'Barbados' 'Brunei' 'Cape Verde'
 'Dominica' 'Fiji' 'Grenada' 'Guinea-Bissau' 'Kiribati' 'North Korea'
 'Samoa' 'Sao Tome and Principe' 'Solomon Islands' 'St. Kitts and Nevis'
 'St. Lucia' 'St. Vincent and the Grenadines' 'Timor-Leste' 'Vanuatu']


in happiness_long, not in fish_long:

 ['Bahrain' 'Bhutan' 'Burundi' 'Comoros' 'Congo, Dem. Rep.' 'Kosovo'
 'Libya' 'Palestine' 'Qatar' 'Singapore' 'Somalia' 'South Sudan' 'Syria']




## Fit a regression model

In [7]:
# Inner-join the two long tables on (country, year), then discard every row
# that still has a missing value in any column.
happy_fish_merged = (
    happiness_long
    .merge(fish_long, on=['country', 'year'], how='inner')
    .dropna()
)
happy_fish_merged


Unnamed: 0,country,year,happiness_score,pct_fish_consumption
6,Australia,2005,73.4,25.30
9,Belgium,2005,72.6,24.40
18,Brazil,2005,66.4,6.04
21,Canada,2005,74.2,23.60
32,Czech Republic,2005,64.4,10.00
...,...,...,...,...
2109,Vietnam,2018,53.0,37.30
2110,Yemen,2018,30.6,3.10
2111,South Africa,2018,48.8,6.35
2112,Zambia,2018,40.4,11.70


In [8]:
# Ordinary least squares: happiness score regressed on the natural log of
# fish consumption, fitted on the merged country/year table.
fish_formula = 'happiness_score ~ np.log(pct_fish_consumption)'
fish_model = smf.ols(fish_formula, data=happy_fish_merged)
lm = fish_model.fit()
lm.summary()

0,1,2,3
Dep. Variable:,happiness_score,R-squared:,0.14
Model:,OLS,Adj. R-squared:,0.14
Method:,Least Squares,F-statistic:,262.5
Date:,"Thu, 30 Jan 2025",Prob (F-statistic):,8.559999999999999e-55
Time:,13:43:05,Log-Likelihood:,-6049.7
No. Observations:,1609,AIC:,12100.0
Df Residuals:,1607,BIC:,12110.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,45.4684,0.619,73.458,0.000,44.254,46.682
np.log(pct_fish_consumption),3.8727,0.239,16.201,0.000,3.404,4.342

0,1,2,3
Omnibus:,145.97,Durbin-Watson:,1.678
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.959
Skew:,-0.063,Prob(JB):,1.05e-10
Kurtosis:,2.182,Cond. No.,6.96


In [15]:
# Show the R-squared of the fitted `lm` model at full precision
# (the summary table above rounds it).
lm.rsquared

0.14039609666610864

In [20]:
# Happiness score against fish consumption, with a red OLS trendline that
# plotly fits on log-transformed x (trendline_options log_x=True).
# Tweaks tried during exploration and currently switched off: a log-scaled
# x axis, colouring by country, fixed 0-100 axis ranges, and hiding traces
# behind the legend.
hover_columns = ['country', 'year', 'happiness_score', 'pct_fish_consumption']

fig = px.scatter(
    happy_fish_merged,
    x='pct_fish_consumption',
    y='happiness_score',
    hover_data=hover_columns,
    template='plotly_dark',
    trendline='ols',
    trendline_options={'log_x': True},
    trendline_color_override='red',
)

# Annotate with R-squared taken from the statsmodels fit `lm`, rounded to
# four decimals; the arrow points at the fixed data coordinates (80, 63).
r_squared_label = f"R^2 = {round(lm.rsquared, 4)}"
fig.add_annotation(
    x=80,
    y=63,
    text=r_squared_label,
    showarrow=True,
    arrowhead=1,
)

fig.show()
