In [30]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [14]:
# Read the training parquet and discard the stored index in favour of a
# fresh 0..n-1 RangeIndex.
df = (
    pd.read_parquet('../data/processed/df_easternmountain_train.parquet.gzip')
    .reset_index(drop=True)
)

In [15]:
# Preview the first five rows (the default for head) to check the column layout.
df.head(n=5)

Unnamed: 0,lat,lon,ACCESS-CM2-ssp126,ACCESS-CM2-ssp245,ACCESS-CM2-ssp370,ACCESS-CM2-ssp585,ACCESS-ESM-ssp126,ACCESS-ESM-ssp245,ACCESS-ESM-ssp370,ACCESS-ESM-ssp585,...,INM-CM5-0-ssp585,KACE-1-0-G-ssp126,KACE-1-0-G-ssp245,KACE-1-0-G-ssp370,KACE-1-0-G-ssp585,MIROC-ES2L-ssp126,MIROC-ES2L-ssp245,MIROC-ES2L-ssp370,MIROC-ES2L-ssp585,fahrenheit
0,35.0,-83.5,-6.727468,-5.954944,-5.796958,-6.59839,32.33534,32.70128,32.62352,32.840384,...,26.49236,40.921664,42.23696,41.16794,41.224388,15.588446,15.591308,12.78248,16.107224,25.106024
1,35.0,-83.5,-2.18605,-0.223744,0.212036,-2.151724,18.378806,17.0573,17.16674,17.520836,...,39.20432,46.542308,47.175674,46.968044,46.85612,17.60702,18.98069,16.40192,16.472192,38.173987
2,35.0,-83.5,21.224372,22.278074,23.57951,21.459434,19.88663,18.60953,19.436468,19.189994,...,54.35366,53.8484,52.07612,52.8305,52.528874,22.37459,23.1008,19.60484,20.23538,43.988007
3,35.0,-83.5,32.588474,30.693326,31.939016,32.53091,15.498968,15.799226,15.751094,16.176488,...,25.531016,38.36246,40.035992,38.009012,37.67162,25.817756,26.288366,23.728046,23.892692,44.257996
4,35.0,-83.5,29.976314,30.989912,30.63452,30.6464,17.372354,18.56975,17.429,17.513636,...,22.32536,33.139868,31.89623,29.169914,28.871636,16.82303,16.85363,16.170836,16.364624,26.923987


In [45]:
# Split the frame purely by column position:
#   first two columns        -> location
#   last column              -> regression target
#   everything in between    -> predictor columns
location = df.iloc[:, :2]
y = df.iloc[:, -1]
X = df.iloc[:, 2:-1]

# Column labels of the predictors, reused later to label fitted coefficients.
model_scenario = X.columns

In [40]:
# Baseline error averaging across models and scenarios:
# predict the target with the unweighted row-wise mean of every predictor column.
ensemble_mean = X.mean(axis=1)  # computed once, shared by both metrics

baseline_mape = mean_absolute_percentage_error(y, ensemble_mean)
baseline_r2 = r2_score(y, ensemble_mean)

# Use the built-in round() rather than the `.round` method: the method only
# exists on NumPy scalars, so it raises AttributeError whenever a metric
# returns a plain Python float (the return type may differ between
# scikit-learn releases). round() handles both and prints the same text.
print('Baseline MAPE:', f'{round(baseline_mape, 2)}')
print('Baseline R2:', f'{round(baseline_r2, 2)}')

Baseline MAPE: 0.37
Baseline R2: 0.69


In [41]:
# Ordinary least squares on all predictor columns, default settings.
ols = LinearRegression()
ols.fit(X, y)

In [42]:
# R^2 of the OLS model, evaluated on the same X and y it was fitted with.
ols_r2 = ols.score(X, y)
ols_r2

0.7379795206383544

In [56]:
# One row per predictor column, holding its fitted OLS coefficient.
ols_coefficients = pd.DataFrame(
    data=ols.coef_,
    index=model_scenario,
    columns=['coefficient'],
)

# Show the five largest coefficients.
ols_coefficients.sort_values('coefficient', ascending=False).head()

Unnamed: 0,coefficient
KACE-1-0-G-ssp245,0.074144
CanESM5-ssp370,0.056196
CNRM-CM6-1-ssp126,0.054242
ACCESS-CM2-ssp126,0.052297
CNRM-CM6-1-ssp245,0.051781


In [60]:
# Ridge regression with regularization strength alpha=0.5.
# NOTE(review): X is passed in without any standardization step in this
# notebook section — confirm that is intended for a penalized model.
ridge = Ridge(alpha=0.5)
ridge.fit(X, y)

In [61]:
# R^2 of the ridge model, evaluated on the same X and y it was fitted with.
ridge_r2 = ridge.score(X, y)
ridge_r2

0.7379795206383541

In [62]:
# One row per predictor column, holding its fitted ridge coefficient.
ridge_coefficients = pd.DataFrame(
    data=ridge.coef_,
    index=model_scenario,
    columns=['coefficient'],
)

# Show the five largest coefficients.
ridge_coefficients.sort_values('coefficient', ascending=False).head()

Unnamed: 0,coefficient
KACE-1-0-G-ssp245,0.074144
CanESM5-ssp370,0.056196
CNRM-CM6-1-ssp126,0.054242
ACCESS-CM2-ssp126,0.052297
CNRM-CM6-1-ssp245,0.051781
