# Linear Regression Attempt

In [1]:
import sys

import xarray as xr
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [2]:
# Input files (relative paths): parquet for training, CSV for validation.
train_path = '../data/train/train_east_tasmin_CMIP6_projections.parquet'
val_path = '../data/val/val_east_tasmin_CMIP6_projections.csv'

In [3]:
# Load both splits. The training set is read from parquet; the validation set
# is read from CSV with its first column used as the row index.
train_df = pd.read_parquet(train_path)
val_df = pd.read_csv(val_path,index_col=0)

In [4]:
# Quick fix for the training frame: make sure the target column is called
# 'tasmin' (rename leaves the frame unchanged if 'tsmin_df' is not present).
train_df = train_df.rename(columns={'tsmin_df': 'tasmin'})

In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,model,scenario,lat,lon,time,spatial_ref,tasmin
0,ACCESS-CM2,ssp126,35.125,-83.625,2015-01-01 12:00:00,0,251.63474
1,ACCESS-CM2,ssp126,35.125,-83.375,2015-01-01 12:00:00,0,252.6675
2,ACCESS-CM2,ssp126,35.125,-83.125,2015-01-01 12:00:00,0,251.1967
3,ACCESS-CM2,ssp126,35.375,-83.375,2015-01-01 12:00:00,0,251.56654
4,ACCESS-CM2,ssp126,35.375,-83.125,2015-01-01 12:00:00,0,249.87247


In [6]:
# Preview the first rows of the validation frame.
val_df.head()

Unnamed: 0,lat,model,scenario,time,lon,spatial_ref,tasmin
0,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.625,0,270.24707
1,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.375,0,271.58322
2,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.125,0,270.1009
25,35.125,ACCESS-CM2,ssp126,2022-01-02 12:00:00,-83.625,0,269.40863
26,35.125,ACCESS-CM2,ssp126,2022-01-02 12:00:00,-83.375,0,270.4561


In [7]:
# Summary statistics of the numeric columns; check tasmin (Kelvin) for
# implausible values such as 0.
train_df.describe()

Unnamed: 0,lat,lon,spatial_ref,tasmin
count,22893140.0,22893140.0,22893136.0,22893140.0
mean,37.37967,-80.00206,0.0,185.9745
std,1.293378,1.564687,0.0,133.048
min,35.125,-83.625,0.0,0.0
25%,36.375,-81.125,0.0,0.0
50%,37.625,-79.625,0.0,273.0379
75%,38.625,-78.875,0.0,285.3169
max,39.625,-77.625,0.0,308.4099


In [8]:
# List the distinct values of the 'model' column in the training data.
train_df['model'].unique()

array(['ACCESS-CM2', 'ACCESS-ESM', 'CMCC-ESM2', 'CNRM-CM6-1',
       'CNRM-ESM2-', 'CanESM5', 'EC-Earth3', 'EC-Earth3-', 'FGOALS-g3',
       'GFDL-ESM4', 'GISS-E2-1-', 'INM-CM4-8', 'INM-CM5-0', 'KACE-1-0-G',
       'MIROC-ES2L', 'MRI-ESM2-0', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1',
       'UKESM1-0-L'], dtype=object)

In [9]:
def groupby_model(df):
    """Collapse gridded tasmin data to one value per scenario, model and date.

    Process the train and val dfs for encoding as input into ML methods.

    Arg:
        df (DataFrame): tasmin records from NEX-GDDP-CMIP6 data with at least
            the columns 'scenario', 'model', 'time' and 'tasmin'. The frame
            passed in is NOT modified.
    Output:
        DataFrame with one row per (scenario, model, date) holding the minimum
        tasmin over all rows of that group (i.e. compressing the entire
        polygon), plus a combined categorical column
        'model_sc' = '<scenario>_<model>'.
    """
    # Work on a copy of just the needed columns so the caller's frame is left
    # untouched (the previous version overwrote 'tasmin' and 'time' in place).
    work = df[['scenario', 'model', 'time', 'tasmin']].copy()
    # 0 K is absolute zero and assumed to be invalid input, so treat it as missing.
    work['tasmin'] = work['tasmin'].replace({0: np.nan})
    # Only the calendar date is needed, not the time of day.
    work['time'] = pd.to_datetime(work['time']).dt.date
    # Minimum over the whole area for each scenario/model/date; missing values
    # are skipped, and a group with no valid value yields NaN.
    grouped = (
        work.groupby(['scenario', 'model', 'time'])['tasmin']
        .min()
        .reset_index()
    )
    # Make a new combined categorical value for downstream encoding.
    grouped['model_sc'] = grouped['scenario'] + '_' + grouped['model']
    return grouped

In [10]:
# Collapse the training grid to one tasmin value per (scenario, model, date).
train_group = groupby_model(train_df)
train_group.head()

Unnamed: 0,scenario,model,time,tasmin,model_sc
0,ssp126,ACCESS-CM2,2015-01-01,247.46608,ssp126_ACCESS-CM2
1,ssp126,ACCESS-CM2,2015-01-02,248.20949,ssp126_ACCESS-CM2
2,ssp126,ACCESS-CM2,2015-01-03,253.68387,ssp126_ACCESS-CM2
3,ssp126,ACCESS-CM2,2015-01-04,261.12805,ssp126_ACCESS-CM2
4,ssp126,ACCESS-CM2,2015-01-05,266.37738,ssp126_ACCESS-CM2


## Make encoder for train and val data

In [11]:
# First attempt: fit a one-hot encoder on every column of the grouped frame.
encoder = OneHotEncoder()
encoder.fit(train_group)
fake_X = encoder.transform(train_group)

In [12]:
# Inspect the feature names generated by the one-hot encoder.
encoder.get_feature_names_out()

array(['scenario_ssp126', 'scenario_ssp245', 'scenario_ssp370', ...,
       'model_sc_ssp585_NorESM2-MM', 'model_sc_ssp585_TaiESM1',
       'model_sc_ssp585_UKESM1-0-L'], dtype=object)

Since OneHotEncoder is also transforming the tasmin data, rather than concatenating the encoded features with the numeric column as I would like, I will use a ColumnTransformer instead to treat the categorical and numerical data separately.

In [13]:
# Treat each column according to its type: pass the date through untouched,
# one-hot encode the combined scenario/model label (dense output), and
# standardise tasmin.
OHE = OneHotEncoder(sparse_output=False)
scaler = StandardScaler()
transformer = ColumnTransformer(
    [
        ('time', 'passthrough', ['time']),
        ('model_ssp', OHE, ['model_sc']),
        ('scaled_tasmin', scaler, ['tasmin']),
    ],
    verbose_feature_names_out=False,
)
transformer.fit(train_group)
encoded_X = transformer.transform(train_group)

In [14]:
# Output feature names of the fitted transformer; used below as column labels
# for encoded_X.
cols = transformer.get_feature_names_out()
cols

array(['time', 'model_sc_ssp126_ACCESS-CM2', 'model_sc_ssp126_ACCESS-ESM',
       'model_sc_ssp126_CMCC-ESM2', 'model_sc_ssp126_CNRM-CM6-1',
       'model_sc_ssp126_CNRM-ESM2-', 'model_sc_ssp126_CanESM5',
       'model_sc_ssp126_EC-Earth3', 'model_sc_ssp126_EC-Earth3-',
       'model_sc_ssp126_FGOALS-g3', 'model_sc_ssp126_GFDL-ESM4',
       'model_sc_ssp126_GISS-E2-1-', 'model_sc_ssp126_INM-CM4-8',
       'model_sc_ssp126_INM-CM5-0', 'model_sc_ssp126_KACE-1-0-G',
       'model_sc_ssp126_MIROC-ES2L', 'model_sc_ssp126_MRI-ESM2-0',
       'model_sc_ssp126_NorESM2-LM', 'model_sc_ssp126_NorESM2-MM',
       'model_sc_ssp126_TaiESM1', 'model_sc_ssp126_UKESM1-0-L',
       'model_sc_ssp245_ACCESS-CM2', 'model_sc_ssp245_ACCESS-ESM',
       'model_sc_ssp245_CMCC-ESM2', 'model_sc_ssp245_CNRM-CM6-1',
       'model_sc_ssp245_CNRM-ESM2-', 'model_sc_ssp245_CanESM5',
       'model_sc_ssp245_EC-Earth3', 'model_sc_ssp245_EC-Earth3-',
       'model_sc_ssp245_FGOALS-g3', 'model_sc_ssp245_GFDL-ESM4',
      

In [15]:
# Confirm the container type of the transformer output and inspect its shape.
print(type(encoded_X)) # check to see if sparse
encoded_X.shape

<class 'numpy.ndarray'>


(204544, 82)

In [16]:
# Make transformed output back into a DataFrame, labelled with the transformer's
# feature names and indexed by date.
# The transformer output mixes the passed-through date objects with floats,
# which is expected to give an object-dtype array; once 'time' has moved to the
# index, cast the remaining one-hot and scaled-tasmin columns back to float so
# downstream numeric code sees numeric columns.
encoded_df = (
    pd.DataFrame(encoded_X, columns=cols)
    .set_index('time')
    .astype(float)
)
encoded_df.head()

Unnamed: 0_level_0,model_sc_ssp126_ACCESS-CM2,model_sc_ssp126_ACCESS-ESM,model_sc_ssp126_CMCC-ESM2,model_sc_ssp126_CNRM-CM6-1,model_sc_ssp126_CNRM-ESM2-,model_sc_ssp126_CanESM5,model_sc_ssp126_EC-Earth3,model_sc_ssp126_EC-Earth3-,model_sc_ssp126_FGOALS-g3,model_sc_ssp126_GFDL-ESM4,...,model_sc_ssp585_INM-CM4-8,model_sc_ssp585_INM-CM5-0,model_sc_ssp585_KACE-1-0-G,model_sc_ssp585_MIROC-ES2L,model_sc_ssp585_MRI-ESM2-0,model_sc_ssp585_NorESM2-LM,model_sc_ssp585_NorESM2-MM,model_sc_ssp585_TaiESM1,model_sc_ssp585_UKESM1-0-L,tasmin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.055544
2015-01-02,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.975577
2015-01-03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.386712
2015-01-04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.585961
2015-01-05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.021303


Now that we have it back as a DataFrame whose one-hot columns tell us which SSP and model each tasmin value belongs to, we can extract the tasmin values for each SSP/model column and use them in place of the ones and zeros.
This will also get us to the goal of one row per date.

In [18]:
# Reshape to have one row per date and one column per scenario/model pair.
# Indicator columns are picked by the prefix OneHotEncoder adds to the
# 'model_sc' categories rather than by position, so the cell keeps working if
# the transformer's column order changes.
OHE_PREFIX = 'model_sc_'
tasmin_lst = []
for col in cols:
    if not col.startswith(OHE_PREFIX):
        continue  # 'time' and 'tasmin' are not indicator columns
    # Scaled tasmin for the rows flagged by this indicator, named after the
    # scenario/model pair (prefix stripped).
    col_tasmin = (
        encoded_df.loc[encoded_df[col] == 1, 'tasmin']
        .rename(col[len(OHE_PREFIX):])
    )
    tasmin_lst.append(col_tasmin)
# Align the per-pair series on the shared date index.
X = pd.concat(tasmin_lst, axis=1)
print(X.shape)
X

(2557, 80)


Unnamed: 0_level_0,ssp126_ACCESS-CM2,ssp126_ACCESS-ESM,ssp126_CMCC-ESM2,ssp126_CNRM-CM6-1,ssp126_CNRM-ESM2-,ssp126_CanESM5,ssp126_EC-Earth3,ssp126_EC-Earth3-,ssp126_FGOALS-g3,ssp126_GFDL-ESM4,...,ssp585_GISS-E2-1-,ssp585_INM-CM4-8,ssp585_INM-CM5-0,ssp585_KACE-1-0-G,ssp585_MIROC-ES2L,ssp585_MRI-ESM2-0,ssp585_NorESM2-LM,ssp585_NorESM2-MM,ssp585_TaiESM1,ssp585_UKESM1-0-L
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,-3.055544,-0.904298,-0.981017,-0.826941,-0.897736,-0.959719,-0.107476,-0.001881,-0.998497,-0.822699,...,-1.087445,-1.255796,-0.834879,0.018528,-1.430208,,,,,
2015-01-02,-2.975577,-1.318453,-1.611789,-0.97222,-1.124216,-2.401353,-0.495113,0.509872,-0.720946,-1.261223,...,-1.0392,-1.474621,-0.010791,0.314026,-1.335377,,,,,
2015-01-03,-2.386712,-1.201491,-1.10211,-1.670542,-0.786091,-2.532323,-0.33882,-0.739066,-1.469309,-1.265208,...,-0.46143,-1.449601,0.820566,0.493921,-1.121599,,,,,
2015-01-04,-1.585961,-1.398695,-1.282497,-0.778747,-1.009523,-2.725303,0.308674,-1.406745,-0.942738,-1.289362,...,-1.193547,-0.604789,-0.896807,-0.057007,-0.925475,,,,,
2015-01-05,-1.021303,-1.310329,-1.426078,-0.778027,-0.719319,-1.8535,0.109894,-1.340289,-0.479681,-0.806949,...,-1.20892,0.284692,-1.087042,-0.594661,-1.449941,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,-1.170322,-1.199132,-1.5672,-0.780067,-1.211394,-1.842477,-1.490562,-1.656525,-0.939118,-0.803697,...,-0.641341,-1.54298,-1.058464,-1.873978,-1.217053,,,,,
2021-12-28,-0.85881,-1.061497,-1.489446,-0.441695,-1.403639,-1.495383,-0.524092,-0.240743,-0.991428,-0.871366,...,-1.350692,-1.453503,-1.049437,-1.925158,-1.114788,,,,,
2021-12-29,-1.590497,-0.477012,-1.285,-1.037215,-1.490033,-1.518323,0.163088,-1.73897,-1.717233,-0.342428,...,-1.960656,-1.419333,-1.658022,-1.601658,-0.7892,,,,,
2021-12-30,-1.225458,-0.900474,-1.164076,-1.068594,-1.121305,-1.148532,-0.883039,-2.150168,-2.134214,-1.056046,...,-2.239899,-1.149689,-1.436355,-1.571257,-0.88731,,,,,


In [None]:
# TODO: make into a pipeline to apply to val_df as well

# Tests with Index

In [19]:
# Index the grouped frame by its 'time' column (groupby_model stores plain
# date values there, not timestamps).
date_idx_test = train_group.set_index('time')
# date_idx_test.index = pd.to_datetime(date_idx_test.index)
date_idx_test.head()

Unnamed: 0_level_0,scenario,model,tasmin,model_sc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,ssp126,ACCESS-CM2,247.46608,ssp126_ACCESS-CM2
2015-01-02,ssp126,ACCESS-CM2,248.20949,ssp126_ACCESS-CM2
2015-01-03,ssp126,ACCESS-CM2,253.68387,ssp126_ACCESS-CM2
2015-01-04,ssp126,ACCESS-CM2,261.12805,ssp126_ACCESS-CM2
2015-01-05,ssp126,ACCESS-CM2,266.37738,ssp126_ACCESS-CM2


In [20]:
# Label lookup using a date object as the key, matching the type of the index values.
date_idx_test.loc[pd.to_datetime('2015-01-01').date()]

Unnamed: 0_level_0,scenario,model,tasmin,model_sc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,ssp126,ACCESS-CM2,247.46608,ssp126_ACCESS-CM2
2015-01-01,ssp126,ACCESS-ESM,267.46512,ssp126_ACCESS-ESM
2015-01-01,ssp126,CMCC-ESM2,266.75190,ssp126_CMCC-ESM2
2015-01-01,ssp126,CNRM-CM6-1,268.18427,ssp126_CNRM-CM6-1
2015-01-01,ssp126,CNRM-ESM2-,267.52612,ssp126_CNRM-ESM2-
...,...,...,...,...
2015-01-01,ssp585,MRI-ESM2-0,,ssp585_MRI-ESM2-0
2015-01-01,ssp585,NorESM2-LM,,ssp585_NorESM2-LM
2015-01-01,ssp585,NorESM2-MM,,ssp585_NorESM2-MM
2015-01-01,ssp585,TaiESM1,,ssp585_TaiESM1


In [21]:
# Same lookup after converting the index with pd.to_datetime on a copy; the
# converted index can be queried with a plain date string.
date_idx_test2 = date_idx_test.copy()
date_idx_test2.index = pd.to_datetime(date_idx_test.index)
date_idx_test2.loc['2015-01-01']

Unnamed: 0_level_0,scenario,model,tasmin,model_sc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,ssp126,ACCESS-CM2,247.46608,ssp126_ACCESS-CM2
2015-01-01,ssp126,ACCESS-ESM,267.46512,ssp126_ACCESS-ESM
2015-01-01,ssp126,CMCC-ESM2,266.75190,ssp126_CMCC-ESM2
2015-01-01,ssp126,CNRM-CM6-1,268.18427,ssp126_CNRM-CM6-1
2015-01-01,ssp126,CNRM-ESM2-,267.52612,ssp126_CNRM-ESM2-
...,...,...,...,...
2015-01-01,ssp585,MRI-ESM2-0,,ssp585_MRI-ESM2-0
2015-01-01,ssp585,NorESM2-LM,,ssp585_NorESM2-LM
2015-01-01,ssp585,NorESM2-MM,,ssp585_NorESM2-MM
2015-01-01,ssp585,TaiESM1,,ssp585_TaiESM1
