# Linear Regression Attempt

In [29]:
import sys

import xarray as xr
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [2]:
# Input locations, relative to the notebook's directory:
# the training split is stored as Parquet, the validation split as CSV.
train_path = '../data/train/train_east_tasmin_CMIP6_projections.parquet'
val_path = '../data/val/val_east_tasmin_CMIP6_projections.csv'

In [3]:
# Read both splits into DataFrames.
# The CSV's first column is used as the row index of val_df.
train_df = pd.read_parquet(train_path)
val_df = pd.read_csv(val_path, index_col=0)

In [4]:
# Harmonise column names: the temperature column should be called 'tasmin'.
# rename() ignores keys that are not present, so re-running this cell is safe.
train_df = train_df.rename(columns={'tsmin_df': 'tasmin'})

In [5]:
# Preview the first rows of the training frame to check its columns.
train_df.head()

Unnamed: 0,model,scenario,lat,lon,time,spatial_ref,tasmin
0,ACCESS-CM2,ssp126,35.125,-83.625,2015-01-01 12:00:00,0,251.63474
1,ACCESS-CM2,ssp126,35.125,-83.375,2015-01-01 12:00:00,0,252.6675
2,ACCESS-CM2,ssp126,35.125,-83.125,2015-01-01 12:00:00,0,251.1967
3,ACCESS-CM2,ssp126,35.375,-83.375,2015-01-01 12:00:00,0,251.56654
4,ACCESS-CM2,ssp126,35.375,-83.125,2015-01-01 12:00:00,0,249.87247


In [6]:
# Preview the first rows of the validation frame to check its columns.
val_df.head()

Unnamed: 0,lat,model,scenario,time,lon,spatial_ref,tasmin
0,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.625,0,270.24707
1,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.375,0,271.58322
2,35.125,ACCESS-CM2,ssp126,2022-01-01 12:00:00,-83.125,0,270.1009
25,35.125,ACCESS-CM2,ssp126,2022-01-02 12:00:00,-83.625,0,269.40863
26,35.125,ACCESS-CM2,ssp126,2022-01-02 12:00:00,-83.375,0,270.4561


In [7]:
# Summary statistics of the numeric columns.
# In the saved output tasmin has a minimum and a 25th percentile of 0.0,
# i.e. at least a quarter of the rows hold zero rather than a temperature.
train_df.describe()

Unnamed: 0,lat,lon,spatial_ref,tasmin
count,22893140.0,22893140.0,22893136.0,22893140.0
mean,37.37967,-80.00206,0.0,185.9745
std,1.293378,1.564687,0.0,133.048
min,35.125,-83.625,0.0,0.0
25%,36.375,-81.125,0.0,0.0
50%,37.625,-79.625,0.0,273.0379
75%,38.625,-78.875,0.0,285.3169
max,39.625,-77.625,0.0,308.4099


In [9]:
# List the distinct climate-model names present in the training data.
# NOTE(review): in the saved output no name exceeds 10 characters and several
# end in '-' (e.g. 'CNRM-ESM2-', 'GISS-E2-1-'), which suggests the names were
# truncated upstream — confirm against the source dataset.
train_df['model'].unique()

array(['ACCESS-CM2', 'ACCESS-ESM', 'CMCC-ESM2', 'CNRM-CM6-1',
       'CNRM-ESM2-', 'CanESM5', 'EC-Earth3', 'EC-Earth3-', 'FGOALS-g3',
       'GFDL-ESM4', 'GISS-E2-1-', 'INM-CM4-8', 'INM-CM5-0', 'KACE-1-0-G',
       'MIROC-ES2L', 'MRI-ESM2-0', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1',
       'UKESM1-0-L'], dtype=object)

In [38]:
def groupby_model(df):
    """Aggregate tasmin to one value per scenario, model and calendar day.

    The input frame is left untouched; all clean-up is done on a new frame
    built from only the columns that are needed.

    Arg: df (DataFrame) of tasmin from NEX-GDDP-CMIP6 data. Must contain the
        columns 'scenario', 'model', 'time' and 'tasmin'.
    Output: DataFrame indexed by 'time' (datetime.date objects, so the index
        has object dtype and is not a DatetimeIndex) with the columns
        'scenario', 'model', 'tasmin' and 'model_sc'. 'tasmin' is the minimum
        over all rows of the same scenario/model/day, which compresses the
        entire polygon into one value; 'model_sc' is '<scenario>_<model>'.
        Groups whose values are all invalid keep a NaN.
    Raises: KeyError if one of the required columns is missing.
    """
    # Build a fresh frame rather than assigning into `df`, so the caller's
    # data is not modified and the function can be re-run safely.
    prepared = df[['scenario', 'model']].assign(
        # 0 K is absolute zero and assumed to be an invalid input.
        tasmin=df['tasmin'].replace({0: np.nan}),
        # Only the calendar date is needed, not the time of day.
        time=pd.to_datetime(df['time']).dt.date,
    )

    # Minimum over the whole area for each scenario/model/day (NaN is skipped).
    grouped = (
        prepared.groupby(['scenario', 'model', 'time'])['tasmin']
        .min()
        .reset_index()
    )

    # Single categorical feature combining scenario and model.
    grouped['model_sc'] = grouped['scenario'] + '_' + grouped['model']

    # Use the date as the row index.
    return grouped.set_index('time')

In [39]:
# Collapse the training data to one tasmin value per scenario/model/day.
# NOTE(review): the saved output of this cell shows model_sc joined with ' , '
# whereas groupby_model joins with '_'; the output appears to come from an
# earlier version of the function — re-run the notebook to refresh it.
train_group = groupby_model(train_df)
train_group.head()

Unnamed: 0_level_0,scenario,model,tasmin,model_sc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,ssp126,ACCESS-CM2,247.46608,"ssp126 , ACCESS-CM2"
2015-01-02,ssp126,ACCESS-CM2,248.20949,"ssp126 , ACCESS-CM2"
2015-01-03,ssp126,ACCESS-CM2,253.68387,"ssp126 , ACCESS-CM2"
2015-01-04,ssp126,ACCESS-CM2,261.12805,"ssp126 , ACCESS-CM2"
2015-01-05,ssp126,ACCESS-CM2,266.37738,"ssp126 , ACCESS-CM2"


## Make encoder for train and val data

In [15]:
# First attempt: one-hot encode every column of the grouped frame.
# This treats the numeric tasmin column as categorical as well, which the
# feature names listed in the next cell make visible.
encoder = OneHotEncoder()
encoder.fit(train_group)
X = encoder.transform(train_group)

In [22]:
# Inspect the generated feature names to see which columns were encoded.
encoder.get_feature_names_out()

array(['scenario_ssp126', 'scenario_ssp245', 'scenario_ssp370', ...,
       'tasmin_295.4312', 'tasmin_296.0836', 'tasmin_nan'], dtype=object)

Because OneHotEncoder was also transforming the numeric tasmin column, I will use a ColumnTransformer instead, so that the categorical and numerical data are handled separately.

In [42]:
# One-hot encode the combined scenario/model label and standardise tasmin.
# Columns not listed here (scenario, model) are dropped by ColumnTransformer's
# default remainder setting.
OHE = OneHotEncoder()
scaler = StandardScaler()
transformer = ColumnTransformer(
    transformers=[
        ('model_ssp', OHE, ['model_sc']),
        ('scaled_tasmin', scaler, ['tasmin']),
    ]
)
transformer.fit(train_group)
X = transformer.transform(train_group)

In [43]:
# Check the output feature names; each is prefixed with its transformer's name.
transformer.get_feature_names_out()

array(['model_ssp__model_sc_ssp126 , ACCESS-CM2',
       'model_ssp__model_sc_ssp126 , ACCESS-ESM',
       'model_ssp__model_sc_ssp126 , CMCC-ESM2',
       'model_ssp__model_sc_ssp126 , CNRM-CM6-1',
       'model_ssp__model_sc_ssp126 , CNRM-ESM2-',
       'model_ssp__model_sc_ssp126 , CanESM5',
       'model_ssp__model_sc_ssp126 , EC-Earth3',
       'model_ssp__model_sc_ssp126 , EC-Earth3-',
       'model_ssp__model_sc_ssp126 , FGOALS-g3',
       'model_ssp__model_sc_ssp126 , GFDL-ESM4',
       'model_ssp__model_sc_ssp126 , GISS-E2-1-',
       'model_ssp__model_sc_ssp126 , INM-CM4-8',
       'model_ssp__model_sc_ssp126 , INM-CM5-0',
       'model_ssp__model_sc_ssp126 , KACE-1-0-G',
       'model_ssp__model_sc_ssp126 , MIROC-ES2L',
       'model_ssp__model_sc_ssp126 , MRI-ESM2-0',
       'model_ssp__model_sc_ssp126 , NorESM2-LM',
       'model_ssp__model_sc_ssp126 , NorESM2-MM',
       'model_ssp__model_sc_ssp126 , TaiESM1',
       'model_ssp__model_sc_ssp126 , UKESM1-0-L',
       'mode

In [47]:
# Dimensions of the transformed matrix (rows, features).
# NOTE(review): the saved 81 columns are presumably 80 model_sc categories
# plus the scaled tasmin column — confirm.
X.shape

(204544, 81)