In [43]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Only for official daily data

In [77]:
# Load the pre-built model inputs: X is the feature table, y the target.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
X, y = map(pd.read_pickle, ("./data_for_models/X.pkl", "./data_for_models/y.pkl"))

In [78]:
# Number of candidate features (columns of X).
X.shape[1]

38

In [79]:
def input_scale(X):
    """Median-impute missing values in ``X`` and standardize the result.

    Both transformers are fitted on the data passed in, so every call learns
    its statistics from ``X`` itself.
    NOTE(review): when used ahead of a train/test split this lets test rows
    influence the fitted statistics -- confirm that is acceptable here.

    Parameters
    ----------
    X : object exposing ``.values`` (a pandas DataFrame in this notebook)
        Feature table; may contain NaNs.

    Returns
    -------
    numpy.ndarray
        The values of ``X`` after ``SimpleImputer(strategy='median')`` followed
        by a default ``StandardScaler``.
    """
    filled = SimpleImputer(strategy='median').fit_transform(X.values)
    return StandardScaler().fit_transform(filled)

def input_(X):
    """Median-impute missing values in ``X`` without any scaling.

    Parameters
    ----------
    X : object exposing ``.values`` (a pandas DataFrame in this notebook)
        Feature table; may contain NaNs.

    Returns
    -------
    numpy.ndarray
        The values of ``X`` after ``SimpleImputer(strategy='median')``,
        fitted on ``X`` itself.
    """
    return SimpleImputer(strategy='median').fit_transform(X.values)

In [80]:
# Imputed + standardized feature matrix; this is the input to PCA, RFE,
# LassoCV and SelectKBest in the cells that follow.
X_t = input_scale(X)

## PCA

In [65]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain at least that fraction of the variance (here 97%).
pca = PCA(n_components=0.97)
pca.fit(X_t)

PCA(copy=True, iterated_power='auto', n_components=0.97, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [68]:
# Share of the total variance explained by each retained component.
pca.explained_variance_ratio_

array([0.57982941, 0.18173201, 0.06744471, 0.05989134, 0.04229991,
       0.03092114, 0.02000579])

In [31]:
# Sanity check: number of NaNs left in X_t after imputation (expected 0).
np.isnan(X_t).sum()

0

### RFE

In [69]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE

In [72]:
# Recursive feature elimination: refit the estimator repeatedly, dropping the
# weakest feature each round (default step=1) until 10 features remain.
# NOTE(review): the saved output of this cell reports estimator=LinearRegression
# and a column_or_1d warning, so it was produced by an older version of the
# cell -- re-run to refresh it.
l = LassoCV()
# n_features_to_select is keyword-only in scikit-learn >= 1.0; the positional
# form `RFE(l, 10)` raises TypeError there.
rfe = RFE(l, n_features_to_select=10)
# ravel() hands the target to sklearn as a 1-D array.
rfe.fit(X_t, y.values.ravel())

  y = column_or_1d(y, warn=True)


RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
  n_features_to_select=10, step=1, verbose=0)

In [82]:
# Names of the features RFE kept (boolean mask -> column labels).
X.columns[rfe.get_support()]

Index(['Tn_10', 'Tm_11', 'Tn_11', 'HRm_11', 'Tm_12', 'hPa_12', 'Tm_13',
       'Tn_13', 'HRm_13', 'hPa'],
      dtype='object')

In [81]:
# Elimination rank per feature, in X.columns order: 1 marks a selected
# feature, larger values were discarded earlier.
rfe.ranking_

array([29,  6, 19,  1, 10,  7, 25, 15, 17, 16, 28, 23,  1,  2,  1,  1,  1,
       24, 13,  8, 11,  1,  4, 18, 12, 20, 21,  1,  5,  1,  9,  1,  1,  3,
       27, 14, 26, 22])

### LassoCV

In [83]:
from sklearn.linear_model import LassoCV

In [86]:
# Lasso with the regularization strength chosen by cross-validation (all
# defaults). Features are then selected by their non-zero coefficients.
reg = LassoCV()
reg.fit(X_t, y.values.ravel())



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [89]:
# Features whose Lasso coefficient was not shrunk to zero.
X.columns[np.flatnonzero(reg.coef_)]

Index(['VVem6', 'VVx6', 'Tm_11', 'Tn_11', 'HRm_11', 'Tm_12', 'PPT24h_12',
       'hPa_12', 'DVum10_12', 'Tm_13', 'Tx_13', 'RS24h', 'VVem10_13',
       'VVx10_13', 'DVx10_13'],
      dtype='object')

## SelectKBest

In [90]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

In [99]:
# Univariate selection: score each feature against y with f_regression and
# keep the 15 highest-scoring ones.
best = SelectKBest(score_func=f_regression, k=15)
best.fit(X_t, y.values.ravel())

SelectKBest(k=15, score_func=<function f_regression at 0x00000263548F4C80>)

In [100]:
# Names of the features retained by SelectKBest.
X.columns[best.get_support()]

Index(['Tm_10', 'Tx_10', 'Tn_10', 'RS24h_11', 'Tm_11', 'Tx_11', 'Tn_11',
       'Tm_12', 'Tx_12', 'Tn_12', 'RS24h_12', 'Tm_13', 'Tx_13', 'Tn_13',
       'RS24h'],
      dtype='object')

## Correlation with T_MEAN

In [103]:
# Full table, including the T_MEAN column (presumably the modelling target -- confirm).
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
full = pd.read_pickle("./data_for_models/full.pkl")

In [105]:
# Correlation of every column with T_MEAN (DataFrame.corr defaults to Pearson).
full.corr()['T_MEAN']

ndays        0.426027
T_MEAN       1.000000
Tm_10        0.953280
Tx_10        0.914126
Tn_10        0.942967
PPT24h_11   -0.015305
HRm_10       0.309063
hPa_11       0.013052
RS24h_11     0.615177
VVem6       -0.278824
DVum6       -0.507218
VVx6        -0.307042
DVx6        -0.384771
Tm_11        0.952897
Tx_11        0.922844
Tn_11        0.939438
HRm_11       0.286854
Tm_12        0.954466
Tx_12        0.917949
Tn_12        0.948773
PPT24h_12   -0.022950
HRm_12       0.237169
hPa_12       0.013139
RS24h_12     0.610058
VVem10_12   -0.078832
DVum10_12   -0.312932
VVx10_12    -0.231088
DVx10_12    -0.376524
Tm_13        0.956570
Tx_13        0.932284
Tn_13        0.941362
PPT24h      -0.047343
HRm_13       0.065400
hPa          0.015546
RS24h        0.607388
VVem10_13   -0.325454
DVum10_13   -0.451695
VVx10_13    -0.320868
DVx10_13    -0.421184
Name: T_MEAN, dtype: float64

# With unofficial data