In [7]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import linregress
from typing import Union
import pandas as pd

X, y, tcoef = make_regression(n_samples=100, n_features=1000, n_informative=2, noise=30.0,
                              coef=True, 
                              random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

basemodel = LinearRegression()
basemodel.fit(X_train, y_train)

r2_score(y_test, basemodel.predict(X_test))

-0.33727333443648866

In [18]:
def linear_time_analysis1(data: pd.DataFrame, age: Union[np.ndarray, pd.Series]) -> pd.DataFrame:
    def _fit_feature(y, x):
        idx = np.isfinite(x) #y should always be finite
        y = y[idx] 
        x = x[idx]
        s, i, r, p, serr = linregress(y, x)
        x_cap = y * s + i
        r2 = r2_score(x, x_cap)
        rss = (np.square(x_cap - x)).sum()
        rse = np.sqrt(rss / (x.shape[0] - 2))
        #secondary analysis
        residuals = np.abs(x - x_cap)
        s_res, i_res, r_res, p_res, _ = linregress(y, residuals)
        return s, i, r, p, serr, rse, r2, s_res, i_res, r_res, p_res
    
    #TODO: solve problem with mapply
    # mapply.init(n_workers=self.n_jobs, chunk_size=100, max_chunks_per_worker=10, progressbar=False)
    return data.apply(lambda x: _fit_feature(age, x), result_type='expand').reset_index(drop=True).rename(index={
                                                                                        0: 'slope', 
                                                                                        1: 'intercept', 
                                                                                        2: 'rvalue', 
                                                                                        3: 'p-value', 
                                                                                        4: 'stderr', 
                                                                                        5: 'rse',
                                                                                        6: 'r2',
                                                                                        7: 's_res',
                                                                                        8: 'i_res',
                                                                                        9: 'r_res',
                                                                                        10: 'p_res'
                                                                                        }).T


def linear_time_analysis2(data: pd.DataFrame, age: Union[np.ndarray, pd.Series]) -> pd.DataFrame:
    def _fit_feature(x):
        y = age.copy()
        idx = np.isfinite(x) #y should always be finite
        y = y[idx] 
        x = x[idx]
        s, i, r, p, serr = linregress(y, x)
        x_cap = y * s + i
        r2 = r2_score(x, x_cap)
        rss = (np.square(x_cap - x)).sum()
        rse = np.sqrt(rss / (x.shape[0] - 2))
        #secondary analysis
        residuals = np.abs(x - x_cap)
        s_res, i_res, r_res, p_res, _ = linregress(y, residuals)
        return s, i, r, p, serr, rse, r2, s_res, i_res, r_res, p_res
    
    #TODO: solve problem with mapply
    # mapply.init(n_workers=self.n_jobs, chunk_size=100, max_chunks_per_worker=10, progressbar=False)
    return data.apply(_fit_feature, result_type='expand').reset_index(drop=True).rename(index={
                                                                                        0: 'slope', 
                                                                                        1: 'intercept', 
                                                                                        2: 'rvalue', 
                                                                                        3: 'p-value', 
                                                                                        4: 'stderr', 
                                                                                        5: 'rse',
                                                                                        6: 'r2',
                                                                                        7: 's_res',
                                                                                        8: 'i_res',
                                                                                        9: 'r_res',
                                                                                        10: 'p_res'
                                                                                        }).T

In [9]:
from enum import Enum
Color = Enum('Color', ['RED', 'GREEN', 'BLUE'])

Color['Red']

KeyError: 'Red'

In [21]:
import pandas as pd

data = pd.DataFrame(X)

%timeit linear_time_analysis2(data, y)

1.44 s ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
(linear_time_analysis2(data, y) == linear_time_analysis1(data, y)).all()

slope        True
intercept    True
rvalue       True
p-value      True
stderr       True
rse          True
r2           True
s_res        True
i_res        True
r_res        True
p_res        True
dtype: bool

In [22]:
%timeit linear_time_analysis1(data, y)

1.44 s ± 24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [101]:
np.linalg.norm(V, axis=0)

array([1., 1.])

In [99]:
L.T @ V

array([[ 1.00000000e+00, -2.38076261e-01],
       [ 1.06685494e-16,  1.00000000e+00]])

In [98]:
X_test_scaled = (X_test - model._x_mean) / model._x_std 
V = model.x_weights_
L = model.x_loadings_
R = V @ np.linalg.pinv(L.T @ V)

y_pred_test_untr = X_test_scaled @ R # PLS prediction
# y_pred_test_kdm = X_test_scaled @ V / (V.T @ V) # KDM prediction

In [86]:
# np.isclose(y_pred_test_untr.ravel(), y_pred_test_kdm.ravel())

In [88]:
# np.corrcoef(y_pred_test_untr.ravel(), model.predict(X_test).ravel() )

In [43]:
np.corrcoef(y_pred_test_2.ravel(), y_pred_test_untr.ravel())

array([[1., 1.],
       [1., 1.]])

In [33]:
np.corrcoef(V.ravel(), L.ravel())

array([[1.        , 0.96908728],
       [0.96908728, 1.        ]])

array([[1., 1.],
       [1., 1.]])

In [14]:
model.x_weights_

array([[ 0.05364706],
       [ 0.0033299 ],
       [ 0.60336176],
       [ 0.07792432],
       [ 0.00109431],
       [ 0.7887291 ],
       [ 0.02793724],
       [-0.01935501],
       [-0.00765809],
       [ 0.06070222]])

In [15]:
model.x_loadings_

array([[ 0.09381422],
       [ 0.01737908],
       [ 0.59791032],
       [ 0.13438696],
       [ 0.00802706],
       [ 0.77580919],
       [ 0.08062372],
       [-0.03558635],
       [-0.0427291 ],
       [ 0.1400372 ]])