In [3]:
import numpy as np
import sys
import pandas as pd
import scipy.io as sio
import seaborn as sns
import tensorly as tl
from tensorly.decomposition import parafac
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score

In [None]:
sys.path.append("../")
from src.data_processing import DataProcessing

In [9]:
df_mat = sio.loadmat("../Acar_data/EEM_NMR_LCMS.mat")
df_cts = pd.read_csv("../Acar_data/concentrations.txt", delim_whitespace=True)

dp = DataProcessing(df_mat, df_cts)
X_eem, X_nmr, X_lcms = dp.get_processed_data()

Extracted EEM data with shape (28, 251, 21)
Extracted 3-way NMR data with shape (28, 13324, 8)
Extracted LCMS data with shape (28, 168)


  df_cts = pd.read_csv("../Acar_data/concentrations.txt", delim_whitespace=True)


# PARAFAC


In [None]:
SEED = 42
np.random.seed(SEED)

CP_RANK_NMR = 3
CP_RANK_EEM = 3
PCA_RANK_LCMS = 5


# Factorisation CP pour NMR et EEM
def cp_scores(tensor, rank):
    """
    Applique PARAFAC à un tenseur 3D et retourne les scores des individus (mode 0).
    """
    print(f"Tenseur shape avant CP: {tensor.shape}")

    # Check que le mode 0 = individus (n = 28)
    if tensor.shape[0] != 28:
        raise ValueError(
            "Le mode 0 n'est pas celui des individus. Tenseur mal orienté."
        )

    # PARAFAC
    # factors = parafac(tensor, rank=rank, init='svd', random_state=SEED)
    factors = parafac(
        tensor, rank=rank, n_iter_max=2000, tol=1e-6, init="random", random_state=SEED
    )
    for i, f in enumerate(factors):
        print(f"{i}. Shape du facteur {i} : {f[0].shape}")

    return tl.to_numpy(factors[1][0])


print("-" * 20, "Performing CP decomposition", "-" * 20)

scores_nmr = cp_scores(X_nmr, CP_RANK_NMR)
scores_eem = cp_scores(X_eem_imputed, CP_RANK_EEM)

#  PCA LCMS
pca_lcms = PCA(n_components=PCA_RANK_LCMS, random_state=SEED)
scores_lcms = pca_lcms.fit_transform(X_lcms)

# Fusion
if scores_nmr.shape[0] != scores_lcms.shape[0]:
    scores_nmr = scores_nmr.T
if scores_eem.shape[0] != scores_lcms.shape[0]:
    scores_eem = scores_eem.T

print("-" * 50)
print(f"Shape scores NMR: {scores_nmr.shape}")
print(f"Shape scores EEM: {scores_eem.shape}")
print(f"Shape scores LCMS: {scores_lcms.shape}")
X_fused = np.hstack([scores_nmr, scores_eem, scores_lcms])
print(f"Shape finale des features fusionnés: {X_fused.shape}")


df_fused = pd.DataFrame(
    data=X_fused,
    columns=[f"NMR_{i}" for i in range(scores_nmr.shape[1])]
    + [f"EEM_{i}" for i in range(scores_eem.shape[1])]
    + [f"LCMS_{i}" for i in range(scores_lcms.shape[1])],
)
df_fused.head(3)

-------------------- Performing CP decomposition --------------------
Tenseur shape avant CP: (28, 13324, 8)
0. Shape du facteur 0 : ()
1. Shape du facteur 1 : (28, 3)
Tenseur shape avant CP: (28, 251, 21)
0. Shape du facteur 0 : ()
1. Shape du facteur 1 : (28, 3)
--------------------------------------------------
Shape scores NMR: (28, 3)
Shape scores EEM: (28, 3)
Shape scores LCMS: (28, 5)
Shape finale des features fusionnés: (28, 11)


Unnamed: 0,NMR_0,NMR_1,NMR_2,EEM_0,EEM_1,EEM_2,LCMS_0,LCMS_1,LCMS_2,LCMS_3,LCMS_4
0,2771124.0,1123300.0,614127.12369,638.250248,41554.856346,-326.491362,10235.58422,-7136.174505,-4643.34219,856.156766,-785.105793
1,-469476.8,32392.9,-164881.711496,40904.391632,-4972.918327,-1914.485786,-13954.243357,2954.078675,-6345.742206,-432.799853,-694.762541
2,226635.0,422714.2,-57268.222904,256.635381,4473.563479,89439.454508,-13407.111172,-819.118841,6149.323188,408.030989,-538.782346


In [None]:
concentrations = pd.read_csv("Acar_data/concentrations.txt", sep=r"\s+", index_col=0)
y = concentrations.values
print(f"Shape des concentrations (y): {y.shape}")

# Standardisation et régression Ridge
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_fused)

alphas = np.logspace(-3, 3, 100)
model = RidgeCV(alphas=alphas, cv=5)

# Validation croisée
scores = cross_val_score(model, X_scaled, y, cv=5, scoring="r2")
print(f"R² moyen en validation croisée : {scores.mean():.4f}")

Shape des concentrations (y): (28, 5)
R² moyen en validation croisée : 0.8554
