# Import Lib & Data


In [None]:
import numpy as np
import pandas as pd
import scipy.io as sio
import tensorly as tl
from tensorly.decomposition import parafac
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score

In [1]:
import scipy.io as sio
import pandas as pd

df_mat = sio.loadmat("Acar_data/EEM_NMR_LCMS.mat")
df_cts = pd.read_csv("Acar_data/concentrations.txt", delim_whitespace=True)

  df_cts = pd.read_csv("Acar_data/concentrations.txt", delim_whitespace=True)


In [2]:
df_cts.head(3)

Unnamed: 0,Val-Tyr-Val,Trp-Gly,Phe,Malto,Propanol
1,5.0,0.0,0.0,0.0,0.0
2,0.0,5.0,0.0,0.0,0.0
3,0.0,0.0,5.0,0.0,0.0


In [3]:
print(df_mat.keys())
print(df_mat["X"].dtype.names)

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'X', 'Z'])
('name', 'type', 'author', 'date', 'moddate', 'imagesize', 'imagemode', 'data', 'label', 'axisscale', 'imageaxisscale', 'title', 'class', 'include', 'classlookup', 'axistype', 'imageaxistype', 'description', 'userdata', 'datasetversion', 'history', 'uniqueid')


In [4]:
data_dic = {}
dimensions = ["X", "Y", "Z"]
for dim in dimensions:
    mesurement_technique = df_mat[dim]["name"][0][0][0]
    data = df_mat[dim]["data"][0][0]
    data_dic[mesurement_technique] = data
    print(f"Extracted {mesurement_technique} data with shape {data.shape}")

Extracted EEM data with shape (28, 251, 21)
Extracted 3-way NMR data with shape (28, 13324, 8)
Extracted LCMS data with shape (28, 168)


In [None]:
SEED = 42
np.random.seed(SEED)

CP_RANK_NMR = 3
CP_RANK_EEM = 2
PCA_RANK_LCMS = 5


# Factorisation CP pour NMR et EEM
def cp_scores(tensor, rank):
    """
    Applique PARAFAC à un tenseur 3D et retourne les scores des individus (mode 0).
    """
    print(f"Tenseur shape avant CP: {tensor.shape}")

    # Check que le mode 0 = individus (n = 28)
    if tensor.shape[0] != 28:
        raise ValueError(
            "Le mode 0 n'est pas celui des individus. Tenseur mal orienté."
        )

    # PARAFAC
    # factors = parafac(tensor, rank=rank, init='svd', random_state=SEED)
    factors = parafac(
        tensor, rank=rank, n_iter_max=200, tol=1e-6, init="random", random_state=SEED
    )
    for i, f in enumerate(factors):
        print(f"{i}. Shape du facteur {i} : {f[0].shape}")

    print(len(factors[1]))
    print(f"Shape des scores: {factors[1][0].shape}")
    return tl.to_numpy(factors[1][0])


X_eem = data_dic["EEM"]
X_nmr = data_dic["3-way NMR"]
X_lcms = data_dic["LCMS"]

print("-" * 20, "Performing CP decomposition", "-" * 20)

scores_nmr = cp_scores(X_nmr, CP_RANK_NMR)
scores_eem = cp_scores(X_eem, CP_RANK_EEM)

#  PCA LCMS
pca_lcms = PCA(n_components=PCA_RANK_LCMS, random_state=SEED)
scores_lcms = pca_lcms.fit_transform(X_lcms)

# Fusion
if scores_nmr.shape[0] != scores_lcms.shape[0]:
    scores_nmr = scores_nmr.T
if scores_eem.shape[0] != scores_lcms.shape[0]:
    scores_eem = scores_eem.T

print("-" * 50)
print(f"Shape scores NMR: {scores_nmr.shape}")
print(f"Shape scores EEM: {scores_eem.shape}")
print(f"Shape scores LCMS: {scores_lcms.shape}")
X_fused = np.hstack([scores_nmr, scores_eem, scores_lcms])
print(f"Shape finale des features fusionnés: {X_fused.shape}")


df_fused = pd.DataFrame(
    data=X_fused,
    columns=[f"NMR_{i}" for i in range(scores_nmr.shape[1])]
    + [f"EEM_{i}" for i in range(scores_eem.shape[1])]
    + [f"LCMS_{i}" for i in range(scores_lcms.shape[1])],
)
df_fused.head(3)

-------------------- Performing CP decomposition --------------------
Tenseur shape avant CP: (28, 13324, 8)
0. Shape du facteur 0 : ()
[1. 1. 1.]
1. Shape du facteur 1 : (28, 3)
[array([[2771123.68126169, 1123300.40883946,  614127.12369003],
       [-469476.82531928,   32392.8980954 , -164881.71149641],
       [ 226634.977006  ,  422714.21112521,  -57268.22290384],
       [2281649.45600227,  361289.64391343, 1425434.12319328],
       [ 677473.52293271,  339838.60887797,  119542.00376569],
       [1765591.19829086,  884329.47966739,  844133.1304138 ],
       [2584885.73700723, 1353053.88278702,  621121.01841184],
       [1284282.48092858,  857289.19089584,  340788.00510229],
       [3680272.9797122 , 1572969.49420901, 1279351.60269393],
       [1888226.35296774, 1314583.61364242,  256076.54189985],
       [3728523.71576975, 1688894.42564266,  774864.52810753],
       [ 917398.89237088,  871687.857425  ,   47602.59077615],
       [1075726.74846673,  973518.23746692,   84689.46509751],
 

Unnamed: 0,NMR_0,NMR_1,NMR_2,EEM_0,EEM_1,LCMS_0,LCMS_1,LCMS_2,LCMS_3,LCMS_4
0,2771124.0,1123300.0,614127.12369,,,10235.58422,-7136.174505,-4643.34219,856.156766,-785.105793
1,-469476.8,32392.9,-164881.711496,,,-13954.243357,2954.078675,-6345.742206,-432.799853,-694.762541
2,226635.0,422714.2,-57268.222904,,,-13407.111172,-819.118841,6149.323188,408.030989,-538.782346


In [19]:
scores_eem

array([[nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan]])

In [None]:
concentrations = pd.read_csv("Acar_data/concentrations.txt", sep=r"\s+", index_col=0)
y = concentrations.values
print(f"Shape des concentrations (y): {y.shape}")

# Standardisation et régression Ridge
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_fused)

alphas = np.logspace(-3, 3, 100)
model = RidgeCV(alphas=alphas, cv=5)

# Validation croisée
scores = cross_val_score(model, X_scaled, y, cv=5, scoring="r2")
print(f"R² moyen en validation croisée : {scores.mean():.4f}")