# Import Lib & Data


In [None]:
import scipy.io as sio
import pandas as pd

df_mat = sio.loadmat("Acar_data/EEM_NMR_LCMS.mat")
df_cts = pd.read_csv("Acar_data/concentrations.txt", delim_whitespace=True)

In [64]:
df_cts.head(3)

Unnamed: 0,Val-Tyr-Val,Trp-Gly,Phe,Malto,Propanol
1,5.0,0.0,0.0,0.0,0.0
2,0.0,5.0,0.0,0.0,0.0
3,0.0,0.0,5.0,0.0,0.0


In [None]:
print(df_mat.keys())
print(df_mat["X"].dtype.names)

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'X', 'Z'])
('name', 'type', 'author', 'date', 'moddate', 'imagesize', 'imagemode', 'data', 'label', 'axisscale', 'imageaxisscale', 'title', 'class', 'include', 'classlookup', 'axistype', 'imageaxistype', 'description', 'userdata', 'datasetversion', 'history', 'uniqueid')


In [None]:
data_dic = {}
dimensions = ["X", "Y", "Z"]
for dim in dimensions:
    mesurement_technique = df_mat[dim]["name"][0][0][0]
    data = df_mat[dim]["data"][0][0]
    data_dic[mesurement_technique] = data
    print(f"Extracted {mesurement_technique} data with shape {data.shape}")

Extracted EEM data with shape (28, 251, 21)
Extracted 3-way NMR data with shape (28, 13324, 8)
Extracted LCMS data with shape (28, 168)


In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import tensorly as tl
from tensorly.decomposition import parafac
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score


SEED = 42
np.random.seed(SEED)


CP_RANK_NMR = 3
CP_RANK_EEM = 2
PCA_RANK_LCMS = 5

# Data
df_mat = sio.loadmat("Acar_data/EEM_NMR_LCMS.mat")
data_dic = {}

dimensions = ["X", "Y", "Z"]
for dim in dimensions:
    mesurement_technique = df_mat[dim]["name"][0][0][0]
    data = df_mat[dim]["data"][0][0]
    data_dic[mesurement_technique] = data
    print(f"Extracted {mesurement_technique} data with shape {data.shape}")

# ind 27
for key in data_dic:
    if data_dic[key].shape[0] == 29:
        data_dic[key] = np.delete(data_dic[key], 26, axis=0)
        print(f"Removed sample 27 from {key}, new shape: {data_dic[key].shape}")

# Factorisation CP pour NMR et EEM
def cp_scores(tensor, rank):
    """
    Applique PARAFAC à un tenseur 3D et retourne les scores des individus.
    """
    # S'assurer que le mode 0 est bien les individus (n=28)
    if tensor.shape[0] != 28:
        tensor = tl.transpose(tensor, (2, 1, 0)) if tensor.shape[2] == 28 else tl.transpose(tensor, (1, 2, 0))
    
    # factors = parafac(tensor, rank=rank, init='svd', random_state=SEED)
    factors = parafac(tensor, rank=rank, n_iter_max=200, tol=1e-6, init='random', random_state=SEED)
    return tl.to_numpy(factors[0])  # mode 0 = individus


X_eem = data_dic["EEM"]
X_nmr = data_dic["3-way NMR"]
X_lcms = data_dic["LCMS"]

scores_nmr = cp_scores(X_nmr, CP_RANK_NMR)
scores_eem = cp_scores(X_eem, CP_RANK_EEM)

#  PCA LCMS 
pca_lcms = PCA(n_components=PCA_RANK_LCMS, random_state=SEED)
scores_lcms = pca_lcms.fit_transform(X_lcms)

# Fusion
if scores_nmr.shape[0] != scores_lcms.shape[0]:
    scores_nmr = scores_nmr.T
if scores_eem.shape[0] != scores_lcms.shape[0]:
    scores_eem = scores_eem.T

print(f"Shape scores NMR: {scores_nmr.shape}")
print(f"Shape scores EEM: {scores_eem.shape}")
print(f"Shape scores LCMS: {scores_lcms.shape}")
X_fused = np.hstack([scores_nmr, scores_eem, scores_lcms])
print(f"Shape finale des features fusionnés: {X_fused.shape}")


concentrations = pd.read_csv("Acar_data/concentrations.txt", sep=r"\s+", index_col=0)
concentrations = concentrations.drop(index=27)
y = concentrations.values
print(f"Shape des concentrations (y): {y.shape}")

# Standardisation et régression Ridge
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_fused)

alphas = np.logspace(-3, 3, 100)
model = RidgeCV(alphas=alphas, cv=5)

# Validation croisée
scores = cross_val_score(model, X_scaled, y, cv=5, scoring="r2")
print(f"R² moyen en validation croisée : {scores.mean():.4f}")

✔️ Extracted EEM data with shape (28, 251, 21)
✔️ Extracted 3-way NMR data with shape (28, 13324, 8)
✔️ Extracted LCMS data with shape (28, 168)
Shape scores NMR: (3,)
Shape scores EEM: (2,)
Shape scores LCMS: (28, 5)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 2 has 2 dimension(s)