In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
def save_csv(df, path):
    """Write ``df`` to ``path`` as a CSV file without the index column.

    Any missing parent directory of ``path`` is created before writing.

    Parameters
    ----------
    df : object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    path : str or path-like destination of the CSV file.

    Returns
    -------
    None
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)
def top_loadings(pca, feature_names, pc=0, topk=10):
    """Return the features with the largest absolute loading on one component.

    Parameters
    ----------
    pca : fitted object exposing ``components_`` (one row per component).
    feature_names : sequence of names, positionally aligned with the columns
        of ``pca.components_``.
    pc : zero-based index of the component to inspect.
    topk : maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame with columns ``'feature'`` and ``'loading'`` (signed
    value), ordered by decreasing absolute loading.
    """
    weights = pca.components_[pc]
    # Ascending argsort on |loading|, reversed so the largest magnitude comes first.
    ranking = np.argsort(np.abs(weights))[::-1]
    keep = ranking[:topk]
    names = np.array(feature_names)[keep]
    return pd.DataFrame({'feature': names, 'loading': weights[keep]})

# Load the training features, the training target and the test features.
X = pd.read_csv('x_train.csv')
y = pd.read_csv('y_train.csv').squeeze()
X_test = pd.read_csv('x_test.csv')

# When a 'date' column is present, put the rows in chronological order and
# re-align the target on the reordered feature index.
# NOTE(review): 'date' stays in X / X_test afterwards and is therefore passed
# to the scaler as a feature — confirm this is intended (and numeric).
if 'date' in X.columns:
    X = X.sort_values('date')
    y = y.loc[X.index]
    X_test = X_test.sort_values('date')

# Ordered hold-out: the first 80% of the rows train, the remaining 20% validate.
split_idx = int(len(X) * 0.8)
X_train = X.iloc[:split_idx].reset_index(drop=True)
X_valid = X.iloc[split_idx:].reset_index(drop=True)
y_train = y.iloc[:split_idx].reset_index(drop=True)
y_valid = y.iloc[split_idx:].reset_index(drop=True)



In [None]:
### Standardization

# Learn the scaling statistics on the training split only, then apply the
# same fitted scaler to the training, validation and test matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std, X_valid_std, X_test_std = (
    scaler.transform(part) for part in (X_train, X_valid, X_test)
)


In [None]:
### PCA - reduce dimensionality with PCA, measure the explained variance,
### transform the datasets and export interpretable results

# A float n_components asks PCA to keep enough components to reach that share
# of explained variance (here 95%), fitted on the standardized training split.
pca = PCA(n_components=0.95, svd_solver='full')
pca.fit(X_train_std)

# Per-component and cumulative explained-variance table.
evr = pd.DataFrame(
    {
        'pc': np.arange(1, len(pca.explained_variance_ratio_) + 1),
        'explained_variance_ratio': pca.explained_variance_ratio_,
    }
).assign(cumulative=lambda frame: frame['explained_variance_ratio'].cumsum())

# Project every split with the PCA fitted on the training split.
X_train_pca, X_valid_pca, X_test_pca = (
    pca.transform(matrix) for matrix in (X_train_std, X_valid_std, X_test_std)
)

save_csv(pd.DataFrame(X_train_pca), 'data/processed/X_train_pca.csv')
save_csv(pd.DataFrame(X_valid_pca), 'data/processed/X_valid_pca.csv')
save_csv(pd.DataFrame(X_test_pca), 'data/processed/X_test_pca.csv')
save_csv(evr, 'reports/pca_explained_variance.csv')

# Ten strongest loadings for each of the first (at most) three components,
# tagged with a 1-based component number.
tl = pd.concat(
    [
        top_loadings(pca, X.columns, pc=k, topk=10).assign(pc=k + 1)
        for k in range(min(3, pca.n_components_))
    ],
    ignore_index=True,
)
save_csv(tl, 'reports/pca_top_loadings_pc1_3.csv')
print(f'PCA n_components_={pca.n_components_}, variance cumulée≈{evr.cumulative.iloc[-1]:.3f}')

In [None]:
### L1 selection - feature selection through an L1 penalty, then export the
### retained features and the reduced matrices

# Ordered cross-validation: each fold trains on earlier rows and validates on later ones.
tscv = TimeSeriesSplit(n_splits=5)
base_clf = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, n_jobs=-1)
grid = {'C': np.logspace(-3, 2, 10)}
g = GridSearchCV(base_clf, grid, scoring='accuracy', cv=tscv, n_jobs=-1)
g.fit(X_train_std, y_train)

# GridSearchCV (refit=True by default) has already refitted the best model on
# the full training data, so no second fit is needed.
clf_l1 = g.best_estimator_

# Mean absolute coefficient per feature (averaged over the rows of coef_).
coef = np.abs(clf_l1.coef_).mean(axis=0)

# 1-D integer positions of the features with a non-zero coefficient.
# np.where would return a tuple here, which turns X[:, selected] into a 3-D
# array and makes the DataFrame constructor fail.
selected = np.flatnonzero(coef > 1e-8)
selected_features = X.columns[selected]

save_csv(pd.DataFrame({'feature': selected_features}), 'reports/feature_list_l1.csv')
save_csv(pd.DataFrame(X_train_std[:, selected], columns=selected_features), 'data/processed/X_train_l1.csv')
save_csv(pd.DataFrame(X_valid_std[:, selected], columns=selected_features), 'data/processed/X_valid_l1.csv')
save_csv(pd.DataFrame(X_test_std[:, selected], columns=selected_features), 'data/processed/X_test_l1.csv')
print(f'L1 a retenu {len(selected_features)} features.')


In [None]:
### PLS (optional) - a third, supervised dimensionality reduction: PLS

# PLSRegression rejects n_components larger than the number of features, so
# the candidate grid (2..20) is capped at the width of the training matrix.
grid_k = {'n_components': list(range(2, min(20, X_train_std.shape[1]) + 1))}
pls = GridSearchCV(PLSRegression(scale=False), grid_k, scoring='neg_mean_squared_error', cv=tscv, n_jobs=-1)
# If the target is binary, you may map {0,1} to {-1,1} or stay with regression + sign.
pls.fit(X_train_std, y_train)
pls_best = pls.best_estimator_

# Project every split onto the components of the selected model.
Z_train = pls_best.transform(X_train_std)
Z_valid = pls_best.transform(X_valid_std)
Z_test = pls_best.transform(X_test_std)

save_csv(pd.DataFrame(Z_train), 'data/processed/X_train_pls.csv')
save_csv(pd.DataFrame(Z_valid), 'data/processed/X_valid_pls.csv')
save_csv(pd.DataFrame(Z_test), 'data/processed/X_test_pls.csv')
print('PLS n_components=', pls_best.n_components)

In [None]:
### Quick evaluation - compares the baseline without reduction against the
### three reductions (PCA, L1, PLS) using a standard logistic regression,
### then exports a summary table

def eval_setting(Xtr, ytr, Xva, yva, name):
    """Fit an L2 logistic regression on one feature set and score it on validation.

    Parameters
    ----------
    Xtr, ytr : training features and target.
    Xva, yva : validation features and target.
    name : label identifying the evaluated setting.

    Returns
    -------
    tuple ``(name, acc)`` where ``acc`` is the validation accuracy.
    """
    clf = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=5000)
    clf.fit(Xtr, ytr)
    pred = clf.predict(Xva)
    acc = accuracy_score(yva, pred)
    return name, acc
# Collect one (setting, accuracy) pair per representation of the features.
results = []
results.append(eval_setting(X_train_std, y_train, X_valid_std, y_valid, 'Baseline-Std'))
results.append(eval_setting(X_train_pca, y_train, X_valid_pca, y_valid, 'PCA'))

# The L1 and PLS entries are only added when their variables exist in the namespace.
if 'selected' in locals():
    # Flatten to 1-D integer positions: this accepts both a plain index array
    # and the tuple returned by np.where. The standardized matrices are NumPy
    # arrays, so columns are picked with array indexing (a DataFrame does not
    # support `[:, cols]`).
    l1_cols = np.asarray(selected).ravel()
    if l1_cols.size:  # a model cannot be fitted on zero features
        results.append(eval_setting(X_train_std[:, l1_cols], y_train, X_valid_std[:, l1_cols], y_valid, 'L1'))
if 'Z_train' in locals():
    results.append(eval_setting(Z_train, y_train, Z_valid, y_valid, 'PLS'))

res = pd.DataFrame(results, columns=['setting', 'accuracy'])
save_csv(res, 'reports/results_cv_valid.csv')
res