In [1]:
# Importation des bilbioth√®ques
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Chargement des donn√©es
# nb de solutions : regrouper ?

url = './data_D/data_D.solution'
df = pd.read_csv(url, sep='\s+', header=None)
df.head()

#Teste si les colonnes contiennent des valeurs nulles
#df.columns[df.isnull().any()]
#df.shape

#def clean_data(data):
    #"""
    #Regarde si des nan sont pr√©sentes dans des colonnes, si oui, les supprimer.
    #D√©tecter s'il s'agit d'une classification ou d'une r√©gression et appliquer le bon algorithme, 
    #en choisissant les bons param√®tres.
    #Supprimer les variables peu informatives ou fortement cor√©l√©es (TD maths, corr√©lation accumulatives...).
    #Appliquer √©ventuellement des techniques de r√©duction dimensionnelle (PCA, t-SNE) pour visualiser et d√©cider
    #"""


Unnamed: 0,0
0,1
1,1
2,0
3,0
4,1


In [2]:
import pandas as pd
# Lecture des features (.data) dans un DataFrame pandas
data_df = pd.read_csv('./data_D/data_D.data', sep='\s+', header=None, na_values='NaN')
print("Dimensions des donn√©es du dossier A :", data_df.shape)
print(data_df.head())
# Lecture des cibles (.solution)
solution_df = pd.read_csv('./data_D/data_D.solution', sep='\s+', header=None)
print("Dimensions des solutions :", solution_df.shape)
print(solution_df.head())
# Lecture des types (.type)
with open('./data_D/data_D.type', 'r') as f:
    feature_types = [line.strip() for line in f.readlines()]
print("Nombre de features annonc√©es :", len(feature_types))
print("Exemples de types de features :", feature_types[:5])

Dimensions des donn√©es du dossier A : (2984, 144)
   0    1    2    3    4    5    6    7    8    9    ...  134  135  136  137  \
0    1    0    0    0    0    1    1    0    1    0  ...    1    0    1    0   
1    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    1   
2    0    0    0    0    0    1    0    0    0    0  ...    0    0    1    0   
3    0    0    0    0    0    1    0    0    1    0  ...    0    0    0    0   
4    0    0    0    0    0    0    1    0    1    0  ...    0    0    0    1   

   138  139  140  141  142  143  
0    1    0    1    0    0    0  
1    0    1    0    0    0    0  
2    0    1    0    0    0    1  
3    0    0    0    1    0    0  
4    0    0    0    1    0    0  

[5 rows x 144 columns]
Dimensions des solutions : (2984, 1)
   0
0  1
1  1
2  0
3  0
4  1
Nombre de features annonc√©es : 144
Exemples de types de features : ['Binary', 'Binary', 'Binary', 'Binary', 'Binary']


In [3]:
# D√©tection des valeurs manquantes par colonne
missing_counts = data_df.isna().sum()
print("Valeurs manquantes par colonne :")
print(missing_counts[missing_counts > 0])
# Exemple: s√©paration entra√Ænement/validation (80% entrainement et 20% validation)

from sklearn.model_selection import train_test_split
X = data_df

y = solution_df 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("Taille du train set :", X_train.shape, "| Taille du set de validation :", X_val.shape)

Valeurs manquantes par colonne :
Series([], dtype: int64)
Taille du train set : (2387, 144) | Taille du set de validation : (597, 144)


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
# S√©paration des colonnes num√©riques et cat√©gorielles d‚Äôapr√®s feature_types
numeric_features = [i for i, t in enumerate(feature_types) if t == 'Numerical']
categorical_features = [i for i, t in enumerate(feature_types) if t == 'Categorical']
# Transformer pour les num√©riques: impute (moyenne) puis standardisation -> remplace NaN par la moyenne de la colonne
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),('scaler', StandardScaler())])

# Transformer pour les cat√©gorielles : impute (remplace NaN par la cat√©gorie la plus fr√©quente) puis one-hot encode et ignore si cat√©gorie inconnue en test
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# Combinaison des transformations par colonnes
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),('cat', categorical_transformer, categorical_features)])
X_train_prepared = preprocessor.fit_transform(X_train)
X_val_prepared = preprocessor.transform(X_val)
print("Dimensions des features apr√®s pr√©traitement :", X_train_prepared.shape)

Dimensions des features apr√®s pr√©traitement : (2387, 8)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import warnings
#warnings.filterwarnings("ignore", category=UserWarning)  # Cache warnings mineurs


# ALIGNEMENT si n√©cessaire
if 'X_train_arr' not in locals():
    n_train = min(X_train.shape[0], len(y_train))
    X_train_arr = X_train[:n_train]
    y_train_arr = np.asarray(y_train)[:n_train].ravel()
    n_val = min(X_val.shape[0], len(y_val))
    X_val_arr = X_val[:n_val]
    y_val_arr = np.asarray(y_val)[:n_val].ravel()

multilabel = y_train_arr.ndim > 1

models = {
    'LogisticRegression': make_pipeline(
        StandardScaler(), 
        LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
    ),
    'RandomForest': RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
}

best_model, best_name, best_score = None, None, -1
for name, clf in models.items():
    print(f"{name}")
    clf.fit(X_train_arr, y_train_arr)
    y_pred = clf.predict(X_val_arr)
    score = accuracy_score(y_val_arr, y_pred)
    print(f"Score: {score:.4f}")
    if score > best_score:
        best_score, best_name, best_model = score, name, clf

print(f"\nMeilleur : {best_name}:")
print(classification_report(y_val_arr, best_model.predict(X_val_arr)))
print("Confusion Matrix:")
print(confusion_matrix(y_val_arr, best_model.predict(X_val_arr)))

print(f"\nMeilleur : {best_name} ({best_score:.4f})")


‚è≥ LogisticRegression...
   Score: 0.7571
‚è≥ RandomForest...
   Score: 0.8023

üìä RandomForest:
              precision    recall  f1-score   support

           0       0.88      0.68      0.77       288
           1       0.76      0.91      0.83       309

    accuracy                           0.80       597
   macro avg       0.82      0.80      0.80       597
weighted avg       0.82      0.80      0.80       597

Confusion Matrix:
[[197  91]
 [ 27 282]]



In [6]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred_val = best_model.predict(X_val_arr)
if multilabel:
    print("Rapport de classification multi-label (par √©tiquette) :")
    for i in range(y_val_arr.shape[1]):
        print(f"√âtiquette {i}:")
        print(classification_report(y_val_arr[:, i], y_pred_val[:, i]))
else:
    print("Rapport de classification :")
    print(classification_report(y_val_arr, y_pred_val))
    print("Matrice de confusion :")
    print(confusion_matrix(y_val_arr, y_pred_val))

Rapport de classification :
              precision    recall  f1-score   support

           0       0.88      0.68      0.77       288
           1       0.76      0.91      0.83       309

    accuracy                           0.80       597
   macro avg       0.82      0.80      0.80       597
weighted avg       0.82      0.80      0.80       597

Matrice de confusion :
[[197  91]
 [ 27 282]]


In [7]:
# Poursuivant avec best_model obtenu
# Pr√©dictions sur le set de test (ici X_val sert d'exemple de nouveau set)
y_pred = best_model.predict(X_val_arr)
print("Quelques pr√©dictions :\n", y_pred[:5])

Quelques pr√©dictions :
 [1 1 1 1 1]


In [8]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

# Feature selection 
selector = SelectKBest(f_classif, k=min(50, X_train_arr.shape[1]))
X_train_sel = selector.fit_transform(X_train_arr, y_train_arr)
X_val_sel = selector.transform(X_val_arr)

rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_sel, y_train_arr)
print(f"RF + Feature Selection: {accuracy_score(y_val_arr, rf.predict(X_val_sel)):.4f}")

# Feature importance
print("Top 5 features:", rf.feature_importances_.argsort()[-5:])


RF + Feature Selection: 0.8174
Top 5 features: [12 39 18 14 15]
