In [20]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.calibration import LinearSVC
from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay

# Load the dataset (path is relative to the notebook's working directory).
df = pd.read_csv("../data/new_dataset.csv")

def get_classification_score(dataset, list_of_features, label, model, *,
                             test_size=0.25, random_state=0,
                             show_confusion_matrix=False):
    """Train ``model`` on a subset of columns and return its test accuracy.

    The function handles the whole pipeline: it selects the requested
    feature columns, splits the rows into train and test sets, fits the
    model on the train set and scores its predictions on the test set.

    Args:
        dataset: DataFrame holding both the feature columns and the target
            column. It is neither copied nor modified.
        list_of_features: Column label(s) used as predictors. A list/array
            of labels or a single label are both accepted.
        label: Name of the target column.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        test_size: Forwarded to ``train_test_split`` (default 0.25).
        random_state: Seed forwarded to ``train_test_split`` (default 0).
        show_confusion_matrix: When true, plot the confusion matrix of the
            fitted model on the test set before returning.

    Returns:
        The accuracy of the model's predictions on the test set.

    Raises:
        KeyError: If ``label`` or a requested feature is not a column of
            ``dataset``, or if ``label`` is itself listed as a feature
            (which would leak the target into the inputs).
    """
    # Refuse to train on the target itself; np.atleast_1d lets a single
    # column label and a list of labels be checked the same way.
    if label in np.atleast_1d(list_of_features):
        raise KeyError(f"{label!r} is the target and cannot be used as a feature")

    y = dataset[label]
    X = dataset[list_of_features].values

    # A single column label yields a 1-D array; estimators expect 2-D input.
    if X.ndim < 2:
        X = X.reshape(-1, 1)

    # The row partition only depends on the number of rows and the seed,
    # so selecting the columns before splitting does not change the split.
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model.fit(Xtrain, ytrain)
    ypredit = model.predict(Xtest)

    if show_confusion_matrix:
        # Axis labels default to the fitted estimator's classes, which keeps
        # them aligned with the rows/columns of the matrix.
        ConfusionMatrixDisplay.from_estimator(
            model,
            Xtest,
            ytest,
            cmap=plt.cm.Blues,
        )
        plt.show()

    return accuracy_score(ytest, ypredit)

# Model: decision tree using the entropy criterion to choose splits.
# random_state is fixed so that repeated runs on the same features give the
# same tree (the train/test split is already seeded; the estimator was not).
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)

def generate_random_features_list(all_features, max_features=-1, rng=None):
    """Return a random, duplicate-free subset of ``all_features``.

    Args:
        all_features: Sequence of candidate feature names.
        max_features: Upper bound on the subset size. ``-1`` (the default)
            or ``None`` means "no sampling": ``all_features`` itself is
            returned. Values larger than ``len(all_features)`` are clamped,
            because sampling without replacement cannot return more items
            than exist.
        rng: Optional ``numpy.random.Generator`` used for reproducible
            sampling. When omitted, the global ``np.random`` state is used.

    Returns:
        ``all_features`` unchanged when no sampling is requested; otherwise
        a NumPy array holding between 1 and ``max_features`` distinct names
        in random order.

    Raises:
        ValueError: If sampling is requested but fewer than one feature can
            be drawn (``max_features < 1`` or ``all_features`` is empty).
    """
    if max_features is None or max_features == -1:
        return all_features

    upper = min(max_features, len(all_features))
    if upper < 1:
        raise ValueError(
            "cannot sample features: max_features must be -1 or >= 1 "
            "and all_features must not be empty"
        )

    if rng is None:
        # Legacy path: same calls on the global RNG as before, so code that
        # seeds with np.random.seed keeps producing the same subsets.
        n_features = np.random.randint(1, upper + 1)
        return np.random.choice(all_features, size=n_features, replace=False)

    n_features = int(rng.integers(1, upper + 1))
    return rng.choice(all_features, size=n_features, replace=False)

# Pool of column names from which the random feature subsets are drawn.
all_features = [
    "Weight", "Height", "Gender", "Age",
    "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC",
    "SMOKE", "CH2O", "SCC", "FAF",
    "TUE", "CALC", "MTRANS",
]

# Number of random feature subsets to evaluate. Both loops below derive
# their length from the collected results, so this is the only place to
# change it.
N_TRIALS = 10

tab_score = []
tab_feature = []

# Evaluate the model on several random feature combinations.
for _ in range(N_TRIALS):
    features = generate_random_features_list(all_features, len(all_features))
    score = get_classification_score(df, features, "NObeyesdad", tree)
    tab_score.append(score)
    tab_feature.append(features)
    print(f"{features} : {score}")

# Recap: one line per trial, score first.
for trial_score, trial_features in zip(tab_score, tab_feature):
    print(trial_score, " avec les features suivantes : ", trial_features)


['SMOKE' 'Height' 'family_history_with_overweight' 'FCVC' 'FAVC' 'Weight'
 'FAF' 'SCC' 'MTRANS' 'CH2O' 'Gender' 'Age' 'CALC'] : 0.9425287356321839
['CH2O' 'SMOKE' 'FAF' 'MTRANS' 'family_history_with_overweight' 'NCP'
 'Weight' 'CAEC' 'Height' 'Gender' 'Age' 'SCC' 'CALC' 'TUE' 'FAVC' 'FCVC'] : 0.9425287356321839
['NCP' 'Gender' 'Age' 'MTRANS' 'SMOKE' 'SCC' 'Weight' 'Height' 'CALC'
 'CH2O'] : 0.9348659003831418
['FAVC' 'Height' 'FCVC' 'Weight' 'Age' 'NCP' 'SCC' 'MTRANS' 'CH2O' 'SMOKE'
 'Gender' 'CAEC' 'family_history_with_overweight' 'TUE'] : 0.9386973180076629
['Weight' 'SCC' 'CH2O' 'FCVC' 'Height' 'NCP' 'Age' 'Gender'] : 0.9425287356321839
['MTRANS' 'family_history_with_overweight'] : 0.28735632183908044
['family_history_with_overweight' 'CH2O' 'MTRANS' 'CALC' 'CAEC' 'SCC'
 'FCVC'] : 0.5057471264367817
['Weight'] : 0.6628352490421456
['TUE' 'CALC' 'family_history_with_overweight' 'MTRANS' 'Weight'] : 0.7701149425287356
['SCC' 'Height' 'MTRANS' 'Weight' 'TUE' 'FAVC' 'Age' 'CH2O' 'CAEC'
