In [115]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.calibration import LinearSVC
from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay

# Load the dataset used by every experiment below.
df = pd.read_csv("../data/new_dataset.csv")

def get_classification_score(dataset, list_of_features, label, model,
                             test_size=0.25, random_state=0):
    """Fit ``model`` on a subset of columns and return its hold-out accuracy.

    The rows of ``dataset`` are split into a training and a test partition,
    the selected feature columns are standardised with a scaler fitted on
    the training partition only, ``model`` is fitted on the training
    partition and its predictions on the test partition are scored.

    Args:
        dataset: DataFrame containing the feature columns and the ``label``
            column. It is not modified.
        list_of_features: A single column name, or a list/array of column
            names, to use as model inputs.
        label: Name of the target column.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so the caller's object is left in a fitted state.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded ``0.25``.
        random_state: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded ``0``.

    Returns:
        The value of ``accuracy_score`` computed on the test partition.

    Raises:
        KeyError: If ``label`` is also listed as a feature, or if a requested
            column is missing from ``dataset``.
    """
    # Normalise a lone column name to a one-element list so the feature
    # matrix is always two-dimensional (no reshape needed afterwards).
    if isinstance(list_of_features, str):
        columns = [list_of_features]
    else:
        columns = list(list_of_features)

    # Using the target as an input would leak the answer into the model.
    if label in columns:
        raise KeyError(
            f"label column {label!r} cannot also be used as a feature"
        )

    # Select only what is needed instead of copying the whole DataFrame.
    X = dataset[columns].to_numpy()
    y = dataset[label]

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse it on the test rows.
    # NOTE(review): the file imports StandardScaler from
    # sklearn.discriminant_analysis; its documented home is
    # sklearn.preprocessing -- consider fixing the import.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    model.fit(Xtrain, ytrain)
    ypredit = model.predict(Xtest)

    return accuracy_score(ytest, ypredit)

# Base learner and the three multiclass strategies built on top of it.
tree = DecisionTreeClassifier(random_state=5)
ovo_tree, ovr_tree = OneVsOneClassifier(tree), OneVsRestClassifier(tree)
ecoc = OutputCodeClassifier(tree, code_size=2)

# Linear SVM baseline (one-vs-rest). Feature scaling is not done here; it
# happens inside get_classification_score.
# NOTE(review): the file imports LinearSVC from sklearn.calibration; its
# documented home is sklearn.svm -- consider fixing the import.
linear_svc = LinearSVC(C=12, dual=False, max_iter=15000, multi_class='ovr')

# Disabled alternative, kept for reference: tune the tree with
#   GridSearchCV(tree, param_grid={"max_depth": [3, 5, 8]}, cv=3)
# and hand that estimator to the three wrappers above instead of `tree`.
#
# Disabled one-off check, kept for reference:
#   get_classification_score(
#       df,
#       ["Weight", "Height", "family_history_with_overweight", "CAEC"],
#       "NObeyesdad",
#       linear_svc,
#   )

def generate_random_features_list(all_features, max_features=-1):
    """Draw a random, duplicate-free subset of ``all_features``.

    Args:
        all_features: Sequence of candidate feature names.
        max_features: Largest subset size that may be drawn. The default
            ``-1`` disables sampling. Values larger than
            ``len(all_features)`` are capped at that length, because a
            sample without replacement cannot exceed the population.

    Returns:
        ``all_features`` itself when ``max_features`` is ``-1``; otherwise a
        NumPy array holding between 1 and
        ``min(max_features, len(all_features))`` distinct entries, drawn
        with NumPy's global random state.

    Raises:
        ValueError: If no subset of at least one feature can be drawn
            (``all_features`` is empty, or ``max_features`` is below 1 and
            not the ``-1`` sentinel).
    """
    if max_features == -1:
        return all_features

    # Cap the size so np.random.choice(replace=False) can never be asked for
    # more items than exist.
    largest = min(max_features, len(all_features))
    if largest < 1:
        raise ValueError(
            "cannot draw a non-empty feature subset: "
            f"max_features={max_features!r}, "
            f"len(all_features)={len(all_features)}"
        )

    n_features = np.random.randint(1, largest + 1)
    return np.random.choice(all_features, size=n_features, replace=False)

# Every candidate column a random feature subset may be drawn from.
all_features = [
    "Weight", "Height", "Gender", "Age", "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF", "TUE",
    "CALC", "MTRANS",
]

# Experiment 1: score the linear SVM on 30 random feature subsets and print
# all 30 accuracies.
tab_score, tab_feature = [], []
for _ in range(30):
    features = generate_random_features_list(
        all_features, max_features=len(all_features)
    )
    score = get_classification_score(
        df, features, label="NObeyesdad", model=linear_svc
    )
    tab_score.append(score)
    tab_feature.append(features)

print(tab_score)

# Experiment 2: same protocol with the decision tree. The result lists are
# reset first, so the SVM subsets recorded above are discarded here.
tab_score, tab_feature = [], []
for _ in range(30):
    features = generate_random_features_list(
        all_features, max_features=len(all_features)
    )
    score = get_classification_score(
        df, features, label="NObeyesdad", model=tree
    )
    tab_score.append(score)
    tab_feature.append(features)

# Show the first 10 tree runs with the feature subset each one used.
for shown_score, shown_features in zip(tab_score[:10], tab_feature[:10]):
    print(shown_score, " avec les features suivantes : ", shown_features)


[0.22348484848484848, 0.75, 0.7765151515151515, 0.5681818181818182, 0.4185606060606061, 0.3522727272727273, 0.44507575757575757, 0.5359848484848485, 0.2897727272727273, 0.2897727272727273, 0.7935606060606061, 0.5984848484848485, 0.7178030303030303, 0.7859848484848485, 0.7386363636363636, 0.36363636363636365, 0.32575757575757575, 0.26515151515151514, 0.7178030303030303, 0.7859848484848485, 0.5189393939393939, 0.5018939393939394, 0.7462121212121212, 0.5757575757575758, 0.19128787878787878, 0.7367424242424242, 0.49242424242424243, 0.509469696969697, 0.5303030303030303, 0.7859848484848485]
0.6231060606060606  avec les features suivantes :  ['Age' 'CH2O' 'FAF' 'SCC' 'MTRANS' 'Height'
 'family_history_with_overweight']
0.9412878787878788  avec les features suivantes :  ['family_history_with_overweight' 'TUE' 'Weight' 'FAF' 'SCC' 'CAEC'
 'Gender' 'Age' 'Height' 'SMOKE']
0.8238636363636364  avec les features suivantes :  ['FAVC' 'FCVC' 'SCC' 'Gender' 'Weight' 'family_history_with_overweight'
 