### groupe
Arifette Nassim  
Derathe Pierre   
De Amorim Matthias  
Mesbah Slimane

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, confusion_matrix, classification_report


In [None]:
# Column names passed to read_csv through `names=`: an id, respondent attributes,
# then one column per substance.
header = [
    "id", "age", "gender", "education", "country", "ethnicity",
    "neuroticism", "extraversion", "openness", "agreeableness",
    "conscientiousness", "impulsiveness", "sensation_seeking",
    "alcohol", "amphet", "amyl", "benzos", "caff", "cannabis", "choc",
    "coke", "crack", "ecstasy", "heroin", "ketamine", "legallh", "lsd",
    "meth", "mushroom", "nicotine", "semer", "vsa",
]

# Download the raw data file and label its columns with the names above.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data",
    names=header,
)
df

In [None]:
# Index labels of the respondents whose "semer" answer is anything other than "CL0"
# (the message below refers to semer as the fictitious drug).
semer_consummer_index = df[df["semer"] != "CL0"].index

print("Nombre de personnes ayant indiqué avoir consommé la drogue fictive : ", semer_consummer_index.size)

# Remove those respondents from the working frame, in place.
df.drop(labels=semer_consummer_index, axis="index", inplace=True)

In [None]:
# Keep the columns used as features plus the "cannabis" target; "ethnicity" is left out.
# .copy() makes df_cannabis an independent frame: later cells assign to its columns, and
# writing into a selection of `df` is what pandas' chained-assignment warning is about.
df_cannabis = df.loc[:, ["age", "gender", "education", "country", "neuroticism", "extraversion",
                         "openness", "agreeableness", "conscientiousness", "impulsiveness",
                         "sensation_seeking", "cannabis"]].copy()
df_cannabis

#### On change le type de la colonne cannabis  
-1 signifie que la personne n'est pas consommatrice, 1 signifie qu'elle l'est.
Les valeurs CLx indiquent à quand remonte la dernière prise de cette drogue par la personne.
Nous avons choisi de fixer le seuil à partir duquel on considère que la personne n'est plus consommatrice à 1 an.
Si elle n'a pas consommé depuis 1 an alors cette personne n'est pas consommatrice.

In [None]:
def CL_to_binary_class(cl):
    """Map a usage code to a binary class label.

    Returns -1 when *cl* is "CL0", "CL1" or "CL2", and 1 for any other value.
    """
    return -1 if cl in ("CL0", "CL1", "CL2") else 1

# Replace each usage code of the target column with its -1 / 1 class label.
cannabis_codes = df_cannabis["cannabis"]
df_cannabis["cannabis"] = cannabis_codes.apply(CL_to_binary_class)
df_cannabis

#### On renomme les valeurs des colonnes *age*, *gender*, *education* et *country* pour rendre plus lisibles les noms des colonnes après l'application du one-hot

In [None]:
# Lookup tables from the numeric codes stored in the data to readable labels.
# Lookups are exact float matches, so the keys must equal the parsed column values.
age = {-0.95197:"18-24", -0.07854:"25-34", 0.49788:"35-44", 1.09449:"45-54", 1.82213:"55-64", 2.59171:"65+"}
gender = {-0.48246:"male", 0.48246:"female"}
education = {-2.43591:"left school before 16", -1.7379:"left school at 16", -1.43719:"left school at 17", -1.22751:"left school at 18", -0.61113:"left college without degree", -0.05921:"professionnal certificate", 0.45468:"university degree", 1.16365:"master degree", 1.98437:"doctorate degree"}
country = {-0.57009:"USA", -0.46841:"New Zealand", -0.28519:"Other", -0.09765:"Australia", 0.21128:"Republic of Ireland", 0.24923:"Canada", 0.96082:"UK"}


def _make_renamer(labels):
    """Return a function that maps an encoded value to its label in *labels*.

    The returned function holds its own reference to *labels*, so it keeps
    working after the module-level name of the table has been deleted.
    A value that is not a key of *labels* raises KeyError.
    """
    def rename(x):
        return labels[x]
    return rename


# One renamer per categorical column; each takes an encoded value and returns its label.
rename_age = _make_renamer(age)
rename_gender = _make_renamer(gender)
rename_education = _make_renamer(education)
rename_country = _make_renamer(country)

# Swap the numeric codes of each categorical column for their readable labels.
for column, renamer in (
    ("age", rename_age),
    ("gender", rename_gender),
    ("education", rename_education),
    ("country", rename_country),
):
    df_cannabis[column] = df_cannabis[column].apply(renamer)

# Remove the lookup tables from the namespace once the columns are converted.
del age, gender, education, country

df_cannabis

In [None]:
# Print, for every categorical feature, how many respondents fall into each category.
for column in ("country", "education", "age", "gender"):
    print(df_cannabis[column].value_counts())

Plusieurs valeurs sont très minoritaires : le modèle aura du mal à apprendre à partir de si peu d'exemples.
On fusionne donc les valeurs minoritaires :

In [None]:
# Collapse several categories into a single broader label, printing the new counts
# after each merge. Each entry is (column, labels to merge, replacement label).
merges = (
    ("country",
     ["Other", "Canada", "Australia", "New Zealand", "Republic of Ireland"],
     "Other"),
    ("education",
     ["left school at 18", "left school at 16", "left school at 17", "left school before 16"],
     "left school before 18 (included)"),
    ("age",
     ["45-54", "55-64", "65+"],
     "45+"),
)
for column, old_labels, new_label in merges:
    df_cannabis[column] = df_cannabis[column].replace(old_labels, new_label)
    print(df_cannabis[column].value_counts())


#### On fait le one-hot sur les colonnes *age*, *gender*, *education* et *country* et on combine les nouvelles colonnes avec les autres features

In [None]:
# One-hot encode the categorical features.
categorical_features = ['age', 'gender', 'education', 'country']
enc = OneHotEncoder(handle_unknown='ignore').fit(df_cannabis[categorical_features])
# Reuse df_cannabis's index: rows were dropped earlier, so a default 0..n-1 index
# would not line up with it in the concat below.
df_cannabis_encoded = pd.DataFrame(
    enc.transform(df_cannabis[categorical_features]).toarray(),
    columns=enc.get_feature_names_out(),
    index=df_cannabis.index,
)
# Features = remaining (non-categorical, non-target) columns followed by the one-hot columns.
X = pd.concat(
    [df_cannabis.drop(columns=categorical_features + ["cannabis"]), df_cannabis_encoded],
    axis=1,
)
# Target = the binary cannabis label.
y = df_cannabis['cannabis']

In [None]:
# Display the names of the feature columns in X.
X.columns

#### PCA 2 dimensions pour visualiser la répartition des données

In [None]:
# Fit a 2-component PCA on the feature matrix and project the data onto it.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)
# Fraction of the total variance explained by each of the two components.
print(pca.explained_variance_ratio_)

In [None]:
# Scatter the samples in the 2-D PCA plane, coloured by their class label in y.
first_dir, second_dir = X_2d[:, 0], X_2d[:, 1]
plt.scatter(first_dir, second_dir, c=y, marker='o', edgecolors='k')
plt.xlabel('first dir')
plt.ylabel('second dir')
plt.show()

#### Séparation des données en train, valid et test

In [None]:
# Hold out 20% of the samples as the test set; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if classes are imbalanced.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split the train+validation part again: 20% of it becomes the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=0.2,
    random_state=42,
)

### Petit test avec arbre de décision

In [None]:
# Quick check: fit a small tree (at most 7 leaves) on the training split,
# draw it, and score it on the validation split.
clf = DecisionTreeClassifier(max_leaf_nodes=7, random_state=0)
clf.fit(X_train, y_train)
fig, ax = plt.subplots(figsize=(30, 20))
# plot_tree documents feature_names as a list of str; some scikit-learn releases
# reject a pandas Index, so convert it explicitly.
tree.plot_tree(clf, fontsize=20, ax=ax, feature_names=list(X.columns))
y_pred = clf.predict(X_valid)
print(balanced_accuracy_score(y_valid, y_pred))

In [None]:
# 5-fold cross-validation of the current classifier on the train+validation data
# (default scorer); print the mean and standard deviation of the per-fold scores.
score_train = cross_val_score(estimator=clf, X=X_train_valid, y=y_train_valid, cv=5)
print(np.mean(score_train), np.std(score_train))

# Test de plusieurs hyperparamètres des arbres de décision

### Nombre de feuilles

In [None]:
# Hyperparameter sweep: maximum number of leaves (max_leaf_nodes).

nom_param = "nombre de feuilles"
limite_param = 50
scoring = ["balanced_accuracy", "roc_auc"]
# max_leaf_nodes values actually evaluated (3 .. limite_param + 2); also used as the
# x-axis so the plot always shows the values that were really tested.
param_values = list(range(3, limite_param + 3))

# For metric i: slot 2*i holds the cross-validated test scores, slot 2*i + 1 the train scores.
means = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]
stds = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]

for n, value in enumerate(param_values):
    clf = DecisionTreeClassifier(max_leaf_nodes=value, random_state=0)
    # No clf.fit() here: cross_validate fits its own clones of the estimator on each
    # fold, so a fit done beforehand would be discarded.
    result = cross_validate(clf, X_train_valid, y_train_valid, cv=5, n_jobs=-1, scoring=scoring, return_train_score=True)
    for i, metric in enumerate(scoring):
        means[i*2][n] = result["test_" + metric].mean()
        means[i*2 + 1][n] = result["train_" + metric].mean()
        stds[i*2][n] = result["test_" + metric].std()
        stds[i*2 + 1][n] = result["train_" + metric].std()

# One subplot per metric: mean score with standard-deviation error bars, test vs train.
fig, axs = plt.subplots(len(scoring), 1, figsize=(20, 14))

for i, metric in enumerate(scoring):
    axs[i].errorbar(param_values, means[i*2], stds[i*2], label="test")
    axs[i].errorbar(param_values, means[i*2 + 1], stds[i*2 + 1], label="train")
    axs[i].set_xlabel(nom_param)
    axs[i].set_ylabel(metric)
    axs[i].legend()
    axs[i].grid()
    axs[i].set_xticks(range(0, limite_param+5))

### Profondeur maximale

In [None]:
# Hyperparameter sweep: maximum tree depth (max_depth).
nom_param = "profondeur max"
limite_param = 10
scoring = ["balanced_accuracy", "roc_auc"]
# max_depth values actually evaluated (1 .. limite_param). They are also used as the
# x-axis: plotting against "index + 3" would shift every point by two units.
param_values = list(range(1, limite_param + 1))

# For metric i: slot 2*i holds the cross-validated test scores, slot 2*i + 1 the train scores.
means = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]
stds = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]

for n, value in enumerate(param_values):
    clf = DecisionTreeClassifier(max_depth=value, random_state=0)
    # No clf.fit() here: cross_validate fits its own clones of the estimator on each
    # fold, so a fit done beforehand would be discarded.
    result = cross_validate(clf, X_train_valid, y_train_valid, cv=5, n_jobs=-1, scoring=scoring, return_train_score=True)
    for i, metric in enumerate(scoring):
        means[i*2][n] = result["test_" + metric].mean()
        means[i*2 + 1][n] = result["train_" + metric].mean()
        stds[i*2][n] = result["test_" + metric].std()
        stds[i*2 + 1][n] = result["train_" + metric].std()

# One subplot per metric: mean score with standard-deviation error bars, test vs train.
fig, axs = plt.subplots(len(scoring), 1, figsize=(20, 14))

for i, metric in enumerate(scoring):
    axs[i].errorbar(param_values, means[i*2], stds[i*2], label="test")
    axs[i].errorbar(param_values, means[i*2 + 1], stds[i*2 + 1], label="train")
    axs[i].set_xlabel(nom_param)
    axs[i].set_ylabel(metric)
    axs[i].legend()
    axs[i].grid()
    axs[i].set_xticks(range(0, limite_param+5))

### Nombre minimum d'échantillons nécessaires pour effectuer un nouveau branchement

In [None]:
# Hyperparameter sweep: minimum number of samples required to split a node (min_samples_split).
nom_param = "nombre de sample minimum pour split"
limite_param = 250
scoring = ["balanced_accuracy", "roc_auc"]
# min_samples_split values actually evaluated (2 .. limite_param + 1). They are also used
# as the x-axis: plotting against "index + 3" would shift every point by one unit.
param_values = list(range(2, limite_param + 2))

# For metric i: slot 2*i holds the cross-validated test scores, slot 2*i + 1 the train scores.
means = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]
stds = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]

for n, value in enumerate(param_values):
    clf = DecisionTreeClassifier(min_samples_split=value, random_state=0)
    # No clf.fit() here: cross_validate fits its own clones of the estimator on each
    # fold, so a fit done beforehand would be discarded.
    result = cross_validate(clf, X_train_valid, y_train_valid, cv=5, n_jobs=-1, scoring=scoring, return_train_score=True)
    for i, metric in enumerate(scoring):
        means[i*2][n] = result["test_" + metric].mean()
        means[i*2 + 1][n] = result["train_" + metric].mean()
        stds[i*2][n] = result["test_" + metric].std()
        stds[i*2 + 1][n] = result["train_" + metric].std()

# One subplot per metric: mean score with standard-deviation error bars, test vs train.
fig, axs = plt.subplots(len(scoring), 1, figsize=(20, 14))

for i, metric in enumerate(scoring):
    axs[i].errorbar(param_values, means[i*2], stds[i*2], label="test")
    axs[i].errorbar(param_values, means[i*2 + 1], stds[i*2 + 1], label="train")
    axs[i].set_xlabel(nom_param)
    axs[i].set_ylabel(metric)
    axs[i].legend()
    axs[i].grid()
    axs[i].set_xticks(range(0, limite_param+5, 5))

### Nombre minimum d'échantillons par feuille

In [None]:
# Hyperparameter sweep: minimum number of samples per leaf (min_samples_leaf).
nom_param = "nombre de sample minimum par feuille"
limite_param = 140
scoring = ["balanced_accuracy", "roc_auc"]
# min_samples_leaf values actually evaluated (1 .. limite_param). They are also used as
# the x-axis: plotting against "index + 3" would shift every point by two units.
param_values = list(range(1, limite_param + 1))

# For metric i: slot 2*i holds the cross-validated test scores, slot 2*i + 1 the train scores.
means = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]
stds = [np.zeros(limite_param) for _ in range(len(scoring) * 2)]

for n, value in enumerate(param_values):
    clf = DecisionTreeClassifier(min_samples_leaf=value, random_state=0)
    # No clf.fit() here: cross_validate fits its own clones of the estimator on each
    # fold, so a fit done beforehand would be discarded.
    result = cross_validate(clf, X_train_valid, y_train_valid, cv=5, n_jobs=-1, scoring=scoring, return_train_score=True)
    for i, metric in enumerate(scoring):
        means[i*2][n] = result["test_" + metric].mean()
        means[i*2 + 1][n] = result["train_" + metric].mean()
        stds[i*2][n] = result["test_" + metric].std()
        stds[i*2 + 1][n] = result["train_" + metric].std()

# One subplot per metric: mean score with standard-deviation error bars, test vs train.
fig, axs = plt.subplots(len(scoring), 1, figsize=(20, 14))

for i, metric in enumerate(scoring):
    axs[i].errorbar(param_values, means[i*2], stds[i*2], label="test")
    axs[i].errorbar(param_values, means[i*2 + 1], stds[i*2 + 1], label="train")
    axs[i].set_xlabel(nom_param)
    axs[i].set_ylabel(metric)
    axs[i].legend()
    axs[i].grid()
    axs[i].set_xticks(range(0, limite_param+5, 5))

## Prediction sur X_test

In [None]:
# Final model: a tree with min_samples_leaf=47, fitted on the training split and
# scored (balanced accuracy) on the held-out test set.
# NOTE(review): 47 was presumably read off the min_samples_leaf tuning curve — confirm it
# against the values actually evaluated. The tuning cross-validated on X_train_valid while
# this fit uses X_train only; consider refitting on X_train_valid before testing.
clf = DecisionTreeClassifier(min_samples_leaf=47, random_state=0).fit(X_train, y_train)
# Tree drawing is left disabled in this cell:
# fig, ax = plt.subplots(figsize=(30, 20))
# tree.plot_tree(clf, fontsize=20, ax = ax, feature_names=X.columns)
y_pred = clf.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))