In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Column names applied to the raw file; they are passed explicitly through
# `names=` when reading it below.
header = [
    "id", "age", "gender", "education", "country", "ethnicity",
    "neuroticism", "extraversion", "openness", "agreeableness",
    "conscientiousness", "impulsiveness", "sensation_seeking",
    "alcohol", "amphet", "amyl", "benzos", "caff", "cannabis", "choc",
    "coke", "crack", "ecstasy", "heroin", "ketamine", "legallh", "lsd",
    "meth", "mushroom", "nicotine", "semer", "vsa",
]

# Read the drug-consumption file directly from the UCI archive URL.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data",
    names=header,
)
df

## Preprocessing

Semer est une drogue fictive ajoutée au questionnaire par les chercheurs.  
Les personnes ayant répondu en avoir consommé ont donc menti ; nous avons donc décidé de les supprimer du dataset.

In [None]:
# "semer" is the fictitious drug: every answer other than "CL0" is treated as
# a false claim of consumption.
claimed_semer = df["semer"] != "CL0"
semer_consummer_index = df[claimed_semer].index

print("Nombre de personnes ayant indiqué avoir consommé la drogue fictive : ", semer_consummer_index.size)

# Remove those respondents from `df` itself (in place), so the following cells
# only see the remaining rows.
df.drop(index=semer_consummer_index, inplace=True)

In [None]:
# Quick statistical summary of `df` as it stands after the in-place drop of
# the respondents who claimed to have used "semer".
df.describe()

Le dataset a toutes ses informations modifiées par un Standard Scaler. Or, certaines features sont catégorielles et, pour la visualisation, il est plus intéressant d'avoir les vraies valeurs.

In [None]:
# Lookup tables, one per column: key = scaled value as stored in the dataset,
# value = human-readable category (demographic columns) or integer score
# (personality columns). Keys are written in ascending order.
# NOTE(review): labels and scores presumably come from the dataset's official
# description — confirm against it before relying on them.
age = {-0.95197:"18-24", -0.07854:"25-34", 0.49788:"35-44", 1.09449:"45-54", 1.82213:"55-64", 2.59171:"65+"}
gender = {-0.48246:"male", 0.48246:"female"}
education = {-2.43591:"left school before 16", -1.7379:"left school at 16", -1.43719:"left school at 17", -1.22751:"left school at 18", -0.61113:"left college without degree", -0.05921:"professionnal certificate", 0.45468:"university degree", 1.16365:"master degree", 1.98437:"doctorate degree"}
country = {-0.57009:"USA", -0.46841:"New Zealand", -0.28519:"Other", -0.09765:"Australia", 0.21128:"Republic of Ireland", 0.24923:"Canada", 0.96082:"UK"}
ethnicity = {-1.10702: "Black", -0.50212: "Asian", -0.31685: "White", -0.22166: "Mixed-White/Black", 0.11440: "Other", 0.12600: "Mixed-White/Asian", 1.90725: "Mixed-Black/Asian"}
neuroticism = {-3.464360:12, -3.157350:13, -2.756960:14, -2.521970:15, -2.423170:16, -2.343600:17, -2.218440:18, -2.050480:19, -1.869620:20, -1.691630:21, -1.550780:22, -1.439070:23, -1.328280:24, -1.194300:25, -1.053080:26, -0.921040:27, -0.791510:28, -0.678250:29, -0.580160:30, -0.467250:31, -0.347990:32, -0.246490:33, -0.148820:34, -0.051880:35, 0.042570:36, 0.136060:37, 0.223930:38, 0.312870:39, 0.416670:40, 0.521350:41, 0.629670:42, 0.735450:43, 0.825620:44, 0.910930:45, 1.021190:46, 1.132810:47, 1.234610:48, 1.372970:49, 1.491580:50, 1.603830:51, 1.720120:52, 1.839900:53, 1.984370:54, 2.127000:55, 2.285540:56, 2.462620:57, 2.611390:58, 2.821960:59, 3.273930:60}
extraversion = {-3.273930:16, -3.005370:18, -2.728270:19, -2.538300:20, -2.449040:21, -2.323380:22, -2.210690:23, -2.114370:24, -2.039720:25, -1.921730:26, -1.762500:27, -1.633400:28, -1.507960:29, -1.376390:30, -1.231770:31, -1.092070:32, -0.947790:33, -0.806150:34, -0.695090:35, -0.575450:36, -0.439990:37, -0.300330:38, -0.154870:39, 0.003320:40, 0.167670:41, 0.321970:42, 0.476170:43, 0.637790:44, 0.805230:45, 0.962480:46, 1.114060:47, 1.286100:48, 1.454210:49, 1.584870:50, 1.740910:51, 1.938860:52, 2.127000:53, 2.323380:54, 2.573090:55, 2.859500:56, 3.005370:58, 3.273930:59}
openness = {-3.273930:24, -2.859500:26, -2.631990:28, -2.398830:29, -2.210690:30, -2.090150:31, -1.974950:32, -1.829190:33, -1.680620:34, -1.555210:35, -1.424240:36, -1.275530:37, -1.119020:38, -0.976310:39, -0.847320:40, -0.717270:41, -0.583310:42, -0.451740:43, -0.317760:44, -0.177790:45, -0.019280:46, 0.141430:47, 0.293380:48, 0.445850:49, 0.583310:50, 0.723300:51, 0.883090:52, 1.062380:53, 1.240330:54, 1.435330:55, 1.656530:56, 1.885110:57, 2.153240:58, 2.449040:59, 2.901610:60}
agreeableness = {-3.464360:12, -3.157350:16, -3.005370:18, -2.901610:23, -2.787930:24, -2.701720:25, -2.538300:26, -2.354130:27, -2.218440:28, -2.078480:29, -1.925950:30, -1.772000:31, -1.620900:32, -1.479550:33, -1.342890:34, -1.212130:35, -1.075330:36, -0.916990:37, -0.760960:38, -0.606330:39, -0.453210:40, -0.301720:41, -0.154870:42, -0.017290:43, 0.131360:44, 0.287830:45, 0.438520:46, 0.590420:47, 0.760960:48, 0.941560:49, 1.114060:50, 1.286100:51, 1.450390:52, 1.611080:53, 1.818660:54, 2.039720:55, 2.234270:56, 2.462620:57, 2.756960:58, 3.157350:59, 3.464360:60}
conscientiousness  = {-3.464360:17, -3.157350:19, -2.901610:20, -2.728270:21, -2.573090:22, -2.423170:23, -2.304080:24, -2.181090:25, -2.045060:26, -1.921730:27, -1.781690:28, -1.641010:29, -1.518400:30, -1.385020:31, -1.257730:32, -1.137880:33, -1.014500:34, -0.898910:35, -0.781550:36, -0.652530:37, -0.527450:38, -0.405810:39, -0.276070:40, -0.142770:41, -0.006650:42, 0.123310:43, 0.259530:44, 0.415940:45, 0.584890:46, 0.758300:47, 0.939490:48, 1.134070:49, 1.306120:50, 1.461910:51, 1.630880:52, 1.811750:53, 2.045060:54, 2.333370:55, 2.631990:56, 3.005370:57, 3.464360:59}

# Registry of the tables above, keyed by DataFrame column name. Columns with
# no entry here (impulsiveness, sensation_seeking, the drug columns) have no
# label table.
features = {'age':age, 'gender':gender, 'education':education, 'country':country, 'ethnicity':ethnicity, 
            'neuroticism':neuroticism, 'extraversion':extraversion, 'openness':openness, 
            'agreeableness':agreeableness, 'conscientiousness':conscientiousness,}


On a choisi de garder uniquement le cannabis comme objectif. \
Aussi, on n'utilisera pas ethnicity pour des raisons éthiques.

In [None]:
# Columns kept for the cannabis study: the selected features followed by the
# "cannabis" target. "id", "ethnicity" and the other drug columns are left out.
cannabis_columns = [
    "age", "gender", "education", "country",
    "neuroticism", "extraversion", "openness", "agreeableness", "conscientiousness",
    "impulsiveness", "sensation_seeking",
    "cannabis",  # target, kept last: the plotting loops iterate over columns[:-1]
]
df_cannabis = df.loc[:, cannabis_columns]
df_cannabis

#### On change le type de la colonne cannabis  
-1 signifie que la personne n'est pas consommatrice, 1 signifie qu'elle l'est.
Les valeurs CLx indiquent à quand remonte la dernière prise de cette drogue par la personne.
Nous avons choisi de fixer à 1 an le seuil à partir duquel on considère que la personne n'est plus consommatrice.
Si elle n'a pas consommé depuis 1 an, alors cette personne n'est pas considérée comme consommatrice. Concrètement, CL0, CL1 et CL2 donnent -1 ; toutes les autres valeurs donnent 1.

In [None]:
def CL_to_binary_class(cl):
    """Turn a "CLx" consumption code into a binary class label.

    Args:
        cl: consumption code read from a drug column (e.g. "CL0" ... "CL6").

    Returns:
        -1 when `cl` is "CL0", "CL1" or "CL2" (treated as non-consumer),
        1 for any other value (treated as consumer).
    """
    non_consumer_codes = ("CL0", "CL1", "CL2")
    return -1 if cl in non_consumer_codes else 1

In [None]:
# Recode from the untouched raw answers still held in `df`, not from
# `df_cannabis` itself. Reading the already recoded column back would make
# this cell non-idempotent: on a second run -1 is not a "CLx" code, so every
# row would become 1.
cannabis_class = df.loc[df_cannabis.index, "cannabis"].tolist()

# Plain column assignment replaces the column, so it ends up with an integer
# dtype instead of keeping the original object dtype.
df_cannabis["cannabis"] = [CL_to_binary_class(cl) for cl in cannabis_class]
df_cannabis['cannabis'].value_counts()

In [None]:
# For every feature column (all but the trailing "cannabis" target), draw three
# side-by-side bar charts of the head count per feature value: everyone,
# cannabis consumers, and non-consumers.
for col in df_cannabis.columns[:-1]:
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15, 4))

    fig.suptitle("répartition de la colonne : %s" % col)

    # Head count per distinct value of `col`. Building the frame from a dict of
    # Series aligns the three counts on the union of their indexes; a value
    # absent from one class comes out as NaN and is turned into 0.
    df_count = pd.DataFrame({
        "consommateur": df_cannabis.loc[df_cannabis["cannabis"] == 1, col].value_counts(),
        "non consommateur": df_cannabis.loc[df_cannabis["cannabis"] == -1, col].value_counts(),
        "total": df_cannabis[col].value_counts(),
    }).fillna(0).sort_index()

    # Bar labels: look each scaled value up in the column's table instead of
    # pairing the table's labels with the bars by position, which mislabels or
    # fails as soon as one category is missing from the data. Both sides are
    # rounded to 5 decimals so a last-digit float parsing difference cannot
    # break the lookup. Columns without a table keep their raw values.
    mapping = features.get(col)
    if mapping is None:
        X = df_count.index.tolist()
    else:
        by_value = {round(key, 5): label for key, label in mapping.items()}
        # A value missing from the table keeps its raw scaled value.
        X = [by_value.get(round(value, 5), value) for value in df_count.index]
        if any(isinstance(label, str) for label in X):
            # Keep the axis purely categorical if a raw value slipped in.
            X = [str(label) for label in X]

    panels = (
        (ax1, df_count["total"], "tout le monde"),
        (ax2, df_count["consommateur"], "consommateur de cannabis"),
        (ax3, df_count["non consommateur"], "non consommateur de cannabis"),
    )
    for axis, heights, title in panels:
        axis.bar(X, heights)
        axis.set_title(title)
    ax1.set_ylabel("nombre de personnes")

    # Draw first, presumably so the tick labels are populated before they are
    # read back and re-applied with a rotation.
    plt.draw()
    for axis in (ax1, ax2, ax3):
        axis.set_xticks(axis.get_xticks(), axis.get_xticklabels(), rotation=45, ha='right')
    

In [None]:
# For every feature column (all but the trailing "cannabis" target), draw one
# stacked bar chart giving, for each feature value, the percentage of cannabis
# consumers and non-consumers among the respondents having that value.
for col in df_cannabis.columns[:-1]:

    # Head count per distinct value of `col`, aligned on the union of the
    # three indexes; a value absent from one class becomes 0.
    df_percent = pd.DataFrame({
        "consommateur": df_cannabis.loc[df_cannabis["cannabis"] == 1, col].value_counts(),
        "non consommateur": df_cannabis.loc[df_cannabis["cannabis"] == -1, col].value_counts(),
        "total": df_cannabis[col].value_counts(),
    }).fillna(0).sort_index()

    # Convert both class counts into percentages of the per-value total.
    for group in ("consommateur", "non consommateur"):
        df_percent[group] = df_percent[group] / df_percent["total"] * 100

    # Bar labels: look each scaled value up in the column's table instead of
    # pairing the table's labels with the bars by position, which mislabels or
    # fails as soon as one category is missing from the data. Both sides are
    # rounded to 5 decimals so a last-digit float parsing difference cannot
    # break the lookup. Columns without a table keep their raw values.
    mapping = features.get(col)
    if mapping is None:
        X = df_percent.index.tolist()
    else:
        by_value = {round(key, 5): label for key, label in mapping.items()}
        # A value missing from the table keeps its raw scaled value.
        X = [by_value.get(round(value, 5), value) for value in df_percent.index]
        if any(isinstance(label, str) for label in X):
            # Keep the axis purely categorical if a raw value slipped in.
            X = [str(label) for label in X]

    Y1 = df_percent["consommateur"]
    Y2 = df_percent["non consommateur"]

    fig, ax = plt.subplots(figsize=(15,6))
    fig.suptitle("Pourcentage de consommateurs de cannabis en fonction de l'attribut : %s" % col)

    width = 1
    ax.bar(X, Y1, width, label="consommateurs", color="green")
    # Stack the non-consumers on top of the consumers.
    ax.bar(X, Y2, width, bottom=Y1.to_numpy(), label="non consommateurs", color="lightgray")
    ax.set_ylabel("Pourcentage")

    # Draw first, presumably so the tick labels are populated before they are
    # read back and re-applied with a rotation.
    plt.draw()
    ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend(loc="upper right")

plt.show()