# Imports

In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

# Verification du dataset

In [80]:
features  = [
    'ID', 
    'Age', # 18-24, 25-34, 35-44, 45-54, 55-64, 65+
    'Gender', # Female, Male
    'Education', # Left before age 16, left @ 16, @ 17, @ 18, some college, prof cert, univ degree, masters, doctorate
    'Country', # Country: AUS, CAN, NZ, Other, IRE, UK, USA
    'Ethnicity', # Ethnicity: Asian, Black, Mixed Bla/As, Mixed Whi/As, Mixed Whi/Bla, Other
    'Neuroticism',
    'Extraversion',
    'Openness to experience',
    'Agreeableness',
    'Conscientiousness',
    'Impulsiveness',
    'Sensation seeking',
    'Alcohol',
    'Amphetamines',
    'Amyl nitrite',
    'Benzodiazepine',
    'Caffeine',
    'Cannabis',
    'Chocolate',
    'Cocaine',
    'Crack',
    'Ecstasy',
    'Heroin',
    'Ketamine',
    'Legal highs',
    'Lysergic acid diethylamide',
    'Methadone',
    'Magic mushrooms',
    'Nicotine',
    'Fictitious drug Semeron',
    'Volatile substance abuse'
]

In [81]:
data = pd.read_csv('drug_consumption.data', header=None, names=features)
print("{} instances with {} features\n".format(*data.shape))
data.head()

1885 instances with 32 features



Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Neuroticism,Extraversion,Openness to experience,Agreeableness,...,Ecstasy,Heroin,Ketamine,Legal highs,Lysergic acid diethylamide,Methadone,Magic mushrooms,Nicotine,Fictitious drug Semeron,Volatile substance abuse
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [82]:
print(data.describe())

                ID         Age       Gender    Education      Country  \
count  1885.000000  1885.00000  1885.000000  1885.000000  1885.000000   
mean    945.294960     0.03461    -0.000256    -0.003806     0.355542   
std     545.167641     0.87836     0.482588     0.950078     0.700335   
min       1.000000    -0.95197    -0.482460    -2.435910    -0.570090   
25%     474.000000    -0.95197    -0.482460    -0.611130    -0.570090   
50%     946.000000    -0.07854    -0.482460    -0.059210     0.960820   
75%    1417.000000     0.49788     0.482460     0.454680     0.960820   
max    1888.000000     2.59171     0.482460     1.984370     0.960820   

         Ethnicity  Neuroticism  Extraversion  Openness to experience  \
count  1885.000000  1885.000000   1885.000000             1885.000000   
mean     -0.309577     0.000047     -0.000163               -0.000534   
std       0.166226     0.998106      0.997448                0.996229   
min      -1.107020    -3.464360     -3.273930     

Les frequences d'utilisation des differentes drogues ne sont pas inclus car elles sont toujours typé en texte et non en valeur numérique.
On constate également que nous n'avons pas de valeurs manquantes pour les attributs personnels des individus. Nous avons bien 1885 individus pour chaque attributs.
Nous allons tout de même verifier pour les frequences d'utilisations de drogues.

In [83]:
data.isna().sum()

ID                            0
Age                           0
Gender                        0
Education                     0
Country                       0
Ethnicity                     0
Neuroticism                   0
Extraversion                  0
Openness to experience        0
Agreeableness                 0
Conscientiousness             0
Impulsiveness                 0
Sensation seeking             0
Alcohol                       0
Amphetamines                  0
Amyl nitrite                  0
Benzodiazepine                0
Caffeine                      0
Cannabis                      0
Chocolate                     0
Cocaine                       0
Crack                         0
Ecstasy                       0
Heroin                        0
Ketamine                      0
Legal highs                   0
Lysergic acid diethylamide    0
Methadone                     0
Magic mushrooms               0
Nicotine                      0
Fictitious drug Semeron       0
Volatile

Nous n'avons donc pas de valeurs manquantes.

Les données que nous possédons sur la personalité de chaque individu ont déjà été centré réduite et sont plus au moins comprise entre -4 et 4. Elles ne sont pas très parlante. Par conséquent, nous allons convertir ces valeurs numériques en valeurs nominales.

# Conversion des données

In [84]:
data_copy = data.copy() # Nous allons faire des modifications sur les données, donc nous faisons une copie au préalable

In [85]:
data['Age']=data['Age'].astype('str')
data['Gender']=data['Gender'].astype('str')
data['Education']=data['Education'].astype('str')
data['Country']=data['Country'].astype('str')
data['Ethnicity']=data['Ethnicity'].astype('str')

data['Age'].value_counts()
data['Gender'].value_counts()
data['Education'].value_counts()
data['Country'].value_counts()
data['Ethnicity'].value_counts()

-0.31685               1720
0.1144                   63
-1.1070200000000001      33
-0.50212                 26
0.126                    20
-0.22166                 20
1.90725                   3
Name: Ethnicity, dtype: int64

In [86]:
data['Age'].replace({
    '-0.9519700000000001' :'18-24',
    '-0.07854':'25-34',
    '0.49788000000000004':'35-44',
    '1.09449':'45-54',
    '1.82213':'55-64',
    '2.59171':'65+'}, 
    inplace=True)

data['Gender'].replace({
    '-0.48246000000000006' :'Female',
    '0.48246000000000006':'Male'}, 
    inplace=True)

data['Education'].replace({
    '-0.059210000000000006':'Certificate Diploma',
    '1.98437':'Doctorate',
    '1.16365':'Masters',
    '-1.22751':'Left School at 18',
    '-1.7379':'Left School at 16',
    '0.45468000000000003':'University Degree',
    '-0.6111300000000001':'Some College',
    '-2.43591':'Left School Before 16',
    '-1.43719':'Left School at 17'},
    inplace=True)

data['Country'].replace({
    '0.9608200000000001' :'UK',
    '-0.57009':'USA',
    '-0.28519':'Other',
    '0.24923':'Canada',
    '-0.09765 ':'Australia',
    '0.21128000000000002':'Ireland',
    '-0.46841000000000005':'New Zealand'}, 
    inplace=True)

data['Ethnicity'].replace({
    '-0.31685' :'White',
    '0.1144':'Other ',
    '-1.1070200000000001':'Black',
    '-0.50212 ':'Asian',
    '0.126':'Mixed-White/Black',
    '-0.22166':'Mixed-White/Asian',
    '1.90725':'Mixed-Black/Asian'}, 
    inplace=True)


In [87]:
drugs = [
    'Alcohol',
    'Amphetamines',
    'Amyl nitrite',
    'Benzodiazepine',
    'Caffeine',
    'Cannabis',
    'Chocolate',
    'Cocaine',
    'Crack',
    'Ecstasy',
    'Heroin',
    'Ketamine',
    'Legal highs',
    'Lysergic acid diethylamide',
    'Methadone',
    'Magic mushrooms',
    'Nicotine',
    'Fictitious drug Semeron',
    'Volatile substance abuse'
]

for i in drugs:
    data[i] = data[i].map({'CL0': 0, 'CL1': 1, 'CL2': 2, 'CL3': 3, 'CL4': 4, 'CL5': 5, 'CL6': 6})
    


data.head(10)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Neuroticism,Extraversion,Openness to experience,Agreeableness,...,Ecstasy,Heroin,Ketamine,Legal highs,Lysergic acid diethylamide,Methadone,Magic mushrooms,Nicotine,Fictitious drug Semeron,Volatile substance abuse
0,1,35-44,Male,Certificate Diploma,UK,Mixed-White/Black,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,2,0,0
1,2,25-34,Female,Doctorate,UK,White,-0.67825,1.93886,1.43533,0.76096,...,4,0,2,0,2,3,0,4,0,0
2,3,35-44,Female,Certificate Diploma,UK,White,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,1,0,0,0
3,4,18-24,Male,Masters,UK,White,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,2,0,0,0,0,2,0,0
4,5,35-44,Male,Doctorate,UK,White,0.73545,-1.6334,-0.45174,-0.30172,...,1,0,0,1,0,0,2,2,0,0
5,6,65+,Male,Left School at 18,Canada,White,-0.67825,-0.30033,-1.55521,2.03972,...,0,0,0,0,0,0,0,6,0,0
6,7,45-54,Female,Masters,USA,White,-0.46725,-1.09207,-0.45174,-0.30172,...,0,0,0,0,0,0,0,6,0,0
7,8,35-44,Female,Left School at 16,UK,White,-1.32828,1.93886,-0.84732,-0.30172,...,0,0,0,0,0,0,0,0,0,0
8,9,35-44,Male,Certificate Diploma,Canada,White,0.62967,2.57309,-0.97631,0.76096,...,0,0,0,0,0,0,0,6,0,0
9,10,55-64,Female,Masters,UK,White,-0.24649,0.00332,-1.42424,0.59042,...,0,0,0,0,0,0,0,6,0,0


Nous avons maintenant un dataset propre avec des valeurs pertinentes.