In [189]:
import pandas as pd
import numpy as np

# Fase 2:
1. Criar um Jupyter Notebook para iniciar a exploração dos conjuntos de
dados
2. No notebook, calcular medidas de centralidade e dispersão das variáveis
disponíveis
3. No notebook, criar boxplots para as variáveis disponíveis

In [190]:
# Load the label-encoded survey (the path is relative to the notebook's working directory)
# and preview the first five rows.
drugs_df = pd.read_csv('datasets/Drug_Consumption.csv')
drugs_df.head(5)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,AScore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,2,25-34,M,Doctorate degree,UK,White,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
1,3,35-44,M,Professional certificate/ diploma,UK,White,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
2,4,18-24,F,Masters degree,UK,White,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
3,5,35-44,F,Doctorate degree,UK,White,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
4,6,65+,F,Left school at 18 years,Canada,White,-0.67825,-0.30033,-1.55521,2.03972,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0


#### Informações do dataset:
- ID: the record number in the original database. It cannot be linked to the participant and is for reference only.
- Age (Real) is the age of participant
- Gender: Male or Female
- Education: level of education of participant
- Country: country of origin of the participant
- Ethnicity: ethnicity of participant
- Nscore (Real) is NEO-FFI-R Neuroticism
- Escore (Real) is NEO-FFI-R Extraversion
- Oscore (Real) is NEO-FFI-R Openness to experience.
- AScore (Real) is NEO-FFI-R Agreeableness.
- Cscore (Real) is NEO-FFI-R Conscientiousness.
- Impulsive (Real) is impulsiveness measured by BIS-11
- SS (Real) is sensation seeking measured by ImpSS
- Alcohol: alcohol consumption
- Amphet: amphetamines consumption
- Amyl: amyl nitrite consumption
- Benzos: benzodiazepine consumption
- Caff: caffeine consumption
- Cannabis: marijuana consumption
- Choc: chocolate consumption
- Coke: cocaine consumption
- Crack: crack cocaine consumption
- Ecstasy: ecstasy consumption
- Heroin: heroin consumption
- Ketamine: ketamine consumption
- Legalh: legal highs consumption
- LSD: LSD consumption
- Meth: methadone consumption
- Mushrooms: magic mushroom consumption
- Nicotine: nicotine consumption
- Semer: class of fictitious drug Semeron consumption (i.e. control)
- VSA: class of volatile substance abuse consumption
##### Ratings for Drug Use:

- CL0 Never Used
- CL1 Used over a Decade Ago
- CL2 Used in Last Decade
- CL3 Used in Last Year
- CL4 Used in Last Month
- CL5 Used in Last Week
- CL6 Used in Last Day

In [219]:
# Helper used throughout the notebook for the exploratory analysis of a single column.
def infos(data, info = '', describe = True, means = None, fraction = True):
    """Print a frequency table (and optionally ``describe()``) for one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to summarise.
    info : str
        Name of the column in ``data``.
    describe : bool, default True
        When true, print ``data[info].describe()`` before the table.
    means : list-like or dict, optional
        Human-readable meaning for each row of the table. A list-like is
        matched by position to the rows, which are sorted by descending
        frequency (not by value). A dict is looked up by the column's values,
        so it does not depend on the row order. ``None`` or an empty container
        adds no ``Meaning`` column.
    fraction : bool, default True
        When true, add a ``Fraction(%)`` column with each value's share of the
        non-null entries, rounded to two decimals.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    column = data[info]

    # Name the axis and the values explicitly instead of renaming the columns
    # produced by reset_index(): their default names differ between pandas
    # releases ('index'/<info> before 2.0, <info>/'count' from 2.0 on).
    df = column.value_counts().rename_axis(info).reset_index(name='Count')

    if fraction:
        # count() ignores missing values, so it matches the total of the counts above.
        df['Fraction(%)'] = (df['Count'] * 100 / column.count()).round(2)

    # None replaces the former mutable [] default; an empty list still means "no meanings".
    if means is not None and len(means) > 0:
        if isinstance(means, dict):
            # Look each value up, independent of the frequency ordering.
            df['Meaning'] = df[info].map(means)
        else:
            # Positional: the i-th meaning goes to the i-th most frequent value.
            df['Meaning'] = pd.Series(means)

    if describe:
        print('Describe:')
        print(column.describe())

    print(df)

In [220]:
# Frequency table and describe() summary for the age-bracket column.
infos(drugs_df, info='Age')

Describe:
count      1884
unique        6
top       18-24
freq        643
Name: Age, dtype: object
     Age  Count  Fraction(%)
0  18-24    643        34.13
1  25-34    481        25.53
2  35-44    355        18.84
3  45-54    294        15.61
4  55-64     93         4.94
5    65+     18         0.96


In [193]:
# Frequency table and describe() summary for the Gender column.
infos(drugs_df, info='Gender')

Describe:
count     1884
unique       2
top          M
freq       943
Name: Gender, dtype: object
  Gender  Count  Fraction(%)
0      M    943        50.05
1      F    941        49.95


In [194]:
# Frequency table and describe() summary for the Country column.
infos(drugs_df, info='Country')

Describe:
count     1884
unique       7
top         UK
freq      1043
Name: Country, dtype: object
               Country  Count  Fraction(%)
0                   UK   1043        55.36
1                  USA    557        29.56
2                Other    118         6.26
3               Canada     87         4.62
4            Australia     54         2.87
5  Republic of Ireland     20         1.06
6          New Zealand      5         0.27


In [195]:
# Frequency table and describe() summary for the Ethnicity column.
infos(drugs_df, info='Ethnicity')

Describe:
count      1884
unique        7
top       White
freq       1720
Name: Ethnicity, dtype: object
           Ethnicity  Count  Fraction(%)
0              White   1720        91.30
1              Other     63         3.34
2              Black     33         1.75
3              Asian     26         1.38
4  Mixed-White/Black     20         1.06
5  Mixed-White/Asian     19         1.01
6  Mixed-Black/Asian      3         0.16


In [196]:
# Auxiliary frame holding only the substance-usage columns.
# .loc label slices include both endpoints, so this keeps Alcohol through Nicotine;
# Semer and VSA come after Nicotine in drugs_df and are therefore left out.
# NOTE(review): confirm that dropping VSA (and the Semer control) is intentional.
drugs_df_drugs = drugs_df.loc[:, "Alcohol":"Nicotine"]
drugs_df_drugs.head(5)

Unnamed: 0,Alcohol,Amphet,Amyl,Benzos,Caff,Cannabis,Choc,Coke,Crack,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine
0,CL5,CL2,CL2,CL0,CL6,CL4,CL6,CL3,CL0,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4
1,CL6,CL0,CL0,CL0,CL6,CL3,CL4,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0
2,CL4,CL0,CL0,CL3,CL5,CL2,CL4,CL2,CL0,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2
3,CL4,CL1,CL1,CL0,CL6,CL3,CL6,CL0,CL0,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2
4,CL2,CL0,CL0,CL0,CL6,CL0,CL4,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6


In [197]:
# Distribution of the CL0-CL6 usage classes for Alcohol.
infos(drugs_df_drugs, info='Alcohol')

Describe:
count     1884
unique       7
top        CL5
freq       758
Name: Alcohol, dtype: object
  Alcohol  Count  Fraction(%)
0     CL5    758        40.23
1     CL6    505        26.80
2     CL4    287        15.23
3     CL3    198        10.51
4     CL2     68         3.61
5     CL1     34         1.80
6     CL0     34         1.80


In [198]:
# Distribution of the CL0-CL6 usage classes for Amphet.
infos(drugs_df_drugs, info='Amphet')

Describe:
count     1884
unique       7
top        CL0
freq       976
Name: Amphet, dtype: object
  Amphet  Count  Fraction(%)
0    CL0    976        51.80
1    CL2    242        12.85
2    CL1    230        12.21
3    CL3    198        10.51
4    CL6    102         5.41
5    CL4     75         3.98
6    CL5     61         3.24


In [199]:
# Distribution of the CL0-CL6 usage classes for Amyl.
infos(drugs_df_drugs, info='Amyl')

Describe:
count     1884
unique       7
top        CL0
freq      1304
Name: Amyl, dtype: object
  Amyl  Count  Fraction(%)
0  CL0   1304        69.21
1  CL2    237        12.58
2  CL1    210        11.15
3  CL3     92         4.88
4  CL4     24         1.27
5  CL5     14         0.74
6  CL6      3         0.16


In [200]:
# Distribution of the CL0-CL6 usage classes for Benzos.
infos(drugs_df_drugs, info='Benzos')

Describe:
count     1884
unique       7
top        CL0
freq      1000
Name: Benzos, dtype: object
  Benzos  Count  Fraction(%)
0    CL0   1000        53.08
1    CL3    236        12.53
2    CL2    233        12.37
3    CL4    120         6.37
4    CL1    116         6.16
5    CL6     95         5.04
6    CL5     84         4.46


In [201]:
# Distribution of the CL0-CL6 usage classes for Caff.
infos(drugs_df_drugs, info='Caff')

Describe:
count     1884
unique       7
top        CL6
freq      1384
Name: Caff, dtype: object
  Caff  Count  Fraction(%)
0  CL6   1384        73.46
1  CL5    273        14.49
2  CL4    106         5.63
3  CL3     60         3.18
4  CL0     27         1.43
5  CL2     24         1.27
6  CL1     10         0.53


In [202]:
# Distribution of the CL0-CL6 usage classes for Cannabis.
infos(drugs_df_drugs, info='Cannabis')

Describe:
count     1884
unique       7
top        CL6
freq       463
Name: Cannabis, dtype: object
  Cannabis  Count  Fraction(%)
0      CL6    463        24.58
1      CL0    412        21.87
2      CL2    266        14.12
3      CL3    211        11.20
4      CL1    207        10.99
5      CL5    185         9.82
6      CL4    140         7.43


In [203]:
# Distribution of the CL0-CL6 usage classes for Choc.
infos(drugs_df_drugs, info='Choc')

Describe:
count     1884
unique       7
top        CL6
freq       807
Name: Choc, dtype: object
  Choc  Count  Fraction(%)
0  CL6    807        42.83
1  CL5    682        36.20
2  CL4    296        15.71
3  CL3     54         2.87
4  CL0     32         1.70
5  CL2     10         0.53
6  CL1      3         0.16


In [204]:
# Distribution of the CL0-CL6 usage classes for Coke.
infos(drugs_df_drugs, info='Coke')

Describe:
count     1884
unique       7
top        CL0
freq      1037
Name: Coke, dtype: object
  Coke  Count  Fraction(%)
0  CL0   1037        55.04
1  CL2    270        14.33
2  CL3    258        13.69
3  CL1    160         8.49
4  CL4     99         5.25
5  CL5     41         2.18
6  CL6     19         1.01


In [205]:
# Distribution of the CL0-CL6 usage classes for Crack.
infos(drugs_df_drugs, info='Crack')

Describe:
count     1884
unique       7
top        CL0
freq      1626
Name: Crack, dtype: object
  Crack  Count  Fraction(%)
0   CL0   1626        86.31
1   CL2    112         5.94
2   CL1     67         3.56
3   CL3     59         3.13
4   CL5      9         0.48
5   CL4      9         0.48
6   CL6      2         0.11


In [206]:
# Distribution of the CL0-CL6 usage classes for Ecstasy.
infos(drugs_df_drugs, info='Ecstasy')

Describe:
count     1884
unique       7
top        CL0
freq      1020
Name: Ecstasy, dtype: object
  Ecstasy  Count  Fraction(%)
0     CL0   1020        54.14
1     CL3    277        14.70
2     CL2    234        12.42
3     CL4    156         8.28
4     CL1    113         6.00
5     CL5     63         3.34
6     CL6     21         1.11


In [207]:
# Distribution of the CL0-CL6 usage classes for Heroin.
infos(drugs_df_drugs, info='Heroin')

Describe:
count     1884
unique       7
top        CL0
freq      1604
Name: Heroin, dtype: object
  Heroin  Count  Fraction(%)
0    CL0   1604        85.14
1    CL2     94         4.99
2    CL1     68         3.61
3    CL3     65         3.45
4    CL4     24         1.27
5    CL5     16         0.85
6    CL6     13         0.69


In [208]:
# Distribution of the CL0-CL6 usage classes for Ketamine.
infos(drugs_df_drugs, info='Ketamine')

Describe:
count     1884
unique       7
top        CL0
freq      1489
Name: Ketamine, dtype: object
  Ketamine  Count  Fraction(%)
0      CL0   1489        79.03
1      CL2    142         7.54
2      CL3    129         6.85
3      CL1     45         2.39
4      CL4     42         2.23
5      CL5     33         1.75
6      CL6      4         0.21


In [209]:
# Distribution of the CL0-CL6 usage classes for Legalh.
infos(drugs_df_drugs, info='Legalh')

Describe:
count     1884
unique       7
top        CL0
freq      1093
Name: Legalh, dtype: object
  Legalh  Count  Fraction(%)
0    CL0   1093        58.01
1    CL3    323        17.14
2    CL2    198        10.51
3    CL4    110         5.84
4    CL6     67         3.56
5    CL5     64         3.40
6    CL1     29         1.54


In [210]:
# Distribution of the CL0-CL6 usage classes for LSD.
infos(drugs_df_drugs, info='LSD')

Describe:
count     1884
unique       7
top        CL0
freq      1068
Name: LSD, dtype: object
   LSD  Count  Fraction(%)
0  CL0   1068        56.69
1  CL1    259        13.75
2  CL3    214        11.36
3  CL2    177         9.39
4  CL4     97         5.15
5  CL5     56         2.97
6  CL6     13         0.69


In [211]:
# Distribution of the CL0-CL6 usage classes for Meth.
infos(drugs_df_drugs, info='Meth')

Describe:
count     1884
unique       7
top        CL0
freq      1428
Name: Meth, dtype: object
  Meth  Count  Fraction(%)
0  CL0   1428        75.80
1  CL3    149         7.91
2  CL2     97         5.15
3  CL6     73         3.87
4  CL4     50         2.65
5  CL5     48         2.55
6  CL1     39         2.07


In [212]:
# Distribution of the CL0-CL6 usage classes for Mushrooms.
infos(drugs_df_drugs, info='Mushrooms')

Describe:
count     1884
unique       7
top        CL0
freq       981
Name: Mushrooms, dtype: object
  Mushrooms  Count  Fraction(%)
0       CL0    981        52.07
1       CL3    275        14.60
2       CL2    260        13.80
3       CL1    209        11.09
4       CL4    115         6.10
5       CL5     40         2.12
6       CL6      4         0.21


In [213]:
# Distribution of the CL0-CL6 usage classes for Nicotine.
infos(drugs_df_drugs, info='Nicotine')

Describe:
count     1884
unique       7
top        CL6
freq       610
Name: Nicotine, dtype: object
  Nicotine  Count  Fraction(%)
0      CL6    610        32.38
1      CL0    428        22.72
2      CL2    203        10.77
3      CL1    193        10.24
4      CL3    185         9.82
5      CL5    157         8.33
6      CL4    108         5.73


In [214]:
# Load the quantified version of the survey, in which the demographic columns
# (Age, Gender, Education, Country, Ethnicity) hold float codes instead of labels.
drugs_df_quant = pd.read_csv('datasets/Drug_Consumption_Quantified.csv')
# Bare expression: displays the frame (pandas elides the middle rows and columns).
drugs_df_quant

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,AScore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
1,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.62090,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
2,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
3,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.63340,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
4,6,2.59171,0.48246,-1.22751,0.24923,-0.31685,-0.67825,-0.30033,-1.55521,2.03972,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1879,1884,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,-1.19430,1.74091,1.88511,0.76096,...,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1880,1885,-0.95197,-0.48246,-0.61113,-0.57009,-0.31685,-0.24649,1.74091,0.58331,0.76096,...,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1881,1886,-0.07854,0.48246,0.45468,-0.57009,-0.31685,1.13281,-1.37639,-1.27553,-1.77200,...,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1882,1887,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.91093,-1.92173,0.29338,-1.62090,...,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


In [215]:
# Same summary on the quantified data: Age is a float code here, so describe()
# reports numeric statistics (mean, std, quartiles) instead of unique/top/freq.
infos(drugs_df_quant, info='Age')

Describe:
count    1884.000000
mean        0.034364
std         0.878529
min        -0.951970
25%        -0.951970
50%        -0.078540
75%         0.497880
max         2.591710
Name: Age, dtype: float64
       Age  Count  Fraction(%)
0 -0.95197    643        34.13
1 -0.07854    481        25.53
2  0.49788    355        18.84
3  1.09449    294        15.61
4  1.82213     93         4.94
5  2.59171     18         0.96


In [216]:
# Attach the age-bracket label to each float code.
# The list is matched by row position in the frequency-sorted table, not by value;
# the labels line up here only because the bracket counts decrease as age increases.
infos(drugs_df_quant, 'Age', means=['18-24', '25-34', '35-44', '45-54', '55-64','65+'])

Describe:
count    1884.000000
mean        0.034364
std         0.878529
min        -0.951970
25%        -0.951970
50%        -0.078540
75%         0.497880
max         2.591710
Name: Age, dtype: float64
       Age  Count  Fraction(%) Meaning
0 -0.95197    643        34.13   18-24
1 -0.07854    481        25.53   25-34
2  0.49788    355        18.84   35-44
3  1.09449    294        15.61   45-54
4  1.82213     93         4.94   55-64
5  2.59171     18         0.96     65+
