In [150]:
import numpy as np
import pandas as pd
import glob
import os
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Merge historical data

data can be found on [ICPSR website](https://www.icpsr.umich.edu/web/ICPSR/series/64/studies)

In [249]:
%%time

dfs = []
files = glob.glob('../../data/*.tsv')
fields = ['NEWRACE', 'AGE', 'IRSEX', 'HERAGE', 'LSDAGE', 'PCPAGE',
          'CRKAGE', 'ECSAGE', 'COCAGE', 'METHAGE', 'CIGAGE', 'SNUFTRY',
          'CHEWTRY', 'MTHAAGE', 'OXYCAGE', 'CIGTRY ', 'SEDAGE', 'STIMAGE',
          'TRANAGE', 'CIGARTRY', 'INHAGE', 'MJAGE', 'ANALAGE', 'BLNTAGE', 'ALCTRY', 'USEACM'] 
imputed_fields = ['IRCIGAGE', 'IRCDUAGE', 'IRCGRAGE', 'IRSLTAGE', 'IRCHWAGE', 'IRSNFAGE', 'IRALCAGE', 'IRMJAGE', 'IRCOCAGE', 'IRCRKAGE', 'IRHERAGE', 'IRHALAGE', 'IRLSDAGE', 'IRPCPAGE', 'IRECSAGE', 'IRINHAGE', 'IRANLAGE', 'IROXYAGE', 'IRTRNAGE', 'IRSTMAGE', 'IRMTHAGE', 'IRSEDAGE']
# fields += imputed_fields

for file in files:
    df_n1 = pd.read_csv(file, sep='\t', skipinitialspace=True, nrows=1)
    current_columns = []
    for field in fields:
        field = field.strip()
        if field in df_n1.columns:
            current_columns.append(field)
        elif f'{field}2' in df_n1.columns:
            current_columns.append(f'{field}2')
        else:
            print(f'field {field} not in {file}')
    current_columns += ['CIGTRY']
    df = pd.read_csv(file, sep='\t', skipinitialspace=True, usecols=current_columns)
    df['file_name'] = os.path.basename(file)
    dfs.append(df)

field MTHAAGE not in ../../data/04596-0001-Data.tsv
field MTHAAGE not in ../../data/04373-0001-Data.tsv
field BLNTAGE not in ../../data/04373-0001-Data.tsv
field MTHAAGE not in ../../data/21240-0001-Data.tsv
CPU times: user 17 s, sys: 676 ms, total: 17.7 s
Wall time: 17.8 s


In [250]:
main_df = pd.concat(dfs)
# main_df_backlog = main_df.copy()
main_df.shape  # data diff 613774 (real) - 615701 (paper) = -1927 difference

(613974, 27)

In [385]:
%%time
np.random.seed(42)
main_df = pd.concat(dfs)
file_to_year_mapping = {f:2004+i for i, f in enumerate(sorted([os.path.basename(f) for f in files]))}
class_mapping = {
    'MJAGE': 'MARIJUANA',
    'ALCTRY': 'ALCOHOL',
    'CIGAGE': 'CIGARETTES',
    'CIGTRY': 'CIGARETTES',
    'NOUSAGE': 'NO_DRUG_USE',
    'CIGARTRY': 'OTHER_TABACCO',
    'SNUFTRY': 'OTHER_TABACCO',
    'CHEWTRY': 'OTHER_TABACCO',
    'HERAGE': 'OTHER_DRUGS',
    'LSDAGE': 'OTHER_DRUGS',
    'PCPAGE': 'OTHER_DRUGS',
    'CRKAGE': 'OTHER_DRUGS',
    'ECSAGE': 'OTHER_DRUGS',
    'COCAGE': 'OTHER_DRUGS',
    'METHAGE': 'OTHER_DRUGS',
    'MTHAAGE': 'OTHER_DRUGS',
    'OXYCAGE': 'OTHER_DRUGS',
    'SEDAGE': 'OTHER_DRUGS',
    'STIMAGE': 'OTHER_DRUGS',
    'TRANAGE': 'OTHER_DRUGS',
    'INHAGE': 'OTHER_DRUGS',
    'ANALAGE': 'OTHER_DRUGS',
    'BLNTAGE': 'OTHER_DRUGS'
}
age_to_group_mapping = {
    12: '12-13',
    13: '12-13',
    14: '14-15',
    15: '14-15',
    16: '16-17',
    17: '16-17',
    18: '18-19',
    19: '18-19',
    20: '20-21',
    21: '20-21'
}
race_mapping = {1: 'White', 2: 'Black', 3: 'AI/AN', 4: 'NHOPI', 5: 'Asian', 6: 'Multi-racial', 7: 'Hispanic'}
use_acm_map = {
    1: 'ALCTRY',
    2: 'CIGTRY',
    3: 'MJAGE',
    4: 'ALCTRY',
    5: 'CIGTRY',
    6: 'MJAGE',
#     11: 'ALCTRY',
#     12: 'CIGTRY',
#     13: 'MJAGE',
    91: 'NOUSAGE',
}
classes = ['MJAGE', 'CIGAGE', 'CIGTRY', 'ALCTRY', 'CIGARTRY', 'SNUFTRY',
           'CHEWTRY', 'HERAGE', 'LSDAGE', 'PCPAGE', 'CRKAGE',
           'ECSAGE', 'COCAGE', 'METHAGE', 'MTHAAGE', 'OXYCAGE',
           'SEDAGE', 'STIMAGE', 'TRANAGE', 'INHAGE', 'ANALAGE', 'BLNTAGE']
main_df = main_df[(main_df['AGE2'] < 11)]  # filter people < 22 yo
main_df[['MTHAAGE', 'BLNTAGE']] = main_df[['MTHAAGE', 'BLNTAGE']].fillna(10e5)  # fill in nan
main_df['MINAGE'] = main_df[classes].values.min(axis=1)
main_df['MINAGE'] = np.where(main_df['MINAGE'] > 900, 999, main_df['MINAGE'])
main_df['MINAGE_CLASS'] = np.where(main_df['MINAGE'] > 900, 'NOUSAGE', None)
main_df['CLASSES_LIST'] = np.where(main_df['MINAGE'] > 900, 'NOUSAGE', None)
main_df['YEAR'] = main_df['file_name'].map(file_to_year_mapping) # infer year
main_df['SEX'] = main_df['IRSEX'].map({1: 'Male', 2: 'Female'})
main_df['AGE'] = main_df['AGE2'].map({i:i+11 for i in range(1, 11)})
main_df['RACE'] = main_df['NEWRACE2'].map(race_mapping)
main_df.reset_index(inplace=True, drop=True)
main_df.head()

CPU times: user 110 ms, sys: 36 ms, total: 146 ms
Wall time: 145 ms


Unnamed: 0,CIGTRY,CIGAGE,SNUFTRY,CHEWTRY,CIGARTRY,ALCTRY,MJAGE,COCAGE,CRKAGE,HERAGE,LSDAGE,PCPAGE,ECSAGE,INHAGE,ANALAGE,OXYCAGE,TRANAGE,STIMAGE,METHAGE,SEDAGE,MTHAAGE,BLNTAGE,USEACM,AGE2,IRSEX,NEWRACE2,file_name,MINAGE,MINAGE_CLASS,CLASSES_LIST,YEAR,SEX,AGE,RACE
0,16,999,991,991,991,16,16,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,2,9,1,2,32722-0001-Data.tsv,16.0,,,2010,Male,20,Black
1,991,991,991,991,991,14,14,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,4,2,7,32722-0001-Data.tsv,14.0,,,2010,Female,15,Hispanic
2,14,999,991,991,991,13,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,7,2,1,32722-0001-Data.tsv,13.0,,,2010,Female,18,White
3,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,91,2,1,7,32722-0001-Data.tsv,999.0,NOUSAGE,NOUSAGE,2010,Male,13,Hispanic
4,15,999,991,991,15,16,16,18,991,991,991,991,991,991,18,18,991,991,991,991,991.0,17.0,99,9,1,7,32722-0001-Data.tsv,15.0,,,2010,Male,20,Hispanic


In [386]:
%%time
for i, row in main_df.iterrows():
#     if i > 2000:
#         break
    if row['MINAGE'] > 900:
        continue
    several_substances = sorted(row[classes][row[classes].apply(lambda x: x==row['MINAGE'])].index.values)
    main_df.at[i, 'CLASSES_LIST'] = '/'.join(several_substances)
    if len(several_substances) == 1:
        main_df.at[i, 'MINAGE_CLASS'] = several_substances[0]  
    else:
        main_df.at[i, 'MINAGE_CLASS'] = use_acm_map.get(row['USEACM']) # or np.random.choice(several_substances)
main_df.head()

CPU times: user 1min 39s, sys: 132 ms, total: 1min 40s
Wall time: 1min 40s


Unnamed: 0,CIGTRY,CIGAGE,SNUFTRY,CHEWTRY,CIGARTRY,ALCTRY,MJAGE,COCAGE,CRKAGE,HERAGE,LSDAGE,PCPAGE,ECSAGE,INHAGE,ANALAGE,OXYCAGE,TRANAGE,STIMAGE,METHAGE,SEDAGE,MTHAAGE,BLNTAGE,USEACM,AGE2,IRSEX,NEWRACE2,file_name,MINAGE,MINAGE_CLASS,CLASSES_LIST,YEAR,SEX,AGE,RACE
0,16,999,991,991,991,16,16,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,2,9,1,2,32722-0001-Data.tsv,16.0,CIGTRY,ALCTRY/CIGTRY/MJAGE,2010,Male,20,Black
1,991,991,991,991,991,14,14,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,4,2,7,32722-0001-Data.tsv,14.0,,ALCTRY/MJAGE,2010,Female,15,Hispanic
2,14,999,991,991,991,13,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,7,2,1,32722-0001-Data.tsv,13.0,ALCTRY,ALCTRY,2010,Female,18,White
3,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,91,2,1,7,32722-0001-Data.tsv,999.0,NOUSAGE,NOUSAGE,2010,Male,13,Hispanic
4,15,999,991,991,15,16,16,18,991,991,991,991,991,991,18,18,991,991,991,991,991.0,17.0,99,9,1,7,32722-0001-Data.tsv,15.0,,CIGARTRY/CIGTRY,2010,Male,20,Hispanic


In [387]:
class_mapping = {
    'MJAGE': 'MARIJUANA',
    'ALCTRY': 'ALCOHOL',
    'CIGAGE': 'CIGARETTES',
    'CIGTRY': 'CIGARETTES',
    'NOUSAGE': 'NO_DRUG_USE',
    'CIGARTRY': 'OTHER_TABACCO',
    'SNUFTRY': 'OTHER_TABACCO',
    'CHEWTRY': 'OTHER_TABACCO',
    'HERAGE': 'OTHER_DRUGS',
    'LSDAGE': 'OTHER_DRUGS',
    'PCPAGE': 'OTHER_DRUGS',
    'CRKAGE': 'OTHER_DRUGS',
    'ECSAGE': 'OTHER_DRUGS',
    'COCAGE': 'OTHER_DRUGS',
    'METHAGE': 'OTHER_DRUGS',
    'MTHAAGE': 'OTHER_DRUGS',
    'OXYCAGE': 'OTHER_DRUGS',
    'SEDAGE': 'OTHER_DRUGS',
    'STIMAGE': 'OTHER_DRUGS',
    'TRANAGE': 'OTHER_DRUGS',
    'INHAGE': 'OTHER_DRUGS',
    'ANALAGE': 'OTHER_DRUGS',
    'BLNTAGE': 'OTHER_DRUGS'
}
main_df['CLASS'] = main_df['MINAGE_CLASS'].map(class_mapping)
main_df['AGE_GROUP'] = main_df['AGE'].map(age_to_group_mapping)
main_df.reset_index(inplace=True, drop=True)
# main_df.to_csv('fairman19marijuana_dataframe.csv')

In [388]:
main_df.head()

Unnamed: 0,CIGTRY,CIGAGE,SNUFTRY,CHEWTRY,CIGARTRY,ALCTRY,MJAGE,COCAGE,CRKAGE,HERAGE,LSDAGE,PCPAGE,ECSAGE,INHAGE,ANALAGE,OXYCAGE,TRANAGE,STIMAGE,METHAGE,SEDAGE,MTHAAGE,BLNTAGE,USEACM,AGE2,IRSEX,NEWRACE2,file_name,MINAGE,MINAGE_CLASS,CLASSES_LIST,YEAR,SEX,AGE,RACE,CLASS,AGE_GROUP
0,16,999,991,991,991,16,16,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,2,9,1,2,32722-0001-Data.tsv,16.0,CIGTRY,ALCTRY/CIGTRY/MJAGE,2010,Male,20,Black,CIGARETTES,20-21
1,991,991,991,991,991,14,14,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,4,2,7,32722-0001-Data.tsv,14.0,,ALCTRY/MJAGE,2010,Female,15,Hispanic,,14-15
2,14,999,991,991,991,13,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,7,2,1,32722-0001-Data.tsv,13.0,ALCTRY,ALCTRY,2010,Female,18,White,ALCOHOL,18-19
3,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,91,2,1,7,32722-0001-Data.tsv,999.0,NOUSAGE,NOUSAGE,2010,Male,13,Hispanic,NO_DRUG_USE,12-13
4,15,999,991,991,15,16,16,18,991,991,991,991,991,991,18,18,991,991,991,991,991.0,17.0,99,9,1,7,32722-0001-Data.tsv,15.0,,CIGARTRY/CIGTRY,2010,Male,20,Hispanic,,20-21


In [390]:
# 275,559
df = main_df[['YEAR', 'CLASS', 'SEX', 'RACE', 'AGE_GROUP', 'AGE', 'MINAGE']]
df['SEX'] = df['SEX'].astype('category')
df['RACE'] = df['RACE'].astype('category')
df['AGE_GROUP'] = df['AGE_GROUP'].astype('category')
df['CLASS'] = df['CLASS'].astype('category')
df['YEAR'] = df['YEAR'].astype('category')
df['AGE'] = df['AGE'].astype(np.int32)
df['MINAGE'] = df['MINAGE'].astype(np.int32)
df.shape

(296467, 7)

In [392]:
main_df[main_df.CLASS.isna()].head()

Unnamed: 0,CIGTRY,CIGAGE,SNUFTRY,CHEWTRY,CIGARTRY,ALCTRY,MJAGE,COCAGE,CRKAGE,HERAGE,LSDAGE,PCPAGE,ECSAGE,INHAGE,ANALAGE,OXYCAGE,TRANAGE,STIMAGE,METHAGE,SEDAGE,MTHAAGE,BLNTAGE,USEACM,AGE2,IRSEX,NEWRACE2,file_name,MINAGE,MINAGE_CLASS,CLASSES_LIST,YEAR,SEX,AGE,RACE,CLASS,AGE_GROUP
1,991,991,991,991,991,14,14,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,99,4,2,7,32722-0001-Data.tsv,14.0,,ALCTRY/MJAGE,2010,Female,15,Hispanic,,14-15
4,15,999,991,991,15,16,16,18,991,991,991,991,991,991,18,18,991,991,991,991,991.0,17.0,99,9,1,7,32722-0001-Data.tsv,15.0,,CIGARTRY/CIGTRY,2010,Male,20,Hispanic,,20-21
5,991,991,991,991,17,17,18,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,18.0,99,9,1,1,32722-0001-Data.tsv,17.0,,ALCTRY/CIGARTRY,2010,Male,20,White,,20-21
14,15,15,16,17,16,15,18,991,991,991,991,991,991,19,19,981,991,991,991,991,991.0,18.0,99,9,1,1,32722-0001-Data.tsv,15.0,,ALCTRY/CIGAGE/CIGTRY,2010,Male,20,White,,20-21
20,16,18,991,991,991,17,18,991,991,991,991,991,16,991,18,981,17,991,991,991,991.0,18.0,99,7,2,1,32722-0001-Data.tsv,16.0,,CIGTRY/ECSAGE,2010,Female,18,White,,18-19


In [373]:
# <!-- # USEACM 2 1USED ALC CIG OR MJ 1ST WHEN [AFUFILL] YEARS OLD
# 1 = Alcohol........................................... 602 1.08
# 2 = Cigarettes........................................ 882 1.59
# 3 = Marijuana......................................... 161 0.29
# 4 = Alcohol LOGICALLY ASSIGNED........................ 5 0.01
# 5 = Cigarettes LOGICALLY ASSIGNED..................... 9 0.02
# 6 = Marijuana LOGICALLY ASSIGNED...................... 7 0.01
# 11 = Alcohol (see NOTE)................................ 8 0.01
# 12 = Cigarettes (see NOTE)............................. 9 0.02
# 13 = Marijuana (see NOTE).............................. 2 0.00
# 85 = BAD DATA Logically assigned....................... 7 0.01
# 91 = NEVER USED ALCOHOL/CIGARETTES/MARIJUANA........... 12767 22.96
# 94 = DON'T KNOW........................................ 7 0.01
# 97 = REFUSED........................................... 3 0.01
# 98 = BLANK (NO ANSWER)................................. 63 0.11 -->

In [356]:
275559 - 296467

-20908

In [293]:
# import pickle

# with open('fairman19marijuana_dataframe.pickle', 'wb') as pickle_file:
#     pickle.dump(df, pickle_file)

# check

In [332]:
df.head()

Unnamed: 0,YEAR,CLASS,SEX,RACE,AGE_GROUP,AGE,MINAGE
0,2010,CIGARETTES,Male,Black,20-21,20,16
1,2010,ALCOHOL,Female,Hispanic,14-15,15,14
2,2010,ALCOHOL,Female,White,18-19,18,13
3,2010,NO_DRUG_USE,Male,Hispanic,12-13,13,999
4,2010,CIGARETTES,Male,Hispanic,20-21,20,15


In [333]:
df[df.CLASS=='MARIJUANA'].MINAGE.value_counts()

14    3201
15    2890
13    2871
16    2041
12    1984
17     919
11     769
10     572
18     506
9      327
8      217
7      135
19     104
6       61
5       39
20      28
4       23
1       13
21       9
3        6
2        4
Name: MINAGE, dtype: int64

In [334]:
df.head()

Unnamed: 0,YEAR,CLASS,SEX,RACE,AGE_GROUP,AGE,MINAGE
0,2010,CIGARETTES,Male,Black,20-21,20,16
1,2010,ALCOHOL,Female,Hispanic,14-15,15,14
2,2010,ALCOHOL,Female,White,18-19,18,13
3,2010,NO_DRUG_USE,Male,Hispanic,12-13,13,999
4,2010,CIGARETTES,Male,Hispanic,20-21,20,15


In [335]:
mean_age_first_use_2004 = df[
    (df.CLASS == 'CIGARETTES') & (df.YEAR == 2004)]['MINAGE'].mean()
mean_age_first_use_2014 = main_df[
    (df.CLASS == 'CIGARETTES') & (df.YEAR == 2014)]['MINAGE'].mean()
mean_age_first_use_2004, mean_age_first_use_2014

(12.509315688664461, 13.232053422370617)

# temp

In [336]:
df[(df.CLASS == 'MARIJUANA') & (df.YEAR == 2004)].shape[0]*100/df[(df.YEAR == 2004)].shape[0]

5.049852951725128

In [337]:
df[(df.CLASS == 'MARIJUANA') & (df.YEAR == 2014)].shape[0]*100/df[(df.YEAR == 2014)].shape[0]

6.950191952934138

In [328]:
main_df[(main_df.CLASS == 'MARIJUANA')].shape[0]*100/main_df.shape[0]
main_df[(main_df.CLASS == 'ALCOHOL')].shape[0]*100/main_df.shape[0]
main_df[(main_df.CLASS == 'CIGARETTES')].shape[0]*100/main_df.shape[0]
# main_df[(main_df.CLASS == 'OTHER_DRUGS')].shape[0]*100/main_df.shape[0]
# main_df[(main_df.CLASS == 'OTHER_TABACCO')].shape[0]*100/main_df.shape[0]

16.85887468082451

In [353]:
main_df[(main_df.CLASS == 'MARIJUANA')].shape[0]*100/main_df.shape[0] # 5.8% <
main_df[(main_df.CLASS == 'ALCOHOL')].shape[0]*100/main_df.shape[0] # 29.8% for alcohol <
main_df[(main_df.CLASS == 'CIGARETTES')].shape[0]*100/main_df.shape[0] # 14.2% for cigarettes ~>
main_df[(main_df.CLASS == 'OTHER_DRUGS')].shape[0]*100/main_df.shape[0] # 3.6% for other tobacco >
main_df[(main_df.CLASS == 'OTHER_TABACCO')].shape[0]*100/main_df.shape[0] # 5.9% other drugs <

4.753648804082748