In [1]:
import numpy as np
import pandas as pd
import glob
import os
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Merge historical data

data can be found on [ICPSR website](https://www.icpsr.umich.edu/web/ICPSR/series/64/studies)

In [2]:
%%time

dfs = []
files = glob.glob('../../data/*.tsv')
fields = ['NEWRACE', 'AGE', 'IRSEX', 'HERAGE', 'LSDAGE', 'PCPAGE',
          'CRKAGE', 'ECSAGE', 'COCAGE', 'METHAGE', 'CIGAGE', 'SNUFTRY',
          'CHEWTRY', 'MTHAAGE', 'OXYCAGE', 'CIGTRY ', 'SEDAGE', 'STIMAGE',
          'TRANAGE', 'CIGARTRY', 'INHAGE', 'MJAGE', 'ANALAGE', 'BLNTAGE', 'ALCTRY'] #+ \
    #['MRJMDAYS',  'MRJYDAYS', 'DEPNDALC', 'DEPNDMRJ', 'NDSSDNSP']

for file in files:
    df_n1 = pd.read_csv(file, sep='\t', skipinitialspace=True, nrows=1)
    current_columns = []
    for field in fields:
        if field in df_n1.columns:
            current_columns.append(field)
        elif f'{field}2' in df_n1.columns:
            current_columns.append(f'{field}2')
        else:
            print(f'field {field} not in {file}')
    df = pd.read_csv(file, sep='\t', skipinitialspace=True, usecols=current_columns)
    df['file_name'] = os.path.basename(file)
    dfs.append(df)

field CIGTRY  not in ../../data/32722-0001-Data.tsv
field CIGTRY  not in ../../data/23782-0001-Data.tsv
field MTHAAGE not in ../../data/04596-0001-Data.tsv
field CIGTRY  not in ../../data/04596-0001-Data.tsv
field CIGTRY  not in ../../data/26701-0001-Data.tsv
field CIGTRY  not in ../../data/29621-0001-Data.tsv
field CIGTRY  not in ../../data/36361-0001-Data.tsv
field CIGTRY  not in ../../data/35509-0001-Data.tsv
field MTHAAGE not in ../../data/04373-0001-Data.tsv
field CIGTRY  not in ../../data/04373-0001-Data.tsv
field BLNTAGE not in ../../data/04373-0001-Data.tsv
field MTHAAGE not in ../../data/21240-0001-Data.tsv
field CIGTRY  not in ../../data/21240-0001-Data.tsv
field CIGTRY  not in ../../data/34481-0001-Data.tsv
field CIGTRY  not in ../../data/34933-0001-Data.tsv
CPU times: user 16.8 s, sys: 671 ms, total: 17.4 s
Wall time: 17.4 s


In [3]:
main_df = pd.concat(dfs)
main_df_backlog = main_df.copy()
main_df.shape  # data diff 613774 (real) - 615701 (paper) = -1927 difference

(613974, 25)

In [4]:
main_df.head(2)

Unnamed: 0,CIGAGE,SNUFTRY,CHEWTRY,CIGARTRY,ALCTRY,MJAGE,COCAGE,CRKAGE,HERAGE,LSDAGE,PCPAGE,ECSAGE,INHAGE,ANALAGE,OXYCAGE,TRANAGE,STIMAGE,METHAGE,SEDAGE,MTHAAGE,BLNTAGE,AGE2,IRSEX,NEWRACE2,file_name
0,999,991,991,991,16,16,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,991.0,9,1,2,32722-0001-Data.tsv
1,15,991,991,991,13,14,991,991,991,991,991,991,991,991,991,991,991,991,991,991.0,17.0,12,2,1,32722-0001-Data.tsv


# Preprocessing

In [5]:
file_to_year_mapping = {f:2004+i for i, f in enumerate(sorted([os.path.basename(f) for f in files]))}
class_mapping = {
    'MJAGE': 'MARIJUANA',
    'ALCTRY': 'ALCOHOL',
    'CIGAGE': 'CIGARETTES',
    'NOUSAGE': 'NO_DRUG_USE',
    'CIGARTRY': 'OTHER_TABACCO',
    'SNUFTRY': 'OTHER_TABACCO',
    'CHEWTRY': 'OTHER_TABACCO',
    'HERAGE': 'OTHER_DRUGS',
    'LSDAGE': 'OTHER_DRUGS',
    'PCPAGE': 'OTHER_DRUGS',
    'CRKAGE': 'OTHER_DRUGS',
    'ECSAGE': 'OTHER_DRUGS',
    'COCAGE': 'OTHER_DRUGS',
    'METHAGE': 'OTHER_DRUGS',
    'MTHAAGE': 'OTHER_DRUGS',
    'OXYCAGE': 'OTHER_DRUGS',
    'SEDAGE': 'OTHER_DRUGS',
    'STIMAGE': 'OTHER_DRUGS',
    'TRANAGE': 'OTHER_DRUGS',
    'INHAGE': 'OTHER_DRUGS',
    'ANALAGE': 'OTHER_DRUGS',
    'BLNTAGE': 'OTHER_DRUGS'
}
age_to_group_mapping = {
    12: '12-13',
    13: '12-13',
    14: '14-15',
    15: '14-15',
    16: '16-17',
    17: '16-17',
    18: '18-19',
    19: '18-19',
    20: '20-21',
    21: '20-21'
}
classes = ['MJAGE', 'CIGAGE', 'ALCTRY', 'CIGARTRY', 'SNUFTRY',
           'CHEWTRY', 'HERAGE', 'LSDAGE', 'PCPAGE', 'CRKAGE',
           'ECSAGE', 'COCAGE', 'METHAGE', 'MTHAAGE', 'OXYCAGE',
           'SEDAGE', 'STIMAGE', 'TRANAGE', 'INHAGE', 'ANALAGE',
           'BLNTAGE']
classes_new = classes + ['NOUSAGE']

def prepare_data(df):
    df['YEAR'] = df['file_name'].map(file_to_year_mapping) # infer year
    df[['MTHAAGE', 'BLNTAGE']] = df[['MTHAAGE', 'BLNTAGE']].fillna(10e5)  # fill in nan
    # data diff 296467 (real) - 297632 (paper) = -1165 
    df = df[(df['AGE2'] < 11)]  # filter people < 22 yo
    df['ARGMINAGE'] = df[classes].values.argmin(axis=1)  # index of substance that was used first
    df['MINAGE'] = df[classes].values.min(axis=1)  # substance that was used first
    df['ARGMINAGE'] = np.where(df['MINAGE'] > 900, 21, df['ARGMINAGE'])  # values > 900 - no drug usage
    # mapping to good-looking values
    df['SEX'] = df['IRSEX'].map({1: 'Male', 2: 'Female'})
    df['AGE'] = df['AGE2'].map({i:i+11 for i in range(1, 11)})
    df['RACE'] = df['NEWRACE2'].map({1: 'White', 2: 'Black', 3: 'AI/AN', 4: 'NHOPI', 5: 'Asian', 6: 'Multi-racial', 7: 'Hispanic'})
    df['CLASS_NARROW'] = df['ARGMINAGE'].map(lambda x: classes_new[x])
    df['CLASS'] = df['CLASS_NARROW'].map(class_mapping)
    df['AGE_GROUP'] = df['AGE'].map(age_to_group_mapping)
    df.reset_index(inplace=True, drop=True)
    df = df[['YEAR', 'CLASS', 'SEX', 'RACE', 'AGE_GROUP', 'AGE']]
    # make values categorical
    df['SEX'] = df['SEX'].astype('category')
    df['RACE'] = df['RACE'].astype('category')
    df['AGE_GROUP'] = df['AGE_GROUP'].astype('category')
    df['CLASS'] = df['CLASS'].astype('category')
    df['YEAR'] = df['YEAR'].astype('category')
    return df

In [6]:
df = prepare_data(main_df)
df.to_csv('../../data/nsduh_processed_data.csv')
# df = pd.read_csv('../../data/nsduh_processed_data.csv', index_col=0)
df.shape

(296467, 6)

In [7]:
df.head()

Unnamed: 0,YEAR,CLASS,SEX,RACE,AGE_GROUP,AGE
0,2010,MARIJUANA,Male,Black,20-21,20
1,2010,MARIJUANA,Female,Hispanic,14-15,15
2,2010,ALCOHOL,Female,White,18-19,18
3,2010,NO_DRUG_USE,Male,Hispanic,12-13,13
4,2010,OTHER_TABACCO,Male,Hispanic,20-21,20
