# Imports

In [1]:
import glob
import os
from collections import defaultdict
from functools import partial

# import modin.pandas as pd
import pandas as pd
import plotly.express as px
from IPython.display import Image
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

# Load the data

Put the data in folder `data`. Extract every file in the folder with the same name. If needed, adapt the path.

In [2]:
tsv_paths = glob.glob("../zhu21factors/data/*/*/*/*.tsv")
assert len(tsv_paths) == 11

In [3]:
work_cols = ["WRKIDST2", "WRKIDSY2"]
filter_cols = ["AGE2", "JBSTATR2"]
survey_cols = ["ANALREC"]
cols = work_cols + survey_cols + filter_cols

In [4]:
dfs = process_map(
    partial(pd.read_csv, sep="\t", skipinitialspace=True, usecols=cols), tsv_paths
)

  0%|          | 0/11 [00:00<?, ?it/s]

In [12]:
df = pd.concat(dfs)

In [13]:
df

Unnamed: 0,ANALREC,AGE2,JBSTATR2,WRKIDST2,WRKIDSY2
0,91,2,11,9999,9999
1,91,8,1,3,9999
2,91,5,7,9999,9999
3,91,15,9,9999,3
4,91,7,7,9999,3
...,...,...,...,...,...
55266,91,11,1,2,9999
55267,2,15,5,9999,9999
55268,91,15,1,9,9999
55269,91,10,2,4,9999


# Preprocess the data

## Filter the data

**Methods—** Nationally-representative, cross-sectional survey data from the US National Survey 
on Drug Use and Health supplied self-reported age-of-first marijuana, cigarettes, alcohol, other 
tobacco, and other illegal drug use among 12–21-year-old samples from 2004 to 2014 
(n=275,559). We first examined the degree to which initiating marijuana use first was associated 
with sex, age, race/ethnicity, and survey year. Then, we examined whether using marijuana first 
predicted heavy marijuana use, cannabis use disorder (CUD), alcohol use disorder (AUD), 
nicotine dependence (ND), or lifetime use of other illegal drugs.
Results—Among all survey youth (substance users and non-users) the proportion using 

```
AGE2     2   RECODE - FINAL EDITED AGE 
              1 = Respondent is 12 years old........................    2874   5.17 
              2 = Respondent is 13 years old........................    3186   5.73 
              3 = Respondent is 14 years old........................    3139   5.65 
              4 = Respondent is 15 years old........................    3116   5.60 
              5 = Respondent is 16 years old........................    3010   5.41 
              6 = Respondent is 17 years old........................    2969   5.34 
              7 = Respondent is 18 years old........................    2686   4.83 
              8 = Respondent is 19 years old........................    2370   4.26 
              9 = Respondent is 20 years old........................    2255   4.06 
             10 = Respondent is 21 years old........................    2277   4.10 
             11 = Respondent is 22 or 23 years old..................    4545   8.17 
             12 = Respondent is 24 or 25 years old..................    4342   7.81 
             13 = Respondent is between 26 and 29 years old.........    2609   4.69 
             14 = Respondent is between 30 and 34 years old.........    3101   5.58 
             15 = Respondent is between 35 and 49 years old.........    8052  14.48 
             16 = Respondent is between 50 and 64 years old.........    3072   5.52 
             17 = Respondent is 65 years old or older...............    1999   3.60 


JBSTATR2 3   WORK SITUATION IN PAST WEEK - RECODE 
              1 = Worked at full-time job, past week................   19191  34.51 
              2 = Worked at part time job, past week................    8311  14.95 
              3 = Has job or volunteer worker, did not work past wk.    2385   4.29 
              4 = Unemployed/on layoff, looking for work............    2867   5.16 
              5 = Disabled..........................................    1193   2.15 
              6 = Keeping house full-time...........................    2396   4.31 
              7 = In school/training................................    5881  10.58 
              8 = Retired...........................................    1787   3.21 
              9 = Does not have a job, some other reason............    2371   4.26 
             10 = MISSING...........................................      21   0.04 
             11 = LEGITIMATE SKIP...................................    9199  16.54
```

In [17]:
print(f"{df.shape[0]=}")
print(f"{df[df['AGE2'] <= 10].shape=}")
print(f"{df[df['AGE2'] < 10].shape=}")

df.shape[0]=613974
df[df['AGE2'] <= 10].shape=(296467, 5)
df[df['AGE2'] < 10].shape=(271748, 5)
