# Imports

In [1]:
import glob
import os
from functools import partial

# import modin.pandas as pd
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

# Load the data

Put the data in folder `data`. Extract every file in the folder with the same name. If needed, adapt the path.

In [87]:
tsv_paths = glob.glob("./data/*/*/*/*.tsv")

In [88]:
work_cols = ["WRKIDST2", "WRKOCUP2", "WRKOCUY2"]
survey_cols = ["ANALREC", "PRRECDK"]
# work_cols = ["WRKIDST2"]
# survey_cols = ["ANALREC"]
cols = work_cols + survey_cols

In [89]:
dfs = [
    pd.read_csv(path, sep="\t", skipinitialspace=True, usecols=cols)
    for path in tqdm(tsv_paths)
]

  0%|          | 0/11 [00:00<?, ?it/s]

In [90]:
df = pd.concat(dfs)

# Preprocess the data

```
ANALREC  2   TIME SINCE LAST USED PAIN RELIEVER NONMEDICALLY
              1 = Within the past 30 days...........................    1564   2.81
              2 = More than 30 days ago but within the past 12 mos..    2577   4.63
              3 = More than 12 months ago...........................    4684   8.42
              8 = Used at some point in the past 12 mos LOG ASSN....      39   0.07
              9 = Used at some point in lifetime LOG ASSN...........     297   0.53
             11 = Used in the past 30 days LOGICALLY ASSIGNED.......       8   0.01
             81 = NEVER USED PAIN RELIEVERS Logically assigned......      94   0.17
             85 = BAD DATA Logically assigned.......................       1   0.00
             91 = NEVER USED PAIN RELIEVERS.........................   46146  82.99
             97 = REFUSED...........................................       1   0.00
             98 = BLANK (NO ANSWER).................................     191   0.34


PRRECDK  2   BEST GUESS LAST USED PAIN RELIEVER NONMEDICALLY
              1 = Within the past 30 days -- that is, since DATEFILL       5   0.01
              2 = More than 30 days ago but within the past 12 month      16   0.03
              3 = More than 12 months ago...........................      32   0.06
             94 = DON'T KNOW........................................      68   0.12
             97 = REFUSED...........................................       3   0.01
             98 = BLANK (NO ANSWER).................................   55478  99.78


             Which of these categories best describes the
             business or industry in which you work?

WRKIDST2 4   TYPE OF BUSINESS OR INDUSTRY
              1 = Agriculture, Forestry, Fishing, & Hunting.........     549   0.99
              2 = Mining............................................      99   0.18
              3 = Construction......................................    2493   4.48
              4 = Manufacturing, Nondurable Goods...................    1019   1.83
              5 = Manufacturing, Durable Goods......................    1712   3.08
              6 = Transportation & Utilities........................    1072   1.93
              7 = Information & Communications......................     675   1.21
              8 = Wholesale Trade, Durable Goods....................     388   0.70
              9 = Wholesale Trade, Nondurable Goods.................     405   0.73
             10 = Retail Trade......................................    4336   7.80
             11 = Finance, Insur, Real Estate, Rental & Leasing.....    1519   2.73
             12 = Professional/Scientific/Mgmt/Admin/Waste Mgmt.....    2824   5.08
             13 = Education, Health & Social Services...............    5378   9.67
             14 = Arts/Entertain/Recreation/Accommodation/Food Svcs.    4766   8.57
             15 = Public Administration.............................    1063   1.91
             16 = Other Services (Except Public Admin)..............    1618   2.91
             17 = Armed Forces......................................      15   0.03
           9990 = NOT REPORTED/NOT CODABLE..........................      51   0.09
           9994 = DON'T KNOW........................................       3   0.01
           9997 = REFUSED...........................................      21   0.04
           9998 = BLANK (NO ANSWER).................................      18   0.03
           9999 = LEGITIMATE SKIP...................................   25578  46.00
```


In [91]:
WRKIDST2_mapping = {
    1: "Agriculture, Forestry, Fishing, & Hunting",
    2: "Mining",
    3: "Construction",
    4: "Manufacturing, Nondurable Goods",
    5: "Manufacturing, Durable Goods",
    6: "Transportation & Utilities",
    7: "Information & Communications",
    8: "Wholesale Trade, Durable Goods",
    9: "Wholesale Trade, Nondurable Goods",
    10: "Retail Trade",
    11: "Finance, Insur, Real Estate, Rental & Leasing",
    12: "Professional, Scientific, Mgmt, Admin, Waste Mgmt",
    13: "Education, Health & Social Services",
    14: "Arts, Entertain, Recreation, Accommodation, Food Svcs",
    15: "Public Administration",
    16: "Other Services (Except Public Admin)",
    17: "Armed Forces",
}

In [92]:
ANALREC_mapping = {1: "Past 30 days", 2: "Past 12 months"}

In [54]:

df = df[df["WRKIDST2"] <= 100]
df.loc[df["ANALREC"] > 2, "ANALREC"] = pd.NA
df = df.replace({"WRKIDST2": WRKIDST2_mapping, "ANALREC": ANALREC_mapping}).astype(
    "string"
)

In [59]:
df["ANALREC"].value_counts()

Past 12 months    14583
Past 30 days       8929
Name: ANALREC, dtype: Int64

In [60]:
df["WRKIDST2"].value_counts()

Education, Health & Social Services                  63283
Arts, Entertain, Recreation, Accommodation, Fo...    52506
Retail Trade                                         44900
Professional, Scientific, Mgmt, Admin, Waste Mgmt    30705
Construction                                         24403
Manufacturing, Durable Goods                         17413
Other Services (Except Public Admin)                 17079
Finance, Insur, Real Estate, Rental & Leasing        15966
Public Administration                                11956
Transportation & Utilities                           11259
Manufacturing, Nondurable Goods                      10548
Information & Communications                          6247
Agriculture, Forestry, Fishing, & Hunting             5860
Wholesale Trade, Nondurable Goods                     3923
Wholesale Trade, Durable Goods                        3150
Mining                                                1810
Armed Forces                                            

In [55]:
df

Unnamed: 0,ANALREC,WRKIDST2
1,,Construction
5,,"Wholesale Trade, Durable Goods"
6,,"Professional, Scientific, Mgmt, Admin, Waste Mgmt"
7,,"Arts, Entertain, Recreation, Accommodation, Fo..."
10,Past 12 months,Construction
...,...,...
55263,,"Professional, Scientific, Mgmt, Admin, Waste Mgmt"
55266,,Mining
55268,,"Wholesale Trade, Nondurable Goods"
55269,,"Manufacturing, Nondurable Goods"


# Create a pivot table

In [81]:
pt = df.pivot_table(index="ANALREC", columns="WRKIDST2", aggfunc="size")

In [82]:
pt = pd.crosstab(df["WRKIDST2"], df["ANALREC"], normalize="columns")

In [84]:
pt.sort_values("Past 12 months", axis=0, ascending=False)

ANALREC,Past 12 months,Past 30 days
WRKIDST2,Unnamed: 1_level_1,Unnamed: 2_level_1
"Arts, Entertain, Recreation, Accommodation, Fo...",0.245971,0.249188
Retail Trade,0.154564,0.152537
"Education, Health & Social Services",0.14318,0.130138
Construction,0.090791,0.107739
"Professional, Scientific, Mgmt, Admin, Waste Mgmt",0.089762,0.089148
Other Services (Except Public Admin),0.04999,0.053981
"Finance, Insur, Real Estate, Rental & Leasing",0.044847,0.03763
"Manufacturing, Durable Goods",0.041144,0.04715
"Manufacturing, Nondurable Goods",0.030035,0.027663
Transportation & Utilities,0.025989,0.025311


In [85]:
pt = pd.crosstab(df["WRKIDST2"], df["ANALREC"], normalize="index")

In [86]:
pt.sort_values("Past 12 months", axis=0, ascending=False)

ANALREC,Past 12 months,Past 30 days
WRKIDST2,Unnamed: 1_level_1,Unnamed: 2_level_1
Information & Communications,0.685466,0.314534
Armed Forces,0.666667,0.333333
"Finance, Insur, Real Estate, Rental & Leasing",0.660606,0.339394
"Education, Health & Social Services",0.642462,0.357538
Public Administration,0.641879,0.358121
"Manufacturing, Nondurable Goods",0.639416,0.360584
"Wholesale Trade, Nondurable Goods",0.638298,0.361702
Transportation & Utilities,0.626446,0.373554
Retail Trade,0.623341,0.376659
"Professional, Scientific, Mgmt, Admin, Waste Mgmt",0.621853,0.378147
