# Imports

In [1]:
import glob
import os
from itertools import product

# import modin.pandas as pd
import pandas as pd
import plotly.express as px

# from modin.config import ProgressBar
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

# ProgressBar.enable()

# Determine the correct combination of datasets

Put the data in folder `data`. Extract every file in the folder with the same name. If needed, adapt the path.

In [2]:
tsv_paths = sorted(
    glob.glob("../marijuana19fairman/data/NSDUH_Versions/*/*/*.tsv")
    + glob.glob("../marijuana19fairman/data/NSDUH_Versions/*/*/*/*.tsv")
)
tsv_paths_full = [path for path in tsv_paths if "Tab" in path and "padded" not in path]
tsv_paths = [
    path
    for path in tsv_paths
    if "Tab" not in path
    and "padded" not in path
    and any(f"{year}.tsv" in path for year in range(2007, 2015))
]

# tsv_paths

The analytic sample included 303,311 participants who responded to all necessary survey 
items (or 98.5% of the 307,935, starting sample). The absolute percentage differences of 
missing data between men (2%), women (1%), rural (1.4%) and urban (1.5%) participants.

In [3]:
work_cols = ["WRKIDST2", "WRKIDSY2"]
filter_cols = ["AGE2", "JBSTATR2"]
survey_cols = ["ANALREC"]
cols = work_cols + survey_cols + filter_cols

In [15]:
version_mapping = {
    "2002-2014": "1",
    "2002-2015": "2",
    "2002-2017": "3",
    "2002-2018": "4",
    "2002-2019": "5",
}

Method: We used yearly cross-sectional data (n = 303,311) from the U.S. National Survey on 
Drug Use and Health (NSDUH) from 2007 through 2014 to compare cigarette smoking trends in 
men and women across rural and urban areas. Current smoking status was modelled using logistic 

Data came from the National Survey on Drug Use and Health (NSDUH)—a U.S. nationally 
representative cross-sectional survey that has been conducted annually since 1990. To ensure 
comparability with Doogan et al. (2017), we restricted our analyses to years 2007 through 
2014, and to adults ( ≥ 18 years). We used participant weights included with the survey data 
to obtain results representative of the US population by correcting for selection probabilities, 

```
AGE2     2   RECODE - FINAL EDITED AGE 
              1 = Respondent is 12 years old........................    2874   5.17 
              2 = Respondent is 13 years old........................    3186   5.73 
              3 = Respondent is 14 years old........................    3139   5.65 
              4 = Respondent is 15 years old........................    3116   5.60 
              5 = Respondent is 16 years old........................    3010   5.41 
              6 = Respondent is 17 years old........................    2969   5.34 
              7 = Respondent is 18 years old........................    2686   4.83 
              8 = Respondent is 19 years old........................    2370   4.26 
              9 = Respondent is 20 years old........................    2255   4.06 
             10 = Respondent is 21 years old........................    2277   4.10 
             11 = Respondent is 22 or 23 years old..................    4545   8.17 
             12 = Respondent is 24 or 25 years old..................    4342   7.81 
             13 = Respondent is between 26 and 29 years old.........    2609   4.69 
             14 = Respondent is between 30 and 34 years old.........    3101   5.58 
             15 = Respondent is between 35 and 49 years old.........    8052  14.48 
             16 = Respondent is between 50 and 64 years old.........    3072   5.52 
             17 = Respondent is 65 years old or older...............    1999   3.60 

```

In [6]:
numbers = {year: {} for year in range(2007, 2015)}

for tsv_path in tqdm(tsv_paths):
    parent_dir = os.path.dirname(tsv_path)
    with open(os.path.join(parent_dir, "schema.txt"), "r") as f:
        dtype = eval(f.read())
        dtype = {k: v if v != int else float for k, v in dtype.items()}
    df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)

    year = int(tsv_path.split("/")[-1].removesuffix(".tsv"))
    version = None
    for pat, v in version_mapping.items():
        if pat in tsv_path:
            version = v
            break
    assert version is not None

    if pd.api.types.is_object_dtype(df["AGE2"].dtype):
        df["AGE2"] = pd.to_numeric(df["AGE2"].str.split().str[0], errors="coerce")
        df = df[df["AGE2"].notna()]
    numbers[year][version] = df[df["AGE2"] >= 7].shape[0]

  0%|          | 0/40 [00:00<?, ?it/s]

  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="\t", dtype=dtype, index_col=False)
  df = pd.read_csv(tsv_path, sep="

In [33]:
import csv

for tsv_path in tsv_paths_full:
    with open(tsv_path, "r") as f:
        reader = csv.reader(f, delimiter="\t")
        data = []
        for i, row in enumerate(reader):
            data.append(row)
            if i >= 5:
                break
    with open(tsv_path, "r") as f:
        data2 = []
        for i, row in enumerate(f):
            data2.append(row[:-1].split("\t"))
            if i >= 5:
                break

    for r1, r2 in zip(data, data2):
        for v1, v2 in zip(r1, r2):
            if v1 != v2:
                print(f"({v1}), ({v2})")
            assert v1 == v2
    assert data == data2
    print([len(row) for row in data])

[3625, 1375, 1386, 1549, 1358, 1363]
[3518, 1375, 1386, 1549, 1358, 1363]
[3661, 4705, 4705, 4705, 4705, 4705]
[3662, 4735, 4735, 4735, 4735, 4735]
[3660, 4741, 4741, 4741, 4741, 4741]


In [None]:
tsv_path = tsv_paths[-1]

import pandas as pd

parent_dir = os.path.dirname(tsv_path)
with open(os.path.join(parent_dir, "schema.txt"), "r") as f:
    dtype = eval(f.read())
    dtype = {k: v if v != int else float for k, v in dtype.items()}
df = pd.read_csv(
    tsv_path, sep="\t", dtype=dtype, nrows=1, engine="python", index_col=False
)

In [40]:
df['PDEN10'].value_counts()

KeyError: 'PDEN10'

In [14]:
numbers

{2007: {'1': 37400, '2': 37400, '3': 37427, '4': 37427, '5': 37427},
 2008: {'1': 37459, '2': 37459, '3': 37504, '4': 37504, '5': 37504},
 2009: {'1': 37698, '2': 37698, '3': 37707, '4': 37707, '5': 37707},
 2010: {'1': 38904, '2': 38904, '3': 38919, '4': 38919, '5': 38919},
 2011: {'1': 39118, '2': 39118, '3': 39133, '4': 39133, '5': 39133},
 2012: {'1': 37855, '2': 37855, '3': 37869, '4': 37869, '5': 37869},
 2013: {'1': 37407, '2': 37407, '3': 37424, '4': 37424, '5': 37424},
 2014: {'1': 41639, '2': 41639, '3': 41671, '4': 41671, '5': 41671}}

In [36]:
numbers_ = [[v for v in vals.values()] for vals in numbers.values()]

from itertools import product

for combination in tqdm(set(product(*numbers_))):
    if abs(sum(combination) - 303_311) < 300:
        print("1", combination)
    if abs(sum(combination) - 307_935) < 300:
        print("2", combination)

  0%|          | 0/256 [00:00<?, ?it/s]

2 (37427, 37504, 37707, 38919, 39133, 37855, 37424, 41671)
2 (37427, 37504, 37707, 38919, 39118, 37869, 37424, 41671)
2 (37427, 37504, 37707, 38919, 39133, 37869, 37407, 41671)
2 (37427, 37504, 37707, 38904, 39133, 37869, 37424, 41671)
2 (37427, 37504, 37707, 38919, 39133, 37869, 37424, 41671)
2 (37427, 37504, 37698, 38919, 39133, 37869, 37424, 41671)


# For now work with the V1 data

## Check the descriptive statistics