In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
import re

In [2]:
dataset = load_dataset("AnnDinoushka/nhanes-training-merged-new", split="train")
df = dataset.to_pandas()
df.shape

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

merged.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/101316 [00:00<?, ? examples/s]

(101316, 4928)

In [3]:
df['DIQ010__questionnaire'].value_counts(dropna=False)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


DIQ010__questionnaire
2.0    83270
NaN    10282
1.0     6640
3.0     1068
9.0       53
7.0        3
Name: count, dtype: int64

In [19]:
df['DIQ050__questionnaire'].value_counts(dropna=False)

DIQ050__questionnaire
2.0    78117
NaN    21258
1.0     1931
9.0        6
7.0        4
Name: count, dtype: int64

In [21]:
df['DID040__questionnaire'].value_counts(dropna=False)

DID040__questionnaire
NaN     94772
50.0      401
60.0      310
55.0      278
40.0      275
        ...  
84.0        4
88.0        2
83.0        2
86.0        1
87.0        1
Name: count, Length: 91, dtype: int64

In [20]:
age_at_diabetes = df['DID040__questionnaire'].value_counts(dropna=False)
age_at_diabetes[age_at_diabetes.index > 90]

# NOTE: NO incorrect ages here, all in years

  return op(a, b)


DID040__questionnaire
999.0    67
666.0    13
Name: count, dtype: int64

In [10]:
df['DID040__questionnaire'].value_counts(dropna=False)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


DID040__questionnaire
NaN     94772
50.0      401
60.0      310
55.0      278
40.0      275
        ...  
84.0        4
88.0        2
83.0        2
86.0        1
87.0        1
Name: count, Length: 91, dtype: int64

In [23]:
df["DIQ060__questionnaire"].value_counts(dropna=False)

DIQ060__questionnaire
NaN      99396
1.0        169
2.0        166
3.0        151
10.0       149
         ...  
228.0        1
372.0        1
49.0         1
192.0        1
50.0         1
Name: count, Length: 84, dtype: int64

In [40]:
df["DIQ060__questionnaire"].value_counts(dropna=False).index.max()

999.0

In [16]:
insulin_treatment_period = df["DIQ060__questionnaire"].value_counts()
insulin_treatment_period[insulin_treatment_period.index > 90]

DIQ060__questionnaire
666.0    23
999.0    17
240.0    11
96.0     11
120.0     9
180.0     8
360.0     6
144.0     6
132.0     4
777.0     3
108.0     3
156.0     3
216.0     2
264.0     2
312.0     2
336.0     1
324.0     1
300.0     1
204.0     1
276.0     1
168.0     1
252.0     1
588.0     1
552.0     1
228.0     1
372.0     1
192.0     1
Name: count, dtype: int64

In [24]:
df["DIQ060U__questionnaire"].value_counts(dropna=False)

DIQ060U__questionnaire
NaN    99435
2.0     1453
1.0      428
Name: count, dtype: int64

In [26]:
screening_age_years = df["RIDAGEYR__demographics"].value_counts(dropna=False)
screening_age_years

RIDAGEYR__demographics
0     3696
2     3324
1     3102
80    2524
4     2510
      ... 
90      89
86      64
87      55
88      33
89      30
Name: count, Length: 91, dtype: int64

In [36]:
np.nan in screening_age_years.index

False

In [32]:
screening_age_years[screening_age_years.index==80]

RIDAGEYR__demographics
80    2524
Name: count, dtype: int64

## Rule based classification of diabetes type

**NOTE**: The number to meaning mapping can be found here: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DIQ.htm

Based on questionnaire responses:
- DIQ010__questionnaire: Doctor told you have diabetes?
    - 1 = Yes
    - 2 = No
    - 3 = Borderline
    - 7 = Refused
    - 9 = Don't know
    - NaN = Missing
- DID040__questionnaire: Age at diagnosing diabetes
    - 1 to 79 = Actual age
    - 80 = 80 or above
    - 666 = Less than 1 year
    - 777 = Refused
    - 999 = Don't know
    - NaN = Missing
- DIQ050__questionnaire: Are you currently taking insulin?
    - 1 = Yes
    - 2 = No
    - 7 = Refused
    - 9 = Don't know
    - NaN = Missing
- DIQ060__questionnaire: How long been taking insulin? (number of years or months - refer to DIQ060U)
    - 666 = Less than 1 month
    - 777 = Refused
    - 999 = Don't know
    - NaN = Missing
    - Other values = Actual values (years or months)
- DIQ060U__questionnaire: Unit for DIQ060
    - 1 = Months
    - 2 = Years
    - NaN = Missing
- DIQ070__questionnaire: Are you now taking diabetic pills to lower your blood sugar?
  - 1 = Yes
  - 2 = No
  - 7 = Refused
  - 9 = Don't know
  - NaN = Missing

**NOTE**: The number to meaning mapping can be found here: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm

- RIDAGEYR__demographics: Age in years at screening
  - 0 to 79 = Actual age
  - 80 = 80 or above (in newer nhanes)
  - 80 to 90 = Actual age (in older nhanes)
  - No missing values

### Logic

Only 55 people are refused or don't know for doctor told you have diabetes, so okay to skip them

```pseudocode
FOR EACH ROW:
    # Step 0 — initial filter
    IF DIQ010 != 1 THEN
        type = "Not diabetic / Borderline / Skipped"
        CONTINUE
    ENDIF

    # Step 1 — Exclusions
    # Treatment responses must not be 7 or 9
    IF DIQ050 in {7, 9} OR DIQ070 in {7, 9} THEN
        type = "Excluded"
        CONTINUE
    ENDIF

    # Insulin duration only matters if currently taking insulin
    years_taking_insulin = 0

    IF DIQ050 == 1 THEN
        # Need a valid DIQ060 (not refused/don’t know)
        IF DIQ060 in {777, 999} THEN
            type = "Excluded"
            CONTINUE
        ENDIF

        # compute years_taking_insulin
        IF DIQ060 == 666 THEN
            years_taking_insulin = 0.5/12
        ELSE
            IF DIQ060U == 1 THEN
                years_taking_insulin = DIQ060 / 12
            ELSE IF DIQ060U == 2 THEN
                years_taking_insulin = DIQ060
            ELSE
                # ambiguous unit, can exclude or treat as missing
                type = "Excluded"
                CONTINUE
            ENDIF
        ENDIF

        # Check “longer than time since diagnosis” only if DID040 is valid
        IF DID040 not in {777, 999} AND NOT MISSING THEN
            age_at_dx = (DID040 == 666 ? 6/12 : DID040)
            time_since_dx = RIDAGEYR - age_at_dx

            IF years_taking_insulin > time_since_dx THEN
                type = "Excluded"
                CONTINUE
            ENDIF
        ENDIF
    ENDIF

    # Now we are in analytic cohort

    # Step 2–4 — T2D based on treatment pattern
    IF DIQ050 == 2 AND DIQ070 == 2 THEN
        type = "T2D"
        CONTINUE
    ENDIF

    IF DIQ050 == 2 AND DIQ070 == 1 THEN
        type = "T2D"
        CONTINUE
    ENDIF

    IF DIQ050 == 1 AND DIQ070 == 1 THEN
        type = "T2D"
        CONTINUE
    ENDIF

    # Step 5 — Insulin-only group (DIQ050 == 1, DIQ070 == 2)
    # Need valid DID040 here
    IF DID040 in {777, 999} OR MISSING THEN
        type = "Excluded"    # or keep as "Unclassified"
        CONTINUE
    ENDIF

    age_at_dx = (DID040 == 666 ? 6/12 : DID040)
    years_since_dx = RIDAGEYR - age_at_dx

    IF years_since_dx - years_taking_insulin <= 1 THEN
        type = "T1D"
    ELSE
        type = "Possible-T2D"
    ENDIF
ENDFOR
```

In [13]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def classify_diabetes_type(row):
    """
    Classify diabetes type based on questionnaire responses.
    
    Returns: String indicating diabetes classification
    """
    
    # Step 0 — Initial filter
    if pd.isna(row["DIQ010__questionnaire"]) or row["DIQ010__questionnaire"] in [7, 9]:
        return "Skipped"
    if row["DIQ010__questionnaire"] == 2:
        return "Not diabetic"
    if row["DIQ010__questionnaire"] == 3:
        return "Borderline"
    
    # Step 1 — Exclusions
    # Treatment responses must not be 7 or 9
    if (row["DIQ050__questionnaire"] in [7, 9] or 
        row["DIQ070__questionnaire"] in [7, 9]):
        return "Excluded"
    
    # Insulin duration only matters if currently taking insulin
    years_taking_insulin = 0
    
    if row["DIQ050__questionnaire"] == 1:
        # Need a valid DIQ060 (not refused/don't know)
        if pd.isna(row["DIQ060__questionnaire"]) or row["DIQ060__questionnaire"] in [777, 999]:
            return "Excluded"
        
        # Compute years_taking_insulin
        if row["DIQ060__questionnaire"] == 666:
            years_taking_insulin = 0.5 / 12
        else:
            if row["DIQ060U__questionnaire"] == 1:
                years_taking_insulin = row["DIQ060__questionnaire"] / 12
            elif row["DIQ060U__questionnaire"] == 2:
                years_taking_insulin = row["DIQ060__questionnaire"]
            else:
                # Ambiguous unit
                return "Excluded"
        
        # Check "longer than time since diagnosis" only if DID040 is valid
        if (not pd.isna(row["DID040__questionnaire"]) and 
            row["DID040__questionnaire"] not in [777, 999]):
            
            age_at_dx = 6/12 if row["DID040__questionnaire"] == 666 else row["DID040__questionnaire"]
            time_since_dx = row["RIDAGEYR__demographics"] - age_at_dx
            
            if years_taking_insulin > time_since_dx:
                return "Excluded"
    
    # Now we are in analytic cohort
    
    # Step 2–4 — T2D based on treatment pattern
    if row["DIQ050__questionnaire"] == 2 and row["DIQ070__questionnaire"] == 2:
        return "T2D"
    
    if row["DIQ050__questionnaire"] == 2 and row["DIQ070__questionnaire"] == 1:
        return "T2D"
    
    if row["DIQ050__questionnaire"] == 1 and row["DIQ070__questionnaire"] == 1:
        return "T2D"
    
    # Step 5 — Insulin-only group (DIQ050 == 1, DIQ070 == 2)
    # Need valid DID040 here
    if pd.isna(row["DID040__questionnaire"]) or row["DID040__questionnaire"] in [777, 999]:
        return "Excluded"
    
    age_at_dx = 6/12 if row["DID040__questionnaire"] == 666 else row["DID040__questionnaire"]
    years_since_dx = row["RIDAGEYR__demographics"] - age_at_dx
    
    if years_since_dx - years_taking_insulin <= 1:
        return "T1D"
    else:
        return "Possible-T2D"


def classify_with_progress(df):
    """
    Apply classification to entire dataframe with progress bar.
    
    Args:
        df: DataFrame containing questionnaire and demographic data
        
    Returns:
        DataFrame with added 'Diabetes_Type' column
    """
    tqdm.pandas(desc="Classifying diabetes types")
    df["Diabetes_Type"] = df.progress_apply(classify_diabetes_type, axis=1)
    return df


# Example usage:
from datasets import load_dataset

dataset = load_dataset("AnnDinoushka/nhanes-training-merged-new", split="train")
df = dataset.to_pandas()
df = classify_with_progress(df)
print(df["Diabetes_Type"].value_counts())

Classifying diabetes types: 100%|██████████| 101316/101316 [00:42<00:00, 2395.82it/s]

Diabetes_Type
Not diabetic    83270
Skipped         10338
T2D              5424
Borderline       1068
Possible-T2D      499
Excluded          441
T1D               276
Name: count, dtype: int64





In [14]:
df.to_parquet("diabetes-type-classified.parquet")

In [11]:
dataset = load_dataset("AnnDinoushka/nhanes-training-merged-new", split="train")
df = dataset.to_pandas()

# Assuming your DataFrame is named 'df'
column_name = "DID040__questionnaire"

# 1. Check for NaN values OR values in the list [777, 999]
condition = (pd.isna(df[column_name])) | (df[column_name].isin([777, 999]))

# 2. Count the number of True values (rows that meet the condition)
condition.sum()

94839

In [5]:
df[df['DIQ060U__questionnaire'].isna()]['DIQ060__questionnaire'].describe()

count     39.000000
mean     802.615356
std      165.937302
min      666.000000
25%      666.000000
50%      666.000000
75%      999.000000
max      999.000000
Name: DIQ060__questionnaire, dtype: float64

In [7]:
df['DIQ060U__questionnaire'].value_counts(dropna=False)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


DIQ060U__questionnaire
NaN    99435
2.0     1453
1.0      428
Name: count, dtype: int64