In [9]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN_READ_ONLY")

In [10]:
from huggingface_hub import login

login(token=hf_token)

In [11]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("rtweera/nhanes-training-merged", split="train") # Example for a dataset on the Hub

In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def classify_row(row):
    # Step 1: invalid responses
    if row["DIQ050__questionnaire"] in [7, 9] or row["DIQ070__questionnaire"] in [7, 9] or row["DIQ060__questionnaire"] > row["DIQ010__questionnaire"]:
        return "Excluded"
    
    # Step 2: no insulin & no oral meds
    if row["DIQ050__questionnaire"] == 2 and row["DIQ070__questionnaire"] == 2:
        return "T2D"
    
    # Step 3: not taking insulin
    if row["DIQ050__questionnaire"] == 2:
        return "T2D"
    
    # Step 4: both insulin + oral meds
    if row["DIQ050__questionnaire"] == 1 and row["DIQ070__questionnaire"] == 1:
        return "T2D"
    
    # Step 5: only insulin
    if row["DIQ050__questionnaire"] == 1 and row["DIQ070__questionnaire"] == 2:
        if (row["DIQ010__questionnaire"] - row["DIQ060__questionnaire"]) <= 1:
            return "T1D"
        else:
            return "Possible-T2D"
    
    return "Excluded"

def classify_with_progress(df):
    tqdm.pandas(desc="Classifying diabetes types")
    df["Diabetes_Type"] = df.progress_apply(classify_row, axis=1)
    return df

In [12]:
df = dataset.to_pandas()
del dataset

In [13]:
df.shape

(101316, 4956)

In [15]:
df = classify_with_progress(df)

print(df["Diabetes_Type"].value_counts())

Classifying diabetes types: 100%|██████████| 101316/101316 [01:16<00:00, 1320.25it/s]

Diabetes_Type
T2D             78212
Excluded        23043
T1D                56
Possible-T2D        5
Name: count, dtype: int64





In [16]:
df.to_parquet('diabetes-type-classified-training-dataset.parquet')