# Cleaning Data and Creating DataLoader Function

## Imports

In [1]:
import os
import io
import sys
from pathlib import Path
from importlib.metadata import version
from logging import Logger
from typing import List, Optional
import logging
import joblib
import pandas as pd
from pandas.errors import ParserError
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Log the installed version of each listed package so a run can be reproduced later
packages = ["pandas", "importlib-metadata", "pyarrow"]
for package in packages:
    try:
        logger.info(f"{package} version: {version(package)}")
    except Exception as e:
        # A failed version lookup is only reported as a warning; the loop moves on to the next package
        logger.warning(f"Could not get version for package {package}: {e}")

INFO:__main__:pandas version: 2.3.2
INFO:__main__:importlib-metadata version: 8.7.0
INFO:__main__:pyarrow version: 21.0.0


## Load Dataframe from csv file in local directory

In [4]:
# Root of the data directory, relative to the notebook's working directory
DATA_ROOT = Path("../Data")
# Sub-directory of DATA_ROOT that holds the raw, intermediate and clean CSV files
RAW_DATA_DIR_NAME = "Downloaded-Data"

# File names for the three stages of the dataset
DATA_RAW_FILE_NAME = "data-RAW.csv"
DATA_INTERMEDIATE_FILE_NAME = "data-INTERMEDIATE.csv"
DATA_CLEAN_FILE_NAME = "data-CLEAN.csv"

# Full paths for each stage; all three live in the same directory
RAW_DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_RAW_FILE_NAME
INTERMEDIATE_DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_INTERMEDIATE_FILE_NAME
DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_CLEAN_FILE_NAME

# Sub-directory of DATA_ROOT for the saved category-to-code mappings
MAPPING_DIR_NAME = "Feature-Mapping"

FEATURE_MAPPING_FILE_NAME = "feature_mappings.json"

# Full path of the JSON file that stores the feature mappings
FEATURE_MAPPING_PATH = DATA_ROOT / MAPPING_DIR_NAME / FEATURE_MAPPING_FILE_NAME

In [5]:
df = pd.read_csv(RAW_DATA_PATH)

### See basic dataset info

In [6]:
df.shape

(100000, 37)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 100000 non-null  int64  
 1   gender                              100000 non-null  object 
 2   ethnicity                           100000 non-null  object 
 3   education_level                     100000 non-null  object 
 4   income_level                        100000 non-null  object 
 5   employment_status                   100000 non-null  object 
 6   smoking_status                      100000 non-null  object 
 7   alcohol_consumption_per_week        100000 non-null  int64  
 8   physical_activity_minutes_per_week  100000 non-null  int64  
 9   diet_score                          100000 non-null  float64
 10  sleep_hours_per_day                 100000 non-null  float64
 11  screen_time_hours_per_day  

## Clean and Save Data

### Dropping Columns

In [8]:
# Names of the columns to remove from the raw DataFrame
drop_these = [
    "diet_score",
    "age_group",
    "bmi_whr_group",
]

In [9]:
df.drop(columns=drop_these, inplace=True)

### Filling in NaN entries, if Any

In [10]:
# Replace missing alcohol_group entries with the "Light" category
df["alcohol_group"] = df["alcohol_group"].fillna("Light")

### Rearrange Columns

In [11]:
# Specify the target (label) column.
# Change this name when adapting the notebook to a different dataset.
target_col = "diagnosed_diabetes"

# Get all columns except target
cols = [col for col in df.columns if col != target_col]

# Sort columns alphabetically
sorted_cols = sorted(cols)

# Add target column at the end
final_cols = sorted_cols + [target_col]

# Rearrange DataFrame so the features come first (alphabetically) and the target is last
df = df[final_cols]

### Get final column order

In [12]:
print(df.columns.to_list())

['abdominal_obesity', 'activity_level', 'age', 'alcohol_consumption_per_week', 'alcohol_group', 'bmi', 'bmi_group', 'cardiovascular_history', 'cholesterol_total', 'diabetes_risk_score', 'diabetes_stage', 'diastolic_bp', 'education_level', 'employment_status', 'ethnicity', 'family_history_diabetes', 'gender', 'glucose_fasting', 'glucose_postprandial', 'hba1c', 'hdl_cholesterol', 'heart_rate', 'hypertension_history', 'income_level', 'insulin_level', 'ldl_cholesterol', 'physical_activity_minutes_per_week', 'screen_time_hours_per_day', 'sleep_hours_per_day', 'smoking_status', 'systolic_bp', 'triglycerides', 'waist_to_hip_ratio', 'diagnosed_diabetes']


### Saving the Intermediate (Cleaned, Not Yet Encoded) Data to File

In [13]:
df.to_csv(INTERMEDIATE_DATA_PATH, index=False)

## Load in the Intermediate Data File

In [14]:
df = pd.read_csv(INTERMEDIATE_DATA_PATH)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   abdominal_obesity                   100000 non-null  int64  
 1   activity_level                      100000 non-null  object 
 2   age                                 100000 non-null  int64  
 3   alcohol_consumption_per_week        100000 non-null  int64  
 4   alcohol_group                       100000 non-null  object 
 5   bmi                                 100000 non-null  float64
 6   bmi_group                           100000 non-null  object 
 7   cardiovascular_history              100000 non-null  int64  
 8   cholesterol_total                   100000 non-null  int64  
 9   diabetes_risk_score                 100000 non-null  float64
 10  diabetes_stage                      100000 non-null  object 
 11  diastolic_bp               

## Encoding the Data
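Maps every string-valued (categorical) column to float codes that the model can use. Codes are assigned by the alphabetical order of the category names, so their numeric order carries no meaning of its own.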

In [16]:
print(df.columns.to_list())

['abdominal_obesity', 'activity_level', 'age', 'alcohol_consumption_per_week', 'alcohol_group', 'bmi', 'bmi_group', 'cardiovascular_history', 'cholesterol_total', 'diabetes_risk_score', 'diabetes_stage', 'diastolic_bp', 'education_level', 'employment_status', 'ethnicity', 'family_history_diabetes', 'gender', 'glucose_fasting', 'glucose_postprandial', 'hba1c', 'hdl_cholesterol', 'heart_rate', 'hypertension_history', 'income_level', 'insulin_level', 'ldl_cholesterol', 'physical_activity_minutes_per_week', 'screen_time_hours_per_day', 'sleep_hours_per_day', 'smoking_status', 'systolic_bp', 'triglycerides', 'waist_to_hip_ratio', 'diagnosed_diabetes']


### Create Dictionaries for Encoding/Mapping Data

In [17]:
MAPPINGS = {}

### Examples

In [18]:
# Collect the distinct gender labels in sorted order, so the code assignment is deterministic for a given set of labels
genders = sorted(df["gender"].unique().tolist())

In [19]:
# Give each label a float code equal to its position in the sorted list
gender_mapping = {gender: float(idx) for idx, gender in enumerate(genders)}

In [20]:
gender_mapping

{'Female': 0.0, 'Male': 1.0, 'Other': 2.0}

In [21]:
MAPPINGS["GENDER_MAPPING"] = gender_mapping

In [22]:
ethnicities = sorted(df["ethnicity"].unique().tolist())

In [23]:
ethnicity_mapping = {ethnicity: float(idx) for idx, ethnicity in enumerate(ethnicities)}

In [24]:
MAPPINGS["ETHNICITY_MAPPING"] = ethnicity_mapping

In [25]:
education_levels = sorted(df["education_level"].unique().tolist())

In [26]:
education_level_mapping = {
    education_level: float(idx) for idx, education_level in enumerate(education_levels)
}

In [27]:
MAPPINGS["EDUCATION_LEVEL_MAPPING"] = education_level_mapping

In [28]:
income_levels = sorted(df["income_level"].unique().tolist())

In [29]:
income_level_mapping = {income_level: float(idx) for idx, income_level in enumerate(income_levels)}

In [30]:
MAPPINGS["INCOME_LEVEL_MAPPING"] = income_level_mapping

In [31]:
employment_statuses = sorted(df["employment_status"].unique().tolist())

In [32]:
employment_status_mapping = {
    employment_status: float(idx) for idx, employment_status in enumerate(employment_statuses)
}

In [33]:
MAPPINGS["EMPLOYMENT_STATUS_MAPPING"] = employment_status_mapping

In [34]:
smoking_statuses = sorted(df["smoking_status"].unique().tolist())

In [35]:
smoking_status_mapping = {
    smoking_status: float(idx) for idx, smoking_status in enumerate(smoking_statuses)
}

In [36]:
MAPPINGS["SMOKING_STATUS_MAPPING"] = smoking_status_mapping

In [37]:
diabetes_stages = sorted(df["diabetes_stage"].unique().tolist())

In [38]:
diabetes_stage_mapping = {
    diabetes_stage: float(idx) for idx, diabetes_stage in enumerate(diabetes_stages)
}

In [39]:
MAPPINGS["DIABETES_STAGE_MAPPING"] = diabetes_stage_mapping

In [40]:
bmi_groups = sorted(df["bmi_group"].unique().tolist())

In [41]:
bmi_group_mapping = {bmi_group: float(idx) for idx, bmi_group in enumerate(bmi_groups)}

In [42]:
MAPPINGS["BMI_GROUP_MAPPING"] = bmi_group_mapping

In [43]:
activity_levels = sorted(df["activity_level"].unique().tolist())

In [44]:
activity_level_mapping = {
    activity_level: float(idx) for idx, activity_level in enumerate(activity_levels)
}

In [45]:
MAPPINGS["ACTIVITY_LEVEL_MAPPING"] = activity_level_mapping

In [46]:
alcohol_groups = sorted(df["alcohol_group"].unique().tolist())

In [47]:
alcohol_group_mapping = {
    alcohol_group: float(idx) for idx, alcohol_group in enumerate(alcohol_groups)
}

In [48]:
MAPPINGS["ALCOHOL_GROUP_MAPPING"] = alcohol_group_mapping

In [49]:
MAPPINGS

{'GENDER_MAPPING': {'Female': 0.0, 'Male': 1.0, 'Other': 2.0},
 'ETHNICITY_MAPPING': {'Asian': 0.0,
  'Black': 1.0,
  'Hispanic': 2.0,
  'Other': 3.0,
  'White': 4.0},
 'EDUCATION_LEVEL_MAPPING': {'Graduate': 0.0,
  'Highschool': 1.0,
  'No formal': 2.0,
  'Postgraduate': 3.0},
 'INCOME_LEVEL_MAPPING': {'High': 0.0,
  'Low': 1.0,
  'Lower-Middle': 2.0,
  'Middle': 3.0,
  'Upper-Middle': 4.0},
 'EMPLOYMENT_STATUS_MAPPING': {'Employed': 0.0,
  'Retired': 1.0,
  'Student': 2.0,
  'Unemployed': 3.0},
 'SMOKING_STATUS_MAPPING': {'Current': 0.0, 'Former': 1.0, 'Never': 2.0},
 'DIABETES_STAGE_MAPPING': {'Gestational': 0.0,
  'No Diabetes': 1.0,
  'Pre-Diabetes': 2.0,
  'Type 1': 3.0,
  'Type 2': 4.0},
 'BMI_GROUP_MAPPING': {'Normal': 0.0,
  'Obese': 1.0,
  'Overweight': 2.0,
  'Underweight': 3.0},
 'ACTIVITY_LEVEL_MAPPING': {'High': 0.0, 'Low': 1.0, 'Moderate': 2.0},
 'ALCOHOL_GROUP_MAPPING': {'Heavy': 0.0, 'Light': 1.0, 'Moderate': 2.0}}

### Store Mappings for later use

In [50]:
import json

In [51]:
os.makedirs(DATA_ROOT / MAPPING_DIR_NAME, exist_ok=True)  # Create the feature-mapping directory if it does not already exist

In [52]:
# Save to JSON file
with open(FEATURE_MAPPING_PATH, "w") as f:
    json.dump(MAPPINGS, f, indent=4)  # indent for readability

In [53]:
df.head()

Unnamed: 0,abdominal_obesity,activity_level,age,alcohol_consumption_per_week,alcohol_group,bmi,bmi_group,cardiovascular_history,cholesterol_total,diabetes_risk_score,...,insulin_level,ldl_cholesterol,physical_activity_minutes_per_week,screen_time_hours_per_day,sleep_hours_per_day,smoking_status,systolic_bp,triglycerides,waist_to_hip_ratio,diagnosed_diabetes
0,0,High,58,0,Light,30.5,Obese,0,239,29.6,...,6.36,160,215,7.9,7.9,Never,134,145,0.89,1
1,0,Moderate,48,1,Light,23.1,Normal,0,116,23.0,...,2.0,50,143,8.7,6.5,Former,129,30,0.8,0
2,0,Low,60,1,Light,22.2,Normal,0,213,44.7,...,5.07,99,57,8.1,10.0,Never,115,36,0.81,1
3,1,Low,74,0,Light,26.8,Overweight,0,171,38.2,...,5.28,79,49,5.2,6.6,Never,120,140,0.88,1
4,0,Moderate,46,1,Light,21.2,Normal,0,210,23.5,...,12.74,125,109,5.0,7.4,Never,92,160,0.78,1


### Apply Encoding/Mapping to the columns

In [54]:
# Replace the string labels in each categorical column with the float codes from its mapping.
# Series.map yields NaN for any value that is not a key of the mapping dictionary.
df["gender"] = df["gender"].map(gender_mapping)
df["ethnicity"] = df["ethnicity"].map(ethnicity_mapping)
df["education_level"] = df["education_level"].map(education_level_mapping)
df["income_level"] = df["income_level"].map(income_level_mapping)
df["employment_status"] = df["employment_status"].map(employment_status_mapping)
df["smoking_status"] = df["smoking_status"].map(smoking_status_mapping)
df["diabetes_stage"] = df["diabetes_stage"].map(diabetes_stage_mapping)
df["bmi_group"] = df["bmi_group"].map(bmi_group_mapping)
df["activity_level"] = df["activity_level"].map(activity_level_mapping)
df["alcohol_group"] = df["alcohol_group"].map(alcohol_group_mapping)

### Check the new datatypes to make sure all string objects are converted

In [55]:
df.dtypes

abdominal_obesity                       int64
activity_level                        float64
age                                     int64
alcohol_consumption_per_week            int64
alcohol_group                         float64
bmi                                   float64
bmi_group                             float64
cardiovascular_history                  int64
cholesterol_total                       int64
diabetes_risk_score                   float64
diabetes_stage                        float64
diastolic_bp                            int64
education_level                       float64
employment_status                     float64
ethnicity                             float64
family_history_diabetes                 int64
gender                                float64
glucose_fasting                         int64
glucose_postprandial                    int64
hba1c                                 float64
hdl_cholesterol                         int64
heart_rate                        

In [56]:
# Show all columns without truncation
pd.set_option("display.max_columns", None)

# Optional: Prevent column width truncation
pd.set_option("display.max_colwidth", None)

In [57]:
df.describe(include="all")  # Show all columns with different types of data

Unnamed: 0,abdominal_obesity,activity_level,age,alcohol_consumption_per_week,alcohol_group,bmi,bmi_group,cardiovascular_history,cholesterol_total,diabetes_risk_score,diabetes_stage,diastolic_bp,education_level,employment_status,ethnicity,family_history_diabetes,gender,glucose_fasting,glucose_postprandial,hba1c,hdl_cholesterol,heart_rate,hypertension_history,income_level,insulin_level,ldl_cholesterol,physical_activity_minutes_per_week,screen_time_hours_per_day,sleep_hours_per_day,smoking_status,systolic_bp,triglycerides,waist_to_hip_ratio,diagnosed_diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.32618,1.07844,50.12041,2.00367,1.31435,25.612653,1.10188,0.0792,185.97811,30.222362,3.11133,75.23249,1.00007,0.69807,2.53327,0.21941,0.51797,111.11712,160.03505,6.520776,54.04279,69.63287,0.2508,2.5005,9.061242,103.00043,118.91164,5.996468,6.997818,1.39637,115.79961,121.46265,0.856078,0.59998
std,0.468816,0.797342,15.6046,1.417779,0.474021,3.586705,0.971818,0.270052,32.013005,9.061505,1.11717,8.20425,1.00013,1.024099,1.490126,0.413849,0.538461,13.59561,30.935472,0.813921,10.267374,8.371954,0.433476,1.115371,4.95406,33.390256,84.409662,2.468406,1.094622,0.80174,14.284073,43.372619,0.046837,0.489904
min,0.0,0.0,18.0,0.0,0.0,15.0,0.0,0.0,100.0,2.7,0.0,50.0,0.0,0.0,0.0,0.0,0.0,60.0,70.0,4.0,20.0,40.0,0.0,0.0,2.0,50.0,0.0,0.5,3.0,0.0,90.0,30.0,0.67,0.0
25%,0.0,0.0,39.0,1.0,1.0,23.2,0.0,0.0,164.0,23.8,2.0,70.0,0.0,0.0,1.0,0.0,0.0,102.0,139.0,5.97,47.0,64.0,0.0,2.0,5.09,78.0,57.0,4.3,6.3,1.0,106.0,91.0,0.82,0.0
50%,0.0,1.0,50.0,2.0,1.0,25.6,1.0,0.0,186.0,29.0,4.0,75.0,1.0,0.0,3.0,0.0,0.0,111.0,160.0,6.52,54.0,70.0,0.0,3.0,8.79,102.0,100.0,6.0,7.0,2.0,116.0,121.0,0.86,1.0
75%,1.0,2.0,61.0,3.0,2.0,28.0,2.0,0.0,208.0,35.6,4.0,81.0,1.0,1.0,4.0,0.0,1.0,120.0,181.0,7.07,61.0,75.0,1.0,3.0,12.45,126.0,160.0,7.7,7.7,2.0,125.0,151.0,0.89,1.0
max,1.0,2.0,90.0,10.0,2.0,39.2,3.0,1.0,318.0,67.2,4.0,110.0,3.0,3.0,4.0,1.0,2.0,172.0,287.0,9.8,98.0,105.0,1.0,4.0,32.22,263.0,833.0,16.8,10.0,2.0,179.0,344.0,1.06,1.0


In [58]:
df.head()

Unnamed: 0,abdominal_obesity,activity_level,age,alcohol_consumption_per_week,alcohol_group,bmi,bmi_group,cardiovascular_history,cholesterol_total,diabetes_risk_score,diabetes_stage,diastolic_bp,education_level,employment_status,ethnicity,family_history_diabetes,gender,glucose_fasting,glucose_postprandial,hba1c,hdl_cholesterol,heart_rate,hypertension_history,income_level,insulin_level,ldl_cholesterol,physical_activity_minutes_per_week,screen_time_hours_per_day,sleep_hours_per_day,smoking_status,systolic_bp,triglycerides,waist_to_hip_ratio,diagnosed_diabetes
0,0,0.0,58,0,1.0,30.5,1.0,0,239,29.6,4.0,78,1.0,0.0,0.0,0,1.0,136,236,8.18,41,68,0,2.0,6.36,160,215,7.9,7.9,2.0,134,145,0.89,1
1,0,2.0,48,1,1.0,23.1,0.0,0,116,23.0,1.0,76,1.0,0.0,4.0,0,0.0,93,150,5.63,55,67,0,3.0,2.0,50,143,8.7,6.5,1.0,129,30,0.8,0
2,0,1.0,60,1,1.0,22.2,0.0,0,213,44.7,4.0,73,1.0,3.0,2.0,1,1.0,118,195,7.51,66,74,0,3.0,5.07,99,57,8.1,10.0,2.0,115,36,0.81,1
3,1,1.0,74,0,1.0,26.8,2.0,0,171,38.2,4.0,93,1.0,1.0,1.0,0,0.0,139,253,9.03,50,68,0,1.0,5.28,79,49,5.2,6.6,2.0,120,140,0.88,1
4,0,2.0,46,1,1.0,21.2,0.0,0,210,23.5,4.0,67,0.0,1.0,4.0,0,1.0,137,184,7.2,52,67,0,3.0,12.74,125,109,5.0,7.4,2.0,92,160,0.78,1


In [59]:
type(df["gender"][0])

numpy.float64

### Convert all columns into Specific datatypes
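* The encoding step leaves NaN for any value that is missing from its mapping, so NaNs may be present at this point. If any appear, fix the mappings and run the whole process again.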

In [60]:
# Cast every column to float32; decimals such as 22.2 are stored as the nearest float32 value (shown as 22.200001)
df = df.astype("float32")

### Print details of the Final Data Frame

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   abdominal_obesity                   100000 non-null  float32
 1   activity_level                      100000 non-null  float32
 2   age                                 100000 non-null  float32
 3   alcohol_consumption_per_week        100000 non-null  float32
 4   alcohol_group                       100000 non-null  float32
 5   bmi                                 100000 non-null  float32
 6   bmi_group                           100000 non-null  float32
 7   cardiovascular_history              100000 non-null  float32
 8   cholesterol_total                   100000 non-null  float32
 9   diabetes_risk_score                 100000 non-null  float32
 10  diabetes_stage                      100000 non-null  float32
 11  diastolic_bp               

### End of Data Mapping

In [62]:
df.head()

Unnamed: 0,abdominal_obesity,activity_level,age,alcohol_consumption_per_week,alcohol_group,bmi,bmi_group,cardiovascular_history,cholesterol_total,diabetes_risk_score,diabetes_stage,diastolic_bp,education_level,employment_status,ethnicity,family_history_diabetes,gender,glucose_fasting,glucose_postprandial,hba1c,hdl_cholesterol,heart_rate,hypertension_history,income_level,insulin_level,ldl_cholesterol,physical_activity_minutes_per_week,screen_time_hours_per_day,sleep_hours_per_day,smoking_status,systolic_bp,triglycerides,waist_to_hip_ratio,diagnosed_diabetes
0,0.0,0.0,58.0,0.0,1.0,30.5,1.0,0.0,239.0,29.6,4.0,78.0,1.0,0.0,0.0,0.0,1.0,136.0,236.0,8.18,41.0,68.0,0.0,2.0,6.36,160.0,215.0,7.9,7.9,2.0,134.0,145.0,0.89,1.0
1,0.0,2.0,48.0,1.0,1.0,23.1,0.0,0.0,116.0,23.0,1.0,76.0,1.0,0.0,4.0,0.0,0.0,93.0,150.0,5.63,55.0,67.0,0.0,3.0,2.0,50.0,143.0,8.7,6.5,1.0,129.0,30.0,0.8,0.0
2,0.0,1.0,60.0,1.0,1.0,22.200001,0.0,0.0,213.0,44.700001,4.0,73.0,1.0,3.0,2.0,1.0,1.0,118.0,195.0,7.51,66.0,74.0,0.0,3.0,5.07,99.0,57.0,8.1,10.0,2.0,115.0,36.0,0.81,1.0
3,1.0,1.0,74.0,0.0,1.0,26.799999,2.0,0.0,171.0,38.200001,4.0,93.0,1.0,1.0,1.0,0.0,0.0,139.0,253.0,9.03,50.0,68.0,0.0,1.0,5.28,79.0,49.0,5.2,6.6,2.0,120.0,140.0,0.88,1.0
4,0.0,2.0,46.0,1.0,1.0,21.200001,0.0,0.0,210.0,23.5,4.0,67.0,0.0,1.0,4.0,0.0,1.0,137.0,184.0,7.2,52.0,67.0,0.0,3.0,12.74,125.0,109.0,5.0,7.4,2.0,92.0,160.0,0.78,1.0


# SAVING FINAL DATA TO FILE

In [63]:
df.to_csv(DATA_PATH, index=False)

# Read and Test datatypes

In [64]:
# Reload the saved file to check that the data can be read back as float32
dq = pd.read_csv(
    DATA_PATH, dtype="float32"
)  # read_csv does not pick float32 on its own, so the dtype has to be passed explicitly

In [65]:
dq.head()

Unnamed: 0,abdominal_obesity,activity_level,age,alcohol_consumption_per_week,alcohol_group,bmi,bmi_group,cardiovascular_history,cholesterol_total,diabetes_risk_score,diabetes_stage,diastolic_bp,education_level,employment_status,ethnicity,family_history_diabetes,gender,glucose_fasting,glucose_postprandial,hba1c,hdl_cholesterol,heart_rate,hypertension_history,income_level,insulin_level,ldl_cholesterol,physical_activity_minutes_per_week,screen_time_hours_per_day,sleep_hours_per_day,smoking_status,systolic_bp,triglycerides,waist_to_hip_ratio,diagnosed_diabetes
0,0.0,0.0,58.0,0.0,1.0,30.5,1.0,0.0,239.0,29.6,4.0,78.0,1.0,0.0,0.0,0.0,1.0,136.0,236.0,8.18,41.0,68.0,0.0,2.0,6.36,160.0,215.0,7.9,7.9,2.0,134.0,145.0,0.89,1.0
1,0.0,2.0,48.0,1.0,1.0,23.1,0.0,0.0,116.0,23.0,1.0,76.0,1.0,0.0,4.0,0.0,0.0,93.0,150.0,5.63,55.0,67.0,0.0,3.0,2.0,50.0,143.0,8.7,6.5,1.0,129.0,30.0,0.8,0.0
2,0.0,1.0,60.0,1.0,1.0,22.200001,0.0,0.0,213.0,44.700001,4.0,73.0,1.0,3.0,2.0,1.0,1.0,118.0,195.0,7.51,66.0,74.0,0.0,3.0,5.07,99.0,57.0,8.1,10.0,2.0,115.0,36.0,0.81,1.0
3,1.0,1.0,74.0,0.0,1.0,26.799999,2.0,0.0,171.0,38.200001,4.0,93.0,1.0,1.0,1.0,0.0,0.0,139.0,253.0,9.03,50.0,68.0,0.0,1.0,5.28,79.0,49.0,5.2,6.6,2.0,120.0,140.0,0.88,1.0
4,0.0,2.0,46.0,1.0,1.0,21.200001,0.0,0.0,210.0,23.5,4.0,67.0,0.0,1.0,4.0,0.0,1.0,137.0,184.0,7.2,52.0,67.0,0.0,3.0,12.74,125.0,109.0,5.0,7.4,2.0,92.0,160.0,0.78,1.0


In [66]:
print(dq.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   abdominal_obesity                   100000 non-null  float32
 1   activity_level                      100000 non-null  float32
 2   age                                 100000 non-null  float32
 3   alcohol_consumption_per_week        100000 non-null  float32
 4   alcohol_group                       100000 non-null  float32
 5   bmi                                 100000 non-null  float32
 6   bmi_group                           100000 non-null  float32
 7   cardiovascular_history              100000 non-null  float32
 8   cholesterol_total                   100000 non-null  float32
 9   diabetes_risk_score                 100000 non-null  float32
 10  diabetes_stage                      100000 non-null  float32
 11  diastolic_bp               

# Creating DataLoaders

### Clean Data Function

In [67]:
def clean_data(
    df: pd.DataFrame,
    logger: Logger,
    extra_dropped_columns: Optional[List[str]] = None,
    show_dataframe_info: bool = True,
) -> pd.DataFrame:
    """Clean the input DataFrame and encode its categorical columns as floats.

    The caller's DataFrame is never modified; all work is done on a copy.
    Steps, in order:

    1. Drop ``extra_dropped_columns`` (skipped when ``None`` or empty).
    2. Fill missing ``alcohol_group`` entries with ``"Light"``.
    3. Encode each categorical column with float codes assigned by the sorted
       order of the labels present in *this* DataFrame. The codes therefore
       depend on which labels occur in the data passed in.
    4. Drop every row that still contains a missing value.
    5. Cast the whole frame to ``float32``.

    Args:
        df (pd.DataFrame): The input DataFrame to be cleaned.
        logger (Logger): Logger object for logging information.
        extra_dropped_columns (List[str], optional): Columns to drop from the features in original dataset.
        show_dataframe_info (bool): Flag to toggle logging DataFrame info.

    Returns:
        pd.DataFrame: The cleaned DataFrame, with every column as ``float32``.

    Raises:
        RuntimeError: If the requested columns cannot be dropped (for example,
            a name in ``extra_dropped_columns`` is not a column of ``df``).
        KeyError: If one of the expected categorical columns is missing.
    """
    # Columns holding string labels that must be turned into float codes
    categorical_columns = (
        "gender",
        "ethnicity",
        "education_level",
        "income_level",
        "employment_status",
        "smoking_status",
        "diabetes_stage",
        "bmi_group",
        "activity_level",
        "alcohol_group",
    )

    # Log the initial state of the DataFrame
    logger.info("Initial DataFrame shape: %s", df.shape)

    if show_dataframe_info:
        buffer = io.StringIO()  # df.info() prints to a stream, so capture it for the logger
        df.info(buf=buffer)
        logger.info("Initial DataFrame info:\n %s", buffer.getvalue())

    # Drop any unused columns. Both branches produce a new frame so that the
    # assignments below never write into the caller's DataFrame.
    if extra_dropped_columns:
        try:
            df = df.drop(columns=extra_dropped_columns)
        except Exception as e:
            raise RuntimeError(f"Problem dropping columns:\n{e}") from e
    else:
        df = df.copy()

    # Replacing any entry data
    df["alcohol_group"] = df["alcohol_group"].fillna("Light")

    # ================================
    # EXAMPLE PROCESS
    # ================================

    logger.info("Encoding categorical variables...")
    for column in categorical_columns:
        # Missing values are excluded before sorting: NaN cannot be ordered
        # against strings. They stay NaN after map() and are removed below.
        categories = sorted(df[column].dropna().unique().tolist())
        mapping = {category: float(idx) for idx, category in enumerate(categories)}
        df[column] = df[column].map(mapping)

    # ================================
    # END OF MAPPING/ENCODING EXAMPLE
    # ================================

    # Handle missing values (if any)
    if df.isnull().sum().sum() > 0:
        logger.info("Handling missing values...")
        df = df.dropna()  # Example: Drop rows with missing values
        logger.info("DataFrame shape after dropping missing values: %s", df.shape)

    # Convert to 'float32' to reduce memory usage
    logger.info("Converting Entire Data Frame to 'float32'...")
    df = df.astype("float32")

    if show_dataframe_info:
        # Use a fresh buffer so the final report does not include the initial one
        buffer = io.StringIO()
        df.info(buf=buffer)
        logger.info("Final DataFrame info:\n %s", buffer.getvalue())

    return df

### Custom Dataset Class

In [68]:
class CustomDataset(Dataset):
    """Map-style dataset backed by one CSV file of numeric features plus a label column.

    The CSV is read once in ``__init__`` and converted to tensors up front, so
    ``__getitem__`` is a plain tensor index and does no pandas work per sample.
    The DataFrame stays available as ``self.data`` for inspection, but the
    tensors are a snapshot taken at construction: later changes to
    ``self.data`` are not reflected in the samples returned.
    """

    def __init__(self, csv_file: str = "../Data/DataSplits/test.csv", label_column: str = "Label"):
        """Initializer for the Dataset class.

        Args:
            csv_file (str): Path to the CSV file containing the dataset.
            label_column (str): The name of the column indicating the label.

        Raises:
            FileNotFoundError: If ``csv_file`` does not exist.
            KeyError: If ``label_column`` is not a column of the CSV file.
        """
        try:
            self.data = pd.read_csv(csv_file)  # Assign a pandas data frame
        except FileNotFoundError as e:  # Re-raise with the offending path, keeping the cause
            raise FileNotFoundError(f"File not found: {csv_file}") from e

        if label_column not in self.data.columns:
            raise KeyError(f"Label column {label_column!r} not found in {csv_file}")

        # Define feature and label columns
        self.label_column = label_column
        # Omit the label column to create the list of feature columns
        self.feature_columns = self.data.columns.drop([self.label_column])

        # Convert once here rather than per sample: features as float32,
        # labels as long (float labels are truncated by the cast).
        self._features = torch.tensor(
            self.data[self.feature_columns].to_numpy(dtype="float32"), dtype=torch.float32
        )
        self._labels = torch.tensor(
            self.data[self.label_column].to_numpy(copy=True), dtype=torch.long
        )

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Returns a tuple (features, label) for the given index.

        Args:
            index (int): Index of the data sample to retrieve.

        Returns:
            tuple: (features, label) where features is a float32 tensor of input features and
                label is the corresponding label as a scalar long tensor.

        Raises:
            IndexError: If ``index`` is out of range.
        """
        return (self._features[index], self._labels[index])

    def __len__(self) -> int:
        """Returns the amount of samples in the dataset."""
        return len(self.data)

### Data Pipeline Function

In [69]:
def _load_or_download_dataset(
    logger: Logger,
    dataset_url: str,
    data_root: Path,
    clean_data_path: Path,
    extra_dropped_columns: Optional[List[str]],
) -> pd.DataFrame:
    """Return the cleaned dataset, downloading, cleaning, and caching it if no local copy exists."""
    if clean_data_path.exists():
        logger.info(f"CSV file detected, reading from '{data_root}'")
        # Convert data to float32 instead of float64
        return pd.read_csv(clean_data_path, dtype="float32")

    logger.info(f"Downloading CSV file from '{dataset_url}'\nand saving into '{data_root}'")
    unexpected_io_message = (
        f"An unexpected error occurred when downloading or saving the "
        f"dataset from '{dataset_url}' to '{clean_data_path}'"
    )

    # Each step has its own try block so an error is reported under the step that caused it
    try:
        os.makedirs(data_root, exist_ok=True)  # Create the Data Root Directory
        # Keep data as is, may not be able to expect float32 data
        df = pd.read_csv(dataset_url)
    except OSError as e:
        raise RuntimeError(f"OS error occurred: {e}") from e
    except ParserError as e:
        raise RuntimeError(f"Failed to parse CSV from '{dataset_url}'") from e
    except Exception as e:
        raise RuntimeError(f"{unexpected_io_message}:\n{e}") from e

    # Clean the data before saving
    try:
        df = clean_data(df, logger, extra_dropped_columns=extra_dropped_columns)
    except ValueError as e:
        raise RuntimeError(f"Data cleaning error:\n{e}") from e
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred cleaning the dataset:\n{e}") from e

    try:
        df.to_csv(clean_data_path, index=False)  # Save the file, omitting the row index
    except OSError as e:
        raise RuntimeError(f"OS error occurred: {e}") from e
    except Exception as e:
        raise RuntimeError(f"{unexpected_io_message}:\n{e}") from e

    return df


def _load_scalers(
    logger: Logger,
    feature_scaler_path: Path,
    label_scaler_path: Path,
    use_label_scaler: bool,
) -> tuple[MinMaxScaler, Optional[MinMaxScaler]]:
    """Load the previously saved feature scaler and, if requested, the label scaler."""
    # NOTE: joblib.load unpickles arbitrary objects; only load scaler files that this
    # pipeline wrote itself, never files from an untrusted source.
    try:
        feature_scaler = joblib.load(feature_scaler_path)
        logger.info(f"Feature scaler loaded from: ({feature_scaler_path})")
        label_scaler = None  # Omit the label scaler loading unless requested
        if use_label_scaler:
            label_scaler = joblib.load(label_scaler_path)
            logger.info(f"Label scaler loaded from: ({label_scaler_path})")
    except FileNotFoundError as e:
        raise RuntimeError(f"Scaler file not found: {e}") from e
    except EOFError as e:
        raise RuntimeError(f"Scaler file appears to be empty or corrupted: {e}") from e
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred when loading scalers: {e}") from e

    return feature_scaler, label_scaler


def _generate_splits(
    logger: Logger,
    df: pd.DataFrame,
    target_column: str,
    use_label_scaler: bool,
    oversample_train: bool,
    split_paths: tuple[Path, Path, Path],
    scaler_paths: tuple[Path, Path],
) -> tuple[MinMaxScaler, Optional[MinMaxScaler]]:
    """Split, scale, and save the train/test/validation CSVs and return the fitted scalers."""
    train_path, test_path, validation_path = split_paths
    feature_scaler_path, label_scaler_path = scaler_paths

    # 90% of the rows go to training; the remaining 10% is halved into validation and test
    TRAIN_SPLIT_PERCENTAGE = 0.9
    VALIDATION_SPLIT_PERCENTAGE = 0.5

    # Create the scaler objects
    feature_scaler = MinMaxScaler()
    label_scaler = MinMaxScaler() if use_label_scaler else None

    try:
        df_features = df.drop(columns=[target_column], inplace=False)
        # "[[]]" returns a one-column DataFrame (shape (-1, 1)) instead of a Series
        df_labels = df[[target_column]]
    except KeyError as e:
        raise KeyError(
            f"One or more specified columns to drop do not exist in the DataFrame: {e}"
        ) from e

    # Split into smaller DataFrames for the Train, Test, and Validation splits
    X_train, X_inter, Y_train, Y_inter = train_test_split(
        df_features,
        df_labels,
        test_size=1 - TRAIN_SPLIT_PERCENTAGE,
        random_state=42,
    )
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_inter, Y_inter, test_size=1 - VALIDATION_SPLIT_PERCENTAGE, random_state=42
    )

    # ================================
    # ADD OVERSAMPLING AND OTHER DATA BALANCING TECHNIQUES HERE
    # ================================
    if oversample_train:
        # Resample AFTER splitting and on the training split only, so duplicated rows
        # cannot end up in the validation or test data.
        ros = RandomOverSampler(random_state=42)
        X_train, Y_train = ros.fit_resample(X_train, Y_train)
        logger.info(f"Training split oversampled to ({X_train.shape[0]}) rows")

    # Fit the scalers on the training data only
    feature_scaler.fit(X_train)
    if use_label_scaler:
        label_scaler.fit(Y_train)

    # Save the fitted scaler object(s)
    try:
        joblib.dump(feature_scaler, feature_scaler_path)
        logger.info(f"Feature scaler stored in: ({feature_scaler_path})")
        if use_label_scaler:
            joblib.dump(label_scaler, label_scaler_path)
            logger.info(f"Label scaler stored in: ({label_scaler_path})")
    except FileNotFoundError as e:
        raise RuntimeError(f"Save path not found: {e}") from e
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred when saving  Scaler(s): {e}") from e

    # Scale all Feature Inputs
    X_train_scaled = feature_scaler.transform(X_train)
    X_validation_scaled = feature_scaler.transform(X_validation)
    X_test_scaled = feature_scaler.transform(X_test)

    if use_label_scaler:  # HANDLE EACH ON A CASE BY CASE BASIS
        Y_train = label_scaler.transform(Y_train)
        Y_validation = label_scaler.transform(Y_validation)
        Y_test = label_scaler.transform(Y_test)

    logger.info(f"Train Features (Scaled) Shape: {X_train_scaled.shape}")
    logger.info(f"Validation Features (Scaled) Shape: {X_validation_scaled.shape}")
    logger.info(f"Test Features (Scaled) Shape: {X_test_scaled.shape}")

    label_tag = " (Scaled)" if use_label_scaler else ""
    logger.info(f"Train Labels{label_tag} Shape: {Y_train.shape}")
    logger.info(f"Validation Labels{label_tag} Shape: {Y_validation.shape}")
    logger.info(f"Test Labels{label_tag} Shape: {Y_test.shape}")

    # Define the column names of the features and label
    features_names = df_features.columns
    label_name = df_labels.columns

    def _recombine(scaled_features, labels) -> pd.DataFrame:
        # The scaled features are a bare array, so the label index is reset to line up with it
        features_df = pd.DataFrame(scaled_features, columns=features_names)
        labels_df = pd.DataFrame(labels, columns=label_name)
        return pd.concat([features_df, labels_df.reset_index(drop=True)], axis=1)

    train_data_frame = _recombine(X_train_scaled, Y_train)
    test_data_frame = _recombine(X_test_scaled, Y_test)
    validation_data_frame = _recombine(X_validation_scaled, Y_validation)

    # Saving the split data to csv files
    try:
        train_data_frame.to_csv(train_path, index=False)
        test_data_frame.to_csv(test_path, index=False)
        validation_data_frame.to_csv(validation_path, index=False)
    except FileNotFoundError as e:
        raise RuntimeError(f"Save path not found: {e}") from e
    except Exception as e:
        raise RuntimeError(
            f"An unexpected error occurred when saving datasets to CSV files:\n{e}"
        ) from e

    return feature_scaler, label_scaler


def data_pipeline(
    logger: Logger,
    dataset_url: str,
    root_data_dir: str = "../Data",
    data_file_path: str = "Dataset.csv",
    data_splits_dir: str = "DataSplits",
    scaler_dir: str = "Scalers",
    target_column: str = "Target",
    use_label_scaler: bool = False,  # TOGGLE IF NEEDED
    extra_dropped_columns: Optional[List[str]] = None,
    batch_size: int = 64,
    num_workers: int = 0,
    pin_memory: bool = False,
    drop_last: bool = True,
    oversample_train: bool = False,
) -> tuple[
    Dataset,
    Dataset,
    Dataset,
    DataLoader,
    DataLoader,
    DataLoader,
    MinMaxScaler,
    Optional[MinMaxScaler],
]:
    """This function prepares the train, test, and validation datasets.

    Steps:
        1. Read the cleaned CSV from disk, or download it from ``dataset_url``, clean it,
           and cache it locally.
        2. If train/test/validation CSVs already exist, load the saved scaler(s).
           Otherwise split the data (90% train, 5% validation, 5% test), fit the
           scaler(s) on the training split, scale, and save everything to disk.
        3. Build a ``CustomDataset`` and ``DataLoader`` for each split and log the shape
           and dtype of one batch from each loader.

    Args:
        logger (Logger): The logger instance to log messages.
        dataset_url (str): The URL to download the dataset from, if not found locally.
        root_data_dir (str): The root of the Data Directory
        data_file_path (str): The name of the original dataset (with .csv file extension).
        data_splits_dir (str): Path to the train, test, and validation datasets.
        scaler_dir (str): Path to the feature and label scalers.
        target_column (str): The name of the target column to predict.
        use_label_scaler (bool): Dictates whether to use label scaler
        extra_dropped_columns (List[str], optional): Columns to drop from the features in
            original dataset. Only applied when the dataset is downloaded and cleaned.
        batch_size (int): The dataloader's batch_size.
        num_workers (int): The dataloader's number of workers.
        pin_memory (bool): The dataloader's pin memory option.
        drop_last (bool): The dataloader's drop_last option.
        oversample_train (bool): When True, randomly oversample the training split (and
            only the training split) after splitting. Only takes effect when the splits
            are generated, not when existing split files are reused.

    Returns:
        train_dataset (Dataset): Dataset Class for the training dataset.
        test_dataset (Dataset): Dataset Class for the test dataset.
        validation_dataset (Dataset): Dataset Class for the validation dataset.
        train_dataloader (DataLoader): The train dataloader.
        test_dataloader (DataLoader): The test dataloader.
        validation_dataloader (DataLoader): The validation dataloader.
        feature_scaler (MinMaxScaler): The scaler used to scale the features of the model input.
        label_scaler (MinMaxScaler | None): The scaler used to scale the labels of the
            model input, or None when ``use_label_scaler`` is False.

    Raises:
        ValueError: If ``root_data_dir``, ``data_file_path``, or ``data_splits_dir`` is empty.
        RuntimeError: If downloading, cleaning, saving, or scaler loading fails.
        KeyError: If ``target_column`` is not a column of the dataset.
    """
    # Check for empty strings at the beginning
    if not root_data_dir or not data_file_path or not data_splits_dir:
        raise ValueError("File and directory paths cannot be empty strings.")

    data_root = Path(root_data_dir)
    clean_data_path = data_root / data_file_path  # Path to the complete dataset

    df = _load_or_download_dataset(
        logger, dataset_url, data_root, clean_data_path, extra_dropped_columns
    )

    # Define the paths for the data splits and scalers
    splits_dir = data_root / data_splits_dir
    scalers_dir = data_root / scaler_dir

    train_path = splits_dir / "train.csv"
    test_path = splits_dir / "test.csv"
    validation_path = splits_dir / "val.csv"
    split_paths = (train_path, test_path, validation_path)

    feature_scaler_path = scalers_dir / "feature-scaler.joblib"
    label_scaler_path = scalers_dir / "label-scaler.joblib"

    if all(path.exists() for path in split_paths):
        logger.info(
            f"Train, Test, and Validation CSV datasets detected in '{splits_dir}.' Skipping generation and loading scaler(s)"
        )
        feature_scaler, label_scaler = _load_scalers(
            logger, feature_scaler_path, label_scaler_path, use_label_scaler
        )
    else:
        logger.info(f"Datasets not found in '{splits_dir}' or incomplete. Generating datasets...")
        os.makedirs(splits_dir, exist_ok=True)  # Create the Data Splits Parent Directory
        os.makedirs(scalers_dir, exist_ok=True)  # Create the Scaler Parent Directory
        feature_scaler, label_scaler = _generate_splits(
            logger,
            df,
            target_column,
            use_label_scaler,
            oversample_train,
            split_paths,
            (feature_scaler_path, label_scaler_path),
        )

    # Creating Datasets from the stored datasets
    logger.info(f"INITIALIZING DATASETS")
    train_dataset = CustomDataset(csv_file=train_path, label_column=target_column)
    test_dataset = CustomDataset(csv_file=test_path, label_column=target_column)
    val_dataset = CustomDataset(csv_file=validation_path, label_column=target_column)

    logger.info(
        f"Creating DataLoaders with 'batch_size'=({batch_size}), 'num_workers'=({num_workers}), 'pin_memory'=({pin_memory}). Training dataset 'drop_last'=({drop_last})"
    )
    # NOTE(review): drop_last is applied to all three loaders although the log message
    # above mentions only the training dataset. With drop_last=True the validation/test
    # loaders silently skip their final partial batch. Confirm this is intended.
    common_loader_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
        "drop_last": drop_last,
    }
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **common_loader_options)
    validation_dataloader = DataLoader(
        dataset=val_dataset, shuffle=False, **common_loader_options
    )
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **common_loader_options)

    logger.info(
        f"Training DataLoader has ({len(train_dataloader)}) batches, Test DataLoader has ({len(test_dataloader)}) batches, Validation DataLoader has ({len(validation_dataloader)}) batches"
    )

    logger.info("==================================================================")
    for name, dataloader in [
        ("Train", train_dataloader),
        ("Validation", validation_dataloader),
        ("Test", test_dataloader),
    ]:
        # A loader can be empty (e.g. drop_last=True with fewer rows than batch_size);
        # report that instead of letting StopIteration escape from the pipeline.
        first_batch = next(iter(dataloader), None)
        if first_batch is None:
            logger.warning(f"{name} Dataloader produced no batches; skipping batch preview")
            continue
        features, labels = first_batch

        logger.info(f"{name} Dataloader Batch Information")
        logger.info(f"Features Shape: '{features.shape}' |  DataTypes: '{features.dtype}'")
        logger.info(f"Labels Shape: '{labels.shape}'   |  DataTypes: '{labels.dtype}' ")
        logger.info("==================================================================")

    return (
        train_dataset,
        test_dataset,
        val_dataset,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        feature_scaler,
        label_scaler,
    )

# Testing the Data Pipeline

## Testing with a given URL

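- Edit the `data` dictionary inside `test_data_pipeline` (dataset URL, directories, target column, and any extra columns to drop) to match your dataset.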

In [70]:
# USED WHEN TESTING THE RAW DATASET
def test_data_pipeline() -> tuple[
    Dataset,
    Dataset,
    Dataset,
    DataLoader,
    DataLoader,
    DataLoader,
    MinMaxScaler,
    Optional[MinMaxScaler],
]:
    """Run ``data_pipeline`` with the settings below and sanity-check the returned objects.

    Returns:
        The eight objects returned by ``data_pipeline`` in the same order: the train,
        test, and validation datasets, the train, test, and validation dataloaders, the
        feature scaler, and the label scaler.

    Raises:
        Exception: Any exception raised by ``data_pipeline`` is logged with its traceback
            and re-raised unchanged.
        AssertionError: If a returned object does not have the expected type.
    """
    # Function input setup
    data = {
        "dataset_url": "hf://datasets/MaxPrestige/Synthetic-Diabetes-Dataset/Data/Synthetic-Diabetes-Dataset.csv",
        "root_data_dir": "../Data",
        "data_file_path": DATA_CLEAN_FILE_NAME,
        "data_splits_dir": "DataSplits",
        "scaler_dir": "Scalers",
        "target_column": "diagnosed_diabetes",
        "extra_dropped_columns": [
            # REPLACE WITH ANY COLUMNS TO BE EXCLUDED FROM THE DATASET - COMMA SEPARATED
        ],
    }
    batch_size = 64
    num_workers = 0
    pin_memory = False
    drop_last = True

    logger = logging.getLogger(__name__)

    # Call the data pipeline function
    try:
        (
            train_dataset,
            test_dataset,
            val_dataset,
            train_dataloader,
            test_dataloader,
            validation_dataloader,
            feature_scaler,
            label_scaler,
        ) = data_pipeline(
            logger,
            **data,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=pin_memory,
            drop_last=drop_last,
        )
    except Exception as e:
        # Re-raise after logging: continuing would only fail on the assertions below with
        # an unrelated "variable not assigned" error that hides the real cause.
        logger.exception(f"Caught Exception: {e}")
        raise

    # Basic assertions to verify the outputs
    assert isinstance(train_dataset, Dataset), "train_dataset is not an instance of Dataset"
    assert isinstance(test_dataset, Dataset), "test_dataset is not an instance of Dataset"
    assert isinstance(val_dataset, Dataset), "val_dataset is not an instance of Dataset"
    assert isinstance(
        train_dataloader, DataLoader
    ), "train_dataloader is not an instance of DataLoader"
    assert isinstance(
        test_dataloader, DataLoader
    ), "test_dataloader is not an instance of DataLoader"
    assert isinstance(
        validation_dataloader, DataLoader
    ), "validation_dataloader is not an instance of DataLoader"
    assert isinstance(
        feature_scaler, MinMaxScaler
    ), "feature_scaler is not an instance of MinMaxScaler"
    # The label scaler is None unless the pipeline is run with use_label_scaler=True
    assert label_scaler is None or isinstance(
        label_scaler, MinMaxScaler
    ), "label_scaler is neither None nor an instance of MinMaxScaler"

    logger.info("All assertions passed. Data pipeline test successful.")

    return (
        train_dataset,
        test_dataset,
        val_dataset,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        feature_scaler,
        label_scaler,
    )

### Call the 'test_data_pipeline' function and capture the return variables

In [71]:
# Run the pipeline test once and keep its eight return values as notebook-level
# variables so the cells below can inspect the dataloaders.
(
    train_dataset,
    test_dataset,
    val_dataset,
    train_dataloader,
    test_dataloader,
    validation_dataloader,
    feature_scaler,
    label_scaler,
) = test_data_pipeline()

INFO:__main__:Downloading CSV file from 'hf://datasets/MaxPrestige/Synthetic-Diabetes-Dataset/Data/Synthetic-Diabetes-Dataset.csv'
and saving into '..\Data'
  from .autonotebook import tqdm as notebook_tqdm
INFO:__main__:Initial DataFrame shape: (100000, 34)
INFO:__main__:Initial DataFrame info:
 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   abdominal_obesity                   100000 non-null  int64  
 1   activity_level                      100000 non-null  object 
 2   age                                 100000 non-null  int64  
 3   alcohol_consumption_per_week        100000 non-null  int64  
 4   alcohol_group                       100000 non-null  object 
 5   bmi                                 100000 non-null  float64
 6   bmi_group                           100000 non-null  object

### Verify the length of the dataloader(s)

In [72]:
# Number of batches the validation dataloader yields per pass
len(validation_dataloader)

78

### Inspect one batch from each dataloader

In [73]:
# Draw one batch from every dataloader and log its shapes, dtypes, and labels.
logger.info("==================================================================")
for name, dataloader in zip(
    ("Train", "Validation", "Test"),
    (train_dataloader, validation_dataloader, test_dataloader),
):
    # A fresh iterator is created per loader; only its first batch is consumed
    features, labels = next(iter(dataloader))

    logger.info(f"{name} Dataloader Batch Information")
    logger.info(f"Features Shape: '{features.shape}' |  DataTypes: '{features.dtype}'")
    logger.info(f"Labels Shape: '{labels.shape}'   |  DataTypes: '{labels.dtype}' ")
    # Optional: dump the raw label values of the batch
    logger.info(f"The labels: {labels}")
    logger.info("==================================================================")

INFO:__main__:Train Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 33])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0])
INFO:__main__:Validation Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 33])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0])
INFO:__main__:Test Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size

# End