In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from torch import nn
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import pytorch_optimizer as optim1


from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl
from torchmetrics.classification import F1Score, MulticlassCohenKappa, Accuracy

In [69]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "synthetic_coffee_health_10000.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "uom190346a/global-coffee-health-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)
df.set_index('ID', inplace=True)
df.drop(columns=['Health_Issues'], inplace=True) 
print("First 5 records:", df.head())

First 5 records:     Age  Gender  Country  Coffee_Intake  Caffeine_mg  Sleep_Hours  \
ID                                                                  
1    40    Male  Germany            3.5        328.1          7.5   
2    33    Male  Germany            1.0         94.1          6.2   
3    42    Male   Brazil            5.3        503.7          5.9   
4    53    Male  Germany            2.6        249.2          7.3   
5    32  Female    Spain            3.1        298.0          5.3   

   Sleep_Quality   BMI  Heart_Rate Stress_Level  Physical_Activity_Hours  \
ID                                                                         
1           Good  24.9          78          Low                     14.5   
2           Good  20.0          67          Low                     11.0   
3           Fair  22.7          59       Medium                     11.2   
4           Good  24.7          71          Low                      6.6   
5           Fair  24.1          76       Me

In [70]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 1 to 10000
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      10000 non-null  int64  
 1   Gender                   10000 non-null  object 
 2   Country                  10000 non-null  object 
 3   Coffee_Intake            10000 non-null  float64
 4   Caffeine_mg              10000 non-null  float64
 5   Sleep_Hours              10000 non-null  float64
 6   Sleep_Quality            10000 non-null  object 
 7   BMI                      10000 non-null  float64
 8   Heart_Rate               10000 non-null  int64  
 9   Stress_Level             10000 non-null  object 
 10  Physical_Activity_Hours  10000 non-null  float64
 11  Occupation               10000 non-null  object 
 12  Smoking                  10000 non-null  int64  
 13  Alcohol_Consumption      10000 non-null  int64  
dtypes: float64(5), int64(4), ob

Unnamed: 0_level_0,Age,Gender,Country,Coffee_Intake,Caffeine_mg,Sleep_Hours,Sleep_Quality,BMI,Heart_Rate,Stress_Level,Physical_Activity_Hours,Occupation,Smoking,Alcohol_Consumption
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,40,Male,Germany,3.5,328.1,7.5,Good,24.9,78,Low,14.5,Other,0,0
2,33,Male,Germany,1.0,94.1,6.2,Good,20.0,67,Low,11.0,Service,0,0
3,42,Male,Brazil,5.3,503.7,5.9,Fair,22.7,59,Medium,11.2,Office,0,0
4,53,Male,Germany,2.6,249.2,7.3,Good,24.7,71,Low,6.6,Other,0,0
5,32,Female,Spain,3.1,298.0,5.3,Fair,24.1,76,Medium,8.5,Student,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,50,Female,Japan,2.1,199.8,6.0,Fair,30.5,50,Medium,10.1,Healthcare,0,1
9997,18,Female,UK,3.4,319.2,5.8,Fair,19.1,71,Medium,11.6,Service,0,0
9998,26,Male,China,1.6,153.4,7.1,Good,25.1,66,Low,13.7,Student,1,1
9999,40,Female,Finland,3.4,327.1,7.0,Good,19.3,80,Low,0.1,Student,0,0


In [71]:
df.Stress_Level.value_counts()

Stress_Level
Low       6989
Medium    2050
High       961
Name: count, dtype: int64

In [72]:
X = df.drop(columns=['Stress_Level'], axis=1)
y = df['Stress_Level']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train

Unnamed: 0_level_0,Age,Gender,Country,Coffee_Intake,Caffeine_mg,Sleep_Hours,Sleep_Quality,BMI,Heart_Rate,Physical_Activity_Hours,Occupation,Smoking,Alcohol_Consumption
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6190,50,Female,USA,2.2,205.2,6.0,Fair,21.5,55,10.7,Service,0,0
5050,32,Male,Italy,0.0,0.0,9.4,Excellent,16.4,81,7.0,Other,0,0
2574,34,Female,Brazil,1.4,134.5,6.8,Good,20.1,60,4.7,Student,0,0
1052,39,Male,Sweden,1.5,144.3,8.5,Excellent,17.2,66,1.6,Student,1,1
4400,46,Female,Switzerland,3.2,308.5,5.7,Fair,25.4,76,2.8,Student,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8633,53,Female,India,1.7,157.0,6.0,Fair,28.2,81,10.3,Service,0,1
5394,36,Female,Australia,2.5,233.6,5.1,Fair,28.1,53,11.2,Healthcare,0,0
4971,53,Male,Belgium,4.0,376.2,5.8,Fair,21.5,68,11.8,Other,0,0
5561,22,Female,Netherlands,2.5,233.6,6.5,Good,31.4,65,14.5,Service,0,1


In [73]:
num = X.select_dtypes(include=['int64', 'float64']).columns
cat = X.select_dtypes(include=['object', 'category']).columns
num, cat

(Index(['Age', 'Coffee_Intake', 'Caffeine_mg', 'Sleep_Hours', 'BMI',
        'Heart_Rate', 'Physical_Activity_Hours', 'Smoking',
        'Alcohol_Consumption'],
       dtype='object'),
 Index(['Gender', 'Country', 'Sleep_Quality', 'Occupation'], dtype='object'))

In [74]:
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # ('poly', PolynomialFeatures(degree=1, include_bias=False)),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, X_train.select_dtypes(include=['int64', 'float64']).columns),
    ('cat', cat_pipe, X_train.select_dtypes(include=['object', 'category']).columns)
])

In [75]:
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)


In [76]:
map = lambda s: {'Low': 0, 'Medium': 1, 'High': 2}[s]
y_train = y_train.map(map).values
y_val = y_val.map(map).values
y_train, y_val

(array([1, 0, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 2, 0, ..., 0, 2, 1], dtype=int64))

# Dataset and Dataloader

In [77]:
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

In [78]:
train_set = TensorDataset(X_train, y_train)
val_set = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32)

In [79]:
feature, target = next(iter(train_loader))
print(feature.shape, target.shape)

torch.Size([32, 41]) torch.Size([32])


# NN

In [80]:
class NN(nn.Module):
    def __init__(self, input_size) -> None:
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.Mish(),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.Mish(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.Mish(),
            nn.Dropout(0.2),

            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.Mish(),
            nn.Dropout(0.2),

            nn.Linear(16, 8),
            nn.BatchNorm1d(8),
            nn.Mish(),
            nn.Dropout(0.2),

            nn.Linear(8, 3)
        )

    def forward(self, X):
        return self.fc(X)

In [81]:
class PLNN(LightningModule):
    def __init__(self, input_size, num_classes=4, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters() 
        self.model = NN(input_size=input_size)
        self.criterion = nn.CrossEntropyLoss()
        self.kappa = MulticlassCohenKappa(num_classes=num_classes, weights='quadratic')
        self.f1 = F1Score(task='multiclass', num_classes=num_classes, average='weighted')
        self.accurasy = Accuracy(task='multiclass', num_classes=num_classes)
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs) 
        loss = self.criterion(outputs, labels)
        acc = self.accurasy(outputs, labels)
        f1 = self.accurasy(outputs, labels)
        kappa = self.accurasy(outputs, labels)

        
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        self.log('train_f1', f1, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_epoch=True, prog_bar=True)
        self.log('train_kappa', kappa, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        acc = self.accurasy(outputs, labels)
        f1 = self.accurasy(outputs, labels)
        kappa = self.accurasy(outputs, labels)


        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_f1', f1, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        self.log('val_kappa', kappa, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim1.AdamW(self.parameters(), lr=self.hparams.learning_rate)
        return optimizer

# Config

In [82]:
if torch.cuda.is_available():
    accelerator_type = 'gpu'
    devices_to_use = 1
else:
    accelerator_type = 'cpu'
    devices_to_use = 'auto'

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints_Strees_level/',
    filename='bodyP-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min',
)
lr_monitor_callback = LearningRateMonitor(logging_interval='epoch')

trainer1 = pl.Trainer(
    max_epochs=300,
    callbacks=[checkpoint_callback, early_stopping, lr_monitor_callback],
    logger=TensorBoardLogger("tb_logs_Stress_level", name="simple_model_experiment"),
    accelerator=accelerator_type,
    devices=devices_to_use,
    log_every_n_steps=10,
    deterministic=True
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


# train

In [83]:
model = PLNN(input_size=X_train.shape[1], num_classes=3, learning_rate=1e-3)
trainer1.fit(model, train_loader, val_loader)


  | Name      | Type                 | Params | Mode 
-----------------------------------------------------------
0 | model     | NN                   | 16.6 K | train
1 | criterion | CrossEntropyLoss     | 0      | train
2 | kappa     | MulticlassCohenKappa | 0      | train
3 | f1        | MulticlassF1Score    | 0      | train
4 | accurasy  | MulticlassAccuracy   | 0      | train
-----------------------------------------------------------
16.6 K    Trainable params
0         Non-trainable params
16.6 K    Total params
0.067     Total estimated model params size (MB)
25        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

# eval

In [84]:
trainer1.validate(model, val_loader, ckpt_path='best')

Restoring states from the checkpoint path at C:\Users\dsapu\Project\Python\repo\dlp\globalcaffesyntetic\checkpoints_Strees_level\bodyP-epoch=53-val_loss=0.00.ckpt
Loaded model weights from the checkpoint at C:\Users\dsapu\Project\Python\repo\dlp\globalcaffesyntetic\checkpoints_Strees_level\bodyP-epoch=53-val_loss=0.00.ckpt


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 6.616094765377056e-07,
  'val_f1': 1.0,
  'val_acc': 1.0,
  'val_kappa': 1.0}]