In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/analyzed_data.csv')

In [3]:
df.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,light_h9,light_w1,light_w2,light_w3,light_w4,light_w5,light_w6,light_w7,age_internet,avg_fitness_score
0,00115b9f,Winter,9.0,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,13.703326,15.487296,79.80259,30.175367,25.983143,33.926468,11.132876,87.537865,0.0,0.828571
1,001f3379,Spring,13.0,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,57.52293,9.958206,46.1462,14.388186,49.01268,39.446934,20.296366,5.850941,0.0,0.857143
2,00f332d1,Winter,14.0,0,Spring,68.0,Spring,17.168636,66.5,108.0,...,108.58199,40.313942,45.987415,50.706913,41.632717,40.377525,228.70036,17.050716,28.0,0.714286
3,01085eb3,Fall,12.0,0,Winter,58.0,Fall,34.187282,60.5,178.0,...,25.183878,22.056694,21.37933,15.844161,17.851442,12.499741,12.259155,26.816135,0.0,0.714286
4,012cadd8,Spring,9.0,0,Winter,60.0,Fall,17.089151,53.2,68.8,...,34.599117,14.140425,48.232033,5.0,74.60526,114.41666,77.441864,21.184536,0.0,1.028571


In [4]:
df['sii_2']

0      1.0
1      1.0
2      1.0
3      0.0
4      0.0
      ... 
972    0.0
973    0.0
974    0.0
975    1.0
976    0.0
Name: sii_2, Length: 977, dtype: float64

In [5]:
numerical_features = [col for col in df.columns if df[col].dtype in ['float64', 'int64'] and col not in ['id', '', 'sii', 'sii_2', 'age_group']]
categorical_features = ['Basic_Demos-Sex', 'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'age_group']

In [6]:
numerical_features, 

(['Basic_Demos-Age',
  'Basic_Demos-Sex',
  'CGAS-CGAS_Score',
  'Physical-BMI',
  'Physical-Height',
  'Physical-Weight',
  'Physical-Diastolic_BP',
  'Physical-HeartRate',
  'Physical-Systolic_BP',
  'Fitness_Endurance-Max_Stage',
  'Fitness_Endurance-Time_Mins',
  'Fitness_Endurance-Time_Sec',
  'FGC-FGC_CU',
  'FGC-FGC_CU_Zone',
  'FGC-FGC_GSND',
  'FGC-FGC_GSND_Zone',
  'FGC-FGC_GSD',
  'FGC-FGC_GSD_Zone',
  'FGC-FGC_PU',
  'FGC-FGC_PU_Zone',
  'FGC-FGC_SRL',
  'FGC-FGC_SRL_Zone',
  'FGC-FGC_SRR',
  'FGC-FGC_SRR_Zone',
  'FGC-FGC_TL',
  'FGC-FGC_TL_Zone',
  'BIA-BIA_Activity_Level_num',
  'BIA-BIA_BMC',
  'BIA-BIA_BMI',
  'BIA-BIA_BMR',
  'BIA-BIA_DEE',
  'BIA-BIA_ECW',
  'BIA-BIA_FFM',
  'BIA-BIA_FFMI',
  'BIA-BIA_FMI',
  'BIA-BIA_Fat',
  'BIA-BIA_Frame_num',
  'BIA-BIA_ICW',
  'BIA-BIA_LDM',
  'BIA-BIA_LST',
  'BIA-BIA_SMM',
  'BIA-BIA_TBW',
  'PAQ_C-PAQ_C_Total',
  'PCIAT-PCIAT_01',
  'PCIAT-PCIAT_02',
  'PCIAT-PCIAT_03',
  'PCIAT-PCIAT_04',
  'PCIAT-PCIAT_05',
  'PCIAT-PCIAT_0

In [7]:
X = df.drop(['id', 'sii', 'sii_2'], axis=1)
sii_y = df['sii']
sii_2_y = df['sii_2']

X.shape, sii_y.shape, sii_2_y.shape

((977, 142), (977,), (977,))

In [8]:
y = sii_2_y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
def convert_to_dataset(X, y):
    # Convert features to text format
    texts = []
    for _, row in X.iterrows():
        text = " ".join([f"{col}: {val}" for col, val in row.items()])
        texts.append(text)
    
    # Create dictionary for Dataset
    dataset_dict = {
        "text": texts,
        "label": y.tolist()
    }
    
    return Dataset.from_dict(dataset_dict)

In [21]:
train_dataset = convert_to_dataset(X_train, y_train.astype(int))
test_dataset = convert_to_dataset(X_test, y_test.astype(int))

In [22]:
train_dataset[0]

{'text': 'Basic_Demos-Enroll_Season: Winter Basic_Demos-Age: 9.0 Basic_Demos-Sex: 0 CGAS-Season: Spring CGAS-CGAS_Score: 65.0 Physical-Season: Winter Physical-BMI: 16.94732143 Physical-Height: 56.0 Physical-Weight: 75.6 Physical-Diastolic_BP: 63.0 Physical-HeartRate: 78.0 Physical-Systolic_BP: 106.0 Fitness_Endurance-Season: Winter Fitness_Endurance-Max_Stage: 2.0 Fitness_Endurance-Time_Mins: 2.0 Fitness_Endurance-Time_Sec: 9.0 FGC-Season: Winter FGC-FGC_CU: 0.0 FGC-FGC_CU_Zone: 0.0 FGC-FGC_GSND: 18.62 FGC-FGC_GSND_Zone: 1.6 FGC-FGC_GSD: 19.66 FGC-FGC_GSD_Zone: 1.8 FGC-FGC_PU: 5.0 FGC-FGC_PU_Zone: 0.0 FGC-FGC_SRL: 6.0 FGC-FGC_SRL_Zone: 0.0 FGC-FGC_SRR: 4.5 FGC-FGC_SRR_Zone: 0.0 FGC-FGC_TL: 4.5 FGC-FGC_TL_Zone: 0.0 BIA-Season: Spring BIA-BIA_Activity_Level_num: 3.0 BIA-BIA_BMC: 8.16945 BIA-BIA_BMI: 16.9492 BIA-BIA_BMR: 3115.72 BIA-BIA_DEE: 5296.72 BIA-BIA_ECW: 86.4271 BIA-BIA_FFM: 274.128 BIA-BIA_FFMI: 61.4583 BIA-BIA_FMI: -44.5091 BIA-BIA_Fat: -198.528 BIA-BIA_Frame_num: 1.0 BIA-BIA_IC

In [23]:
model_name = "distilbert-base-uncased"  # You can change to a different model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(y.unique()))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [25]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 781/781 [00:00<00:00, 1562.33 examples/s]
Map: 100%|██████████| 196/196 [00:00<00:00, 1539.54 examples/s]


In [26]:
tokenized_train[0]

{'text': 'Basic_Demos-Enroll_Season: Winter Basic_Demos-Age: 9.0 Basic_Demos-Sex: 0 CGAS-Season: Spring CGAS-CGAS_Score: 65.0 Physical-Season: Winter Physical-BMI: 16.94732143 Physical-Height: 56.0 Physical-Weight: 75.6 Physical-Diastolic_BP: 63.0 Physical-HeartRate: 78.0 Physical-Systolic_BP: 106.0 Fitness_Endurance-Season: Winter Fitness_Endurance-Max_Stage: 2.0 Fitness_Endurance-Time_Mins: 2.0 Fitness_Endurance-Time_Sec: 9.0 FGC-Season: Winter FGC-FGC_CU: 0.0 FGC-FGC_CU_Zone: 0.0 FGC-FGC_GSND: 18.62 FGC-FGC_GSND_Zone: 1.6 FGC-FGC_GSD: 19.66 FGC-FGC_GSD_Zone: 1.8 FGC-FGC_PU: 5.0 FGC-FGC_PU_Zone: 0.0 FGC-FGC_SRL: 6.0 FGC-FGC_SRL_Zone: 0.0 FGC-FGC_SRR: 4.5 FGC-FGC_SRR_Zone: 0.0 FGC-FGC_TL: 4.5 FGC-FGC_TL_Zone: 0.0 BIA-Season: Spring BIA-BIA_Activity_Level_num: 3.0 BIA-BIA_BMC: 8.16945 BIA-BIA_BMI: 16.9492 BIA-BIA_BMR: 3115.72 BIA-BIA_DEE: 5296.72 BIA-BIA_ECW: 86.4271 BIA-BIA_FFM: 274.128 BIA-BIA_FFMI: 61.4583 BIA-BIA_FMI: -44.5091 BIA-BIA_Fat: -198.528 BIA-BIA_Frame_num: 1.0 BIA-BIA_IC

In [27]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.68187
2,No log,0.679925
3,No log,0.680052


TrainOutput(global_step=294, training_loss=0.6828169530751754, metrics={'train_runtime': 2403.8992, 'train_samples_per_second': 0.975, 'train_steps_per_second': 0.122, 'total_flos': 310371115051008.0, 'train_loss': 0.6828169530751754, 'epoch': 3.0})