In [None]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import seaborn as sns 
import numpy as np 
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction
#imports all of the libraries needed 

In [31]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"PyTorch is using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("PyTorch is using CPU")
#sanity check on what my computer is using. 

PyTorch is using GPU: NVIDIA GeForce GTX 1060 6GB


## Reading Data

In [6]:
train_file_Path = 'drugLibTrain_raw.tsv'
test_file_Path = 'drugLibTest_raw.tsv'
drug_train_df = pd.read_csv(train_file_Path, sep = '\t')
drug_test_df = pd.read_csv(test_file_Path,sep = '\t')

In [33]:
drug_train_df['benefitsReview'] = drug_train_df['benefitsReview'].fillna('')
drug_test_df['benefitsReview'] = drug_test_df['benefitsReview'].fillna('')

In [None]:
#ratings from the data is from 1-10, i'm just changing them into sentiments of positive(2) negative(0) and neutral(1)
def turn_to_sentiment(ratings):
    if ratings >= 8:
        return 2
    elif ratings <= 3:
        return 0
    else:
        return 1

In [34]:
drug_train_df['sentiment_label'] = drug_train_df['rating'].apply(turn_to_sentiment)
drug_test_df['sentiment_label'] = drug_test_df['rating'].apply(turn_to_sentiment)

## Paths for things

In [10]:
model_Path = 'dmis-lab/biobert-v1.1' 
finetune_output = "./sentiment_finetuning_cv"
final_model_output = "./final_sentiment_model"
label_column = 'sentiment'
num_unique_Labels = 3
labels = ['negative','neutral','positive']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
# update the reviews (side effect or benefits)
type_review = 'benefitsReview'

## Functions for modeling and tokenizing

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_Path)
def get_model(): 
    model= AutoModelForSequenceClassification.from_pretrained(
        model_Path,
        num_labels = num_unique_Labels,
        id2label = id2label,
        label2id =label2id
    )
    return model.to(device)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize(review):
    list_form = review.tolist() #turns my column into a list form
    return tokenizer(
    list_form,
    max_length=256, #this value can be changed. I'm only using half of 512 so my simulation doesnt take longer
    truncation=True,             
    padding="max_length",        
    return_tensors="pt"         
)

### Training Arguments Setup

In [None]:
#these values can be adjusted if there is time to run more CV loops for better understanding such as more epochs changing Learning rate etc
LR = 2e-5
epoch_Numb = 2
batch_size = 16

training_Args = TrainingArguments(
output_dir=finetune_output,
num_train_epochs=epoch_Numb,
learning_rate=LR,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_steps=100,
weight_decay=0.01,
logging_steps=50,
eval_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="f1_weighted",
save_total_limit=1,
)

### Metrics Function

In [None]:
def compute_classification_metrics(eval_pred: EvalPrediction):
    predictions, label_ids = eval_pred
    predicted_ids = np.argmax(predictions, axis=-1)
    accuracy = accuracy_score(y_true=label_ids, y_pred=predicted_ids)
    precision, recall, f1, _ = precision_recall_fscore_support( #relying on sklearn for calculating the metrics
        y_true=label_ids, y_pred=predicted_ids, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'f1_weighted': f1,
        'precision_weighted': precision,
        'recall_weighted': recall
    }

##### just testing the above function

##### HF Dataset

In [None]:
tokenized_Benefits = tokenize(drug_train_df['benefitsReview'])

### HF Training Benefits Dataset

In [29]:
#these will be used after the CV
drug_train_df_sentiment_Label = drug_train_df['sentiment_label']
review_train_Dict = {"input_ids" : tokenized_Benefits['input_ids'], "token_type_ids": tokenized_Benefits['token_type_ids'],"attention_mask": tokenized_Benefits['attention_mask'], "labels": torch.tensor(drug_train_df_sentiment_Label)}
review_train_dataset = Dataset.from_dict(review_train_Dict)

## Cross-Validation

In [17]:
N_SPLITS = 5 
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
all_fold_metrics = []
y_split_labels = drug_train_df_sentiment_Label

In [18]:
drug_train_benefits_df = drug_train_df[['benefitsReview','sentiment_label']].copy()

In [32]:
for fold_num, (train_idx, val_idx) in enumerate(skf.split(drug_train_benefits_df['benefitsReview'], y_split_labels)):
    print(f"\n===== Starting Fold {fold_num + 1}/{N_SPLITS} =====")
    df_train_fold = drug_train_benefits_df.iloc[train_idx]
    df_val_fold = drug_train_benefits_df.iloc[val_idx]
    #tokenize the reviews
    print(f"Tokenizing training data for fold {fold_num + 1}...")
    tokenized_train_inputs = tokenize(df_train_fold['benefitsReview'])
    print(f"Tokenizing validation data for fold {fold_num + 1}...")
    tokenized_val_inputs = tokenize(df_val_fold['benefitsReview'])

    #labels
    train_labels_fold = torch.tensor(df_train_fold['sentiment_label'].tolist())
    val_labels_fold = torch.tensor(df_val_fold['sentiment_label'].tolist())

    #turning my tokenized data into hugging face datasets
    train_data_dict = {
        'input_ids': tokenized_train_inputs['input_ids'],
        'attention_mask': tokenized_train_inputs['attention_mask'],
        'token_type_ids': tokenized_train_inputs['token_type_ids'],
        'labels': train_labels_fold
    }

    train_dataset_fold = Dataset.from_dict(train_data_dict)
    
    #turning my tokenized data into hugging face datasets
    val_data_dict = {
        'input_ids': tokenized_val_inputs['input_ids'],
        'attention_mask': tokenized_val_inputs['attention_mask'],
        'token_type_ids': tokenized_val_inputs['token_type_ids'],
        'labels': val_labels_fold
    }
    
    eval_dataset_fold = Dataset.from_dict(val_data_dict)

    print(f"Fold {fold_num + 1}: Train dataset size: {len(train_dataset_fold)}, Eval dataset size: {len(eval_dataset_fold)}")

    model_fold = get_model()

    fold_output_dir = f"{finetune_output}/fold_{fold_num + 1}"

    trainer_fold = Trainer(
        model=model_fold,
        args=training_Args, # Use the args specified from the training arguments cell
        train_dataset=train_dataset_fold,
        eval_dataset=eval_dataset_fold,
        compute_metrics=compute_classification_metrics,
        tokenizer=tokenizer
    )
    print(f"Training fold {fold_num + 1}...") 
    trainer_fold.train() 

    print(f"Evaluating fold {fold_num + 1}...") 
    metrics = trainer_fold.evaluate() 

    all_fold_metrics.append(metrics) 
    print(f"Metrics for Fold {fold_num + 1}: {metrics}") 

    print("\n===== Cross-Validation Results Summary =====")

if all_fold_metrics: 
    f1_weighted_key = 'eval_f1_weighted'
    f1_macro_key = 'eval_f1_macro' 
    accuracy_key = 'eval_accuracy'
    precision_weighted_key = 'eval_precision_weighted'
    recall_weighted_key = 'eval_recall_weighted'

    # Check if the primary key for F1 exists, to avoid errors if a fold failed or metrics changed
    if not all_fold_metrics[0] or f1_weighted_key not in all_fold_metrics[0]:
        print(f"Warning: Key '{f1_weighted_key}' not found in the first fold's metrics.")
        print(f"Available keys in first fold: {all_fold_metrics[0].keys() if all_fold_metrics[0] else 'N/A'}")
        #fail safe
        potential_f1_keys = [k for k in (all_fold_metrics[0] or {}).keys() if 'f1_weighted' in k]
        if potential_f1_keys:
            f1_weighted_key = potential_f1_keys[0]
            print(f"Using alternative key for F1 weighted: '{f1_weighted_key}'")
        else:
            f1_weighted_key = None # Cannot calculate average F1 weighted

    if f1_weighted_key:
        avg_f1_weighted = np.mean([m.get(f1_weighted_key, 0) for m in all_fold_metrics])
        print(f"Average {f1_weighted_key} across {N_SPLITS} folds: {avg_f1_weighted:.4f}")


    avg_accuracy = np.mean([m.get(accuracy_key, 0) for m in all_fold_metrics])
    print(f"Average {accuracy_key} across {N_SPLITS} folds: {avg_accuracy:.4f}")

    avg_precision_weighted = np.mean([m.get(precision_weighted_key, 0) for m in all_fold_metrics])
    print(f"Average {precision_weighted_key} across {N_SPLITS} folds: {avg_precision_weighted:.4f}")

    avg_recall_weighted = np.mean([m.get(recall_weighted_key, 0) for m in all_fold_metrics])
    print(f"Average {recall_weighted_key} across {N_SPLITS} folds: {avg_recall_weighted:.4f}")

    print("\n--- Individual Fold Metrics ---")
    for i, metrics_dict in enumerate(all_fold_metrics):
        print(f"Fold {i + 1}/{N_SPLITS}:")
        for metric_name, metric_value in metrics_dict.items():
            # Only print metrics that are simple numerical values for cleaner output
            if isinstance(metric_value, (int, float)):
                print(f"  {metric_name}: {metric_value:.4f}")
            else:
                print(f"  {metric_name}: {metric_value}") # e.g. runtime, samples_per_second
else:
    print("No fold metrics were collected. Ensure your cross-validation loop ran and appended results to 'all_fold_metrics'.")

     


===== Starting Fold 1/5 =====
Tokenizing training data for fold 1...
Tokenizing validation data for fold 1...


KeyboardInterrupt: 

## Training the Model on full training data

In [30]:
final_model = get_model().to(device)
final_model_epochs = 5 

final_training_Args = TrainingArguments(
output_dir=final_model_output,
num_train_epochs=final_model_epochs,
learning_rate=LR,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_steps=100,
weight_decay=0.01,
logging_steps=50,
eval_strategy= "no",
load_best_model_at_end=False,
save_total_limit=1,
)

final_trainer = Trainer(
    model = final_model,
    args = final_training_Args,
    train_dataset = review_train_dataset,
    tokenizer = tokenizer
)

final_trainer.train()
final_trainer.save_model()
print(f"final model saved to {final_model_output}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  final_trainer = Trainer(


Step,Training Loss
50,1.0503
100,0.9181
150,0.8757
200,0.8039
250,0.7378
300,0.7213
350,0.7574
400,0.5955
450,0.5552
500,0.5681


final model saved to ./final_sentiment_model


## Using the Final Model on The Test Set

#### Tokenizing the test data and turning it into a HF dataset

In [None]:
drug_test_df_sentiment_Label = drug_test_df['sentiment_label']
tokenized_test_Benefits = tokenize(drug_test_df['benefitsReview'])
review_test_Dict = {"input_ids" : tokenized_test_Benefits['input_ids'], "token_type_ids": tokenized_test_Benefits['token_type_ids'],"attention_mask": tokenized_test_Benefits['attention_mask'], "labels": torch.tensor(drug_test_df_sentiment_Label)}
review_test_dataset = Dataset.from_dict(review_train_Dict)


In [None]:

print("\n--- Phase 4: Final Evaluation on Test Set & Error Analysis ---")

testing_the_final_model = AutoModelForSequenceClassification.from_pretrained(final_model_output).to(device)
tokenizer = AutoTokenizer.from_pretrained(final_model_output)



eval_output_dir_test = f"{final_model_output}/test_evaluation_output" 
test_eval_args = TrainingArguments(
    output_dir=eval_output_dir_test,
    per_device_eval_batch_size=batch_size,
)



eval_trainer = Trainer(
    model=testing_the_final_model, 
    args=test_eval_args,
    compute_metrics=compute_classification_metrics, 
    tokenizer=tokenizer 
)

print("Running evaluation on the test set (review_test_dataset)...")

prediction_output_test = eval_trainer.predict(review_test_dataset) 

logits_test = prediction_output_test.predictions
true_ids_test = prediction_output_test.label_ids
predicted_ids_test = np.argmax(logits_test, axis=-1)

# Calculate overall metrics using the results from predict()
eval_pred_obj_test = EvalPrediction(predictions=logits_test, label_ids=true_ids_test)
final_test_set_metrics = compute_classification_metrics(eval_pred_obj_test)

print(f"\n--- Final Test Set Performance ---")
for key, value in final_test_set_metrics.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.4f}")
    else:
        print(f"  {key}: {value}")




--- Phase 4: Final Evaluation on Test Set & Error Analysis ---


  eval_trainer = Trainer(


Running evaluation on the test set (review_test_dataset)...



--- Final Test Set Performance ---
  accuracy: 0.9411
  f1_weighted: 0.9405
  precision_weighted: 0.9406
  recall_weighted: 0.9411
