In [9]:
import copy

import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction

## Reading Data

In [10]:
# Locations of the raw train / test review files (tab-separated).
train_file_Path = 'drugLibTrain_raw.tsv'
test_file_Path = 'drugLibTest_raw.tsv'

# Read both splits with identical parser settings.
drug_train_df, drug_test_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (train_file_Path, test_file_Path)
)

In [11]:
# Missing reviews come back from pandas as NaN (a float), which is not valid
# text input for the tokenizer. Replace them with empty strings in BOTH splits
# so train and test are preprocessed the same way.
drug_train_df['benefitsReview'] = drug_train_df['benefitsReview'].fillna('')
drug_test_df['benefitsReview'] = drug_test_df['benefitsReview'].fillna('')

In [12]:
def turn_to_sentiment(ratings):
    """Map a numeric rating to a 3-class sentiment id.

    Returns 2 for ratings of 8 or more, 0 for ratings of 3 or less, and 1
    for everything else (including values such as NaN, for which both
    comparisons are false).
    """
    return 2 if ratings >= 8 else (0 if ratings <= 3 else 1)

In [13]:
# Derive the integer sentiment target from the rating, identically for both splits.
for _split_df in (drug_train_df, drug_test_df):
    _split_df['sentiment_label'] = _split_df['rating'].apply(turn_to_sentiment)

## Model paths and label configuration

In [14]:
# Pretrained checkpoint that is fine-tuned for sentiment classification.
model_Path = 'dmis-lab/biobert-v1.1'
# Output locations for cross-validation checkpoints and the final model.
finetune_output = "./sentiment_finetuning_cv"
final_model_output = "./final_sentiment_model"
# Name of the integer target column derived from the ratings
# (the notebook creates it as 'sentiment_label', not 'sentiment').
label_column = 'sentiment_label'
# Class names; list position is the class id (0, 1, 2).
labels = ['negative','neutral','positive']
# Derived from `labels` so the count can never drift from the class list.
num_unique_Labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in id2label.items()}
# update the reviews (side effect or benefits)
type_review = 'benefitsReview'

## Functions for modeling and tokenizing

In [15]:
# Single shared tokenizer, loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_Path)


def get_model():
    """Load a fresh sequence-classification model from ``model_Path``.

    Every call goes through ``from_pretrained`` again, so each caller
    (e.g. each cross-validation fold) gets its own model instance rather
    than one that was already trained on other data.

    Returns:
        The model configured with ``num_unique_Labels`` output classes and
        the ``id2label`` / ``label2id`` mappings.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_Path,
        num_labels=num_unique_Labels,
        id2label=id2label,
        label2id=label2id,
    )

In [16]:
def tokenize(review, max_length=512):
    """Tokenize a collection of review texts with the shared ``tokenizer``.

    Args:
        review: The texts to encode. May be anything exposing ``.tolist()``
            (e.g. a pandas Series or NumPy array), any other iterable of
            strings, or a single string (treated as one review).
        max_length: Length every sequence is truncated / padded to.
            Defaults to 512, the value previously hard-coded here.

    Returns:
        The tokenizer's output with PyTorch tensors, each row padded or
        truncated to exactly ``max_length`` tokens.
    """
    if isinstance(review, str):
        # A bare string would otherwise be split into single characters.
        texts = [review]
    elif hasattr(review, "tolist"):
        texts = review.tolist()
    else:
        texts = list(review)
    return tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

### Training Arguments Setup

In [17]:
# Optimisation hyper-parameters.
LR = 2e-5
epoch_Numb = 10
batch_size = 8

training_Args = TrainingArguments(
    output_dir=finetune_output,
    # Optimisation schedule.
    learning_rate=LR,
    num_train_epochs=epoch_Numb,
    warmup_steps=100,
    weight_decay=0.01,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; the two strategies match, as
    # required when the best checkpoint is reloaded at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    logging_steps=50,
)

In [18]:
def compute_classification_metrics(eval_pred: EvalPrediction):
    """Compute accuracy and precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair of (predictions, label_ids). Predictions are
            per-class scores; the predicted class is the argmax over the
            last axis.

    Returns:
        Dict with ``accuracy``, ``f1_weighted``, ``precision_weighted``,
        ``recall_weighted`` and ``f1_macro``. The macro F1 is the value the
        cross-validation summary looks up as ``eval_f1_macro``.
    """
    predictions, label_ids = eval_pred
    # NOTE(review): some model heads hand back a tuple of outputs; assume the
    # class scores are the first element in that case - confirm if the model changes.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1)

    accuracy = accuracy_score(y_true=label_ids, y_pred=predicted_ids)
    # zero_division=0 keeps classes that were never predicted from raising warnings.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=label_ids, y_pred=predicted_ids, average='weighted', zero_division=0
    )
    # Macro F1 weighs every class equally, regardless of how many samples it has.
    _, _, f1_macro, _ = precision_recall_fscore_support(
        y_true=label_ids, y_pred=predicted_ids, average='macro', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'f1_weighted': f1,
        'precision_weighted': precision,
        'recall_weighted': recall,
        'f1_macro': f1_macro,
    }

##### just testing the above function

##### HF Dataset

In [19]:
# Encode every training benefits review in a single tokenizer call.
tokenized_Benefits = tokenize(review=drug_train_df['benefitsReview'])

### HF Training Benefits Dataset

In [20]:
# Integer sentiment targets for the full training split.
drug_train_df_sentiment_Label = drug_train_df['sentiment_label']

# Model inputs plus labels for the whole training split.
# The labels go through .tolist() (as the per-fold code does) because handing
# a Series straight to torch.tensor indexes it by label, which fails whenever
# the index is not the default 0..n-1 range.
review_train_Dict = {
    "input_ids": tokenized_Benefits['input_ids'],
    "token_type_ids": tokenized_Benefits['token_type_ids'],
    "attention_mask": tokenized_Benefits['attention_mask'],
    "labels": torch.tensor(drug_train_df_sentiment_Label.tolist()),
}

## Cross-Validation

In [21]:
# Number of cross-validation folds.
N_SPLITS = 5
# Labels used to stratify the folds.
y_split_labels = drug_train_df_sentiment_Label
# One metrics dict is appended here per evaluated fold.
all_fold_metrics = list()
# Shuffled, stratified splitter with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    random_state=42,
    shuffle=True,
)

In [22]:
# Independent copy holding only the review text and its sentiment label.
drug_train_benefits_df = drug_train_df.loc[:, ['benefitsReview', 'sentiment_label']].copy()

In [None]:
def _build_fold_dataset(fold_df):
    """Tokenize one fold's reviews and package them, with labels, as a ``Dataset``.

    Args:
        fold_df: DataFrame slice containing the ``benefitsReview`` and
            ``sentiment_label`` columns.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``labels`` columns.
    """
    encoded = tokenize(fold_df['benefitsReview'])
    return Dataset.from_dict({
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded['token_type_ids'],
        'labels': torch.tensor(fold_df['sentiment_label'].tolist()),
    })


# Stratified k-fold fine-tuning: every fold trains a fresh model on its training
# part, evaluates on its held-out part, and appends the metrics to all_fold_metrics.
for fold_num, (train_idx, val_idx) in enumerate(skf.split(drug_train_benefits_df['benefitsReview'], y_split_labels)):
    print(f"\n===== Starting Fold {fold_num + 1}/{N_SPLITS} =====")
    df_train_fold = drug_train_benefits_df.iloc[train_idx]
    df_val_fold = drug_train_benefits_df.iloc[val_idx]

    print(f"Tokenizing training data for fold {fold_num + 1}...")
    train_dataset_fold = _build_fold_dataset(df_train_fold)
    print(f"Tokenizing validation data for fold {fold_num + 1}...")
    eval_dataset_fold = _build_fold_dataset(df_val_fold)

    print(f"Fold {fold_num + 1}: Train dataset size: {len(train_dataset_fold)}, Eval dataset size: {len(eval_dataset_fold)}")

    # A new model per fold, so no weights carry over from earlier folds.
    model_fold = get_model()

    # Give every fold its own checkpoint directory. The shared arguments are
    # shallow-copied so training_Args itself is left untouched and checkpoints
    # from different folds no longer land in (and get rotated out of) one folder.
    fold_output_dir = f"{finetune_output}/fold_{fold_num + 1}"
    fold_training_args = copy.copy(training_Args)
    fold_training_args.output_dir = fold_output_dir

    trainer_fold = Trainer(
        model=model_fold,
        args=fold_training_args,
        train_dataset=train_dataset_fold,
        eval_dataset=eval_dataset_fold,
        compute_metrics=compute_classification_metrics,
        # `tokenizer=` is deprecated in recent transformers releases;
        # `processing_class` is its replacement.
        processing_class=tokenizer,
    )
    print(f"Training fold {fold_num + 1}...")
    trainer_fold.train()

    print(f"Evaluating fold {fold_num + 1}...")
    metrics = trainer_fold.evaluate()

    all_fold_metrics.append(metrics)
    print(f"Metrics for Fold {fold_num + 1}: {metrics}")

# Printed once, after all folds have finished (it used to repeat on every fold).
print("\n===== Cross-Validation Results Summary =====")

def _report_fold_average(fold_metrics, key):
    """Print and return the mean of ``key`` over the folds that reported it.

    Folds lacking the metric are left out of the mean rather than being
    counted as 0, and the printed fold count is the number of folds
    actually averaged.

    Args:
        fold_metrics: List with one metrics dict per fold.
        key: Metric name to average (e.g. ``'eval_accuracy'``).

    Returns:
        The mean value, or ``None`` when no fold reported the metric.
    """
    values = [m[key] for m in fold_metrics if m and key in m]
    if not values:
        print(f"Warning: no fold reported '{key}'; skipping its average.")
        return None
    mean_value = np.mean(values)
    print(f"Average {key} across {len(values)} folds: {mean_value:.4f}")
    return mean_value


if all_fold_metrics:
    # Metric names as returned by trainer.evaluate(): the keys produced by
    # compute_classification_metrics with "eval_" prepended by the Trainer.
    f1_weighted_key = 'eval_f1_weighted'
    f1_macro_key = 'eval_f1_macro'  # optional; only reported when present
    accuracy_key = 'eval_accuracy'
    precision_weighted_key = 'eval_precision_weighted'
    recall_weighted_key = 'eval_recall_weighted'

    first_fold_metrics = all_fold_metrics[0] or {}

    # If the expected weighted-F1 key is missing, fall back to any key that
    # contains 'f1_weighted' (e.g. after a rename in compute_metrics).
    if f1_weighted_key not in first_fold_metrics:
        print(f"Warning: Key '{f1_weighted_key}' not found in the first fold's metrics.")
        print(f"Available keys in first fold: {first_fold_metrics.keys() if first_fold_metrics else 'N/A'}")
        potential_f1_keys = [k for k in first_fold_metrics.keys() if 'f1_weighted' in k]
        if potential_f1_keys:
            f1_weighted_key = potential_f1_keys[0]
            print(f"Using alternative key for F1 weighted: '{f1_weighted_key}'")
        else:
            f1_weighted_key = None  # Cannot calculate average F1 weighted

    if f1_weighted_key:
        avg_f1_weighted = _report_fold_average(all_fold_metrics, f1_weighted_key)

    # Macro F1 is optional: stay silent when no fold computed it.
    if any(f1_macro_key in (m or {}) for m in all_fold_metrics):
        avg_f1_macro = _report_fold_average(all_fold_metrics, f1_macro_key)

    avg_accuracy = _report_fold_average(all_fold_metrics, accuracy_key)
    avg_precision_weighted = _report_fold_average(all_fold_metrics, precision_weighted_key)
    avg_recall_weighted = _report_fold_average(all_fold_metrics, recall_weighted_key)

    print("\n--- Individual Fold Metrics ---")
    for i, metrics_dict in enumerate(all_fold_metrics):
        print(f"Fold {i + 1}/{N_SPLITS}:")
        for metric_name, metric_value in (metrics_dict or {}).items():
            # Numeric values get 4 decimals; anything else is printed unchanged.
            if isinstance(metric_value, (int, float)):
                print(f"  {metric_name}: {metric_value:.4f}")
            else:
                print(f"  {metric_name}: {metric_value}")
else:
    print("No fold metrics were collected. Ensure your cross-validation loop ran and appended results to 'all_fold_metrics'.")

     


===== Starting Fold 1/5 =====
Tokenizing training data for fold 1...
Tokenizing validation data for fold 1...
Fold 1: Train dataset size: 2485, Eval dataset size: 622


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_fold = Trainer(


Training fold 1...




Epoch,Training Loss,Validation Loss


Python executable being used by this session: /Users/Long/Documents/NLP_SentimentAnalysis_Drug/nlp_env/bin/python
------------------------------
Running: !pip show accelerate (or equivalent)
Output of 'pip show accelerate':
Note: you may need to restart the kernel to use updated packages.

------------------------------
'accelerate' library could NOT be imported. It's likely not installed in this environment or the environment path is incorrect.
------------------------------
Attempting to instantiate TrainingArguments...


  from .autonotebook import tqdm as notebook_tqdm


Transformers version for this attempt: 4.51.3
STILL FAILED with TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'
If you see this with transformers 4.51.3, something is very unusual about your environment or TrainingArguments import.
