In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Install a blanket "ignore" filter: with no category/message/module given it
# matches every warning, so all Python warnings raised after this point in the
# session are hidden (including deprecation notices from the libraries above).
warnings.filterwarnings("ignore")

In [2]:
# Reproducibility: push one seed into every RNG this notebook touches
# (Python, NumPy, PyTorch CPU, and all CUDA devices).
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)
# Ask cuDNN for deterministic kernels
torch.backends.cudnn.deterministic = True

# Hyperparameters
batch_size = 2          # examples per optimisation step
epochs = 3              # full passes over the training split
learning_rate = 2e-5    # AdamW step size

# Label vocabulary for the multi-label task; "Other" is the catch-all class
valid_labels = [
    "דיכאון ועצבות קשה",
    "פציעה עצמית",
    "טראומה מינית",
    "Other",
]

In [None]:
# Load pretrained model and tokenizer
# The Hugging Face access token is read from the HF_TOKEN environment variable so
# that no credential is ever stored in the notebook. When the variable is unset,
# `token` is None and the libraries fall back to whatever login is cached locally.
token = os.environ.get("HF_TOKEN")
model_name = 'google/gemma-2-9b'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    load_in_4bit=True,  # 4-bit quantization
    device_map="auto",  # Automatically map layers to available devices
    num_labels=len(valid_labels),  # One output logit per label
    # State the task explicitly so the loss is BCE-with-logits (independent
    # sigmoid per label) rather than being inferred from the label dtype.
    problem_type="multi_label_classification",
    token=token,
)

# LoRA Configuration
lora_config = LoraConfig(
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # LoRA applied to attention projections
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",
    # Without a task type PEFT wraps the model generically and freezes every base
    # weight, including the freshly initialised classification head. SEQ_CLS
    # keeps that head trainable (and saved) alongside the LoRA adapters.
    task_type="SEQ_CLS",
)

# Apply LoRA to the model
bert_model = get_peft_model(bert_model, lora_config)

# `device_map="auto"` has already placed the quantized weights, so the model must
# not be moved again with `.to(...)`. Instead, record where its first parameter
# lives; input batches are sent to this device in the training/evaluation loops.
device = next(bert_model.parameters()).device

In [None]:
## Loading Data
conv_info_path = 'conv_info.csv'
messages_path = 'messages_anonymized.csv'

# Conversation-level table; the join key is cast to str so it has the same
# type in both tables.
conv_info_df = pd.read_csv(conv_info_path).assign(
    engagement_id=lambda frame: frame['engagement_id'].astype(str)
)

# Message-level table: cast the join key to str, drop rows with no text,
# then replace missing sender names with '-'.
messages_df = (
    pd.read_csv(messages_path)
    .assign(engagement_id=lambda frame: frame['engagement_id'].astype(str))
    .loc[lambda frame: frame['text'].notna()]
    .assign(name=lambda frame: frame['name'].fillna('-'))
)

In [None]:
# Collect, for one conversation row, the label names it should carry (falling back to "Other")
def assign_multi_labels(row, label_names=None, fallback_label="Other"):
    """Return the list of label names that apply to one conversation row.

    A label applies when it equals the row's 'subject_1', 'subject_2' or
    'subject_3' value. The result follows the order of ``label_names`` and
    never contains duplicates coming from repeated subjects.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must support ``row['subject_1']``, ``row['subject_2']`` and
        ``row['subject_3']``.
    label_names : sequence of str, optional
        Labels to look for. Defaults to the notebook-level ``valid_labels``.
        ``fallback_label`` is skipped wherever it appears in this sequence, so
        it does not have to be the last element.
    fallback_label : str, default "Other"
        Label returned on its own when none of the other labels match.

    Returns
    -------
    list of str
        Matching labels, or ``[fallback_label]`` when there is no match.

    Raises
    ------
    KeyError
        If the row lacks one of the three subject fields.
    """
    if label_names is None:
        label_names = valid_labels

    # Read the three subject fields once instead of on every loop iteration
    subjects = (row['subject_1'], row['subject_2'], row['subject_3'])

    matched = [
        label
        for label in label_names
        if label != fallback_label and label in subjects
    ]
    # An empty match list means the conversation belongs to the catch-all class
    return matched or [fallback_label]


In [5]:
# Attach the list of applicable label names to every conversation row
conv_info_df['labels'] = conv_info_df.apply(assign_multi_labels, axis=1)

# Build one row per conversation:
#   1. join messages with their conversation info,
#   2. keep only the rows flagged as written by the help seeker,
#   3. concatenate each conversation's texts with a literal '[SEP]' separator
#      and keep the conversation's label list.
merged_df = (
    messages_df
    .merge(conv_info_df, on='engagement_id')
    .loc[lambda frame: frame['seeker'] == True]
    .groupby('engagement_id')
    .agg({'text': '[SEP]'.join, 'labels': 'first'})
    .reset_index()
)

# Encode each label list as a 0/1 vector whose columns follow `valid_labels`
mlb = MultiLabelBinarizer(classes=valid_labels)
label_matrix = mlb.fit_transform(merged_df['labels'])
merged_df['label'] = label_matrix.tolist()

# Random 80/20 train/test split, seeded for repeatability.
# Note: no `stratify` argument is passed, so the split is NOT stratified by label.
train_df, test_df = train_test_split(merged_df, random_state=seed, test_size=0.2)

In [None]:
## Tokenization
def tokenize(batch):
    """Tokenize the 'text' field of a batch, truncating/padding every example to 512 tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=512, padding='max_length')


def _encode_split(frame):
    """Turn one DataFrame split into a tokenized Dataset that yields torch tensors."""
    encoded = Dataset.from_pandas(frame).map(tokenize, batched=True, batch_size=16)
    # Expose only the columns the training/evaluation loops read
    encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return encoded


train_dataset = _encode_split(train_df)
test_dataset = _encode_split(test_df)

# Shuffle training batches only; evaluation keeps the dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Model Training
# Hand the optimizer only the parameters that can actually receive gradients;
# frozen weights would never be updated anyway.
trainable_params = [param for param in bert_model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
bert_model.train()

# The context manager guarantees the bar is closed even if a step raises
with tqdm(total=epochs * len(train_loader), desc="Training") as progress_bar:
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Binary label vectors must be float for the multi-label loss.
            # `as_tensor` converts without the copy-construct warning that
            # `torch.tensor(existing_tensor)` triggers.
            labels = torch.as_tensor(batch['label'], dtype=torch.float32).to(device)

            outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Surface the running loss so training progress is visible
            progress_bar.set_postfix(epoch=epoch + 1, loss=loss.item())
            progress_bar.update(1)

In [None]:
## Evaluating the Model
# Sigmoid probability above which a label counts as predicted
prediction_threshold = 0.5

bert_model.eval()
labels = []
preds = []

for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Ground truth is only collected for the report, so it never needs to visit
    # the GPU. `as_tensor` avoids copy-constructing from an existing tensor.
    label = torch.as_tensor(batch['label'], dtype=torch.float32)

    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    # Multi-label: an independent sigmoid per label, then a per-label threshold
    predictions = (torch.sigmoid(logits) > prediction_threshold).int()

    # Extending with a 2-D array appends one row (one example) at a time
    labels.extend(label.cpu().numpy())
    preds.extend(predictions.cpu().numpy())

# Convert the predictions and true labels to numpy arrays
labels = np.array(labels)
preds = np.array(preds)

In [None]:
# Build the multi-label classification report as a dict, then lay it out
# with one row per label / average and one column per metric.
report_dict = classification_report(
    y_true=labels,
    y_pred=preds,
    target_names=valid_labels,
    output_dict=True,
)
report_df = pd.DataFrame(report_dict).T

# Save the report table as an Excel workbook
file_path = "gemma_multilabel_classification_report.xlsx"
report_df.to_excel(file_path)

# Leave the table as the cell's last expression so the notebook renders it
report_df