In [1]:
import os
import warnings

import pandas as pd
import numpy as np
from tqdm import tqdm_pandas
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizerFast, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification, BertTokenizer,AutoModelForCausalLM,BitsAndBytesConfig
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,  roc_auc_score, fbeta_score
from huggingface_hub import login
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import autocast, GradScaler

warnings.filterwarnings("ignore")

In [None]:
# Training hyperparameters (tune these here; later cells read them by name)
learning_rate = 2e-5  # step size passed to the optimizer
epochs = 3            # number of full passes over the training loader
batch_size = 2        # examples per DataLoader batch

## Creating Model and Tokenizer

In [None]:
# Hugging Face access token. It is read from the HF_TOKEN environment variable so
# the secret never lives in the notebook; when the variable is unset, `None` is
# passed and the libraries fall back to whatever credentials they already have.
token = os.environ.get("HF_TOKEN")

model_name = 'google/gemma-2-9b'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Load the base model with 4-bit (bitsandbytes) quantization. The quantization
# settings go through BitsAndBytesConfig rather than a bare `load_in_4bit` kwarg.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # binary classification (the evaluation reads the class-1 probability)
    quantization_config=quantization_config,
    device_map="auto",  # Automatically allocates layers to the available devices
    token=token,
)

# The classification head needs a pad token id to handle padded batches;
# fall back to the tokenizer's id if the model config does not define one.
if bert_model.config.pad_token_id is None:
    bert_model.config.pad_token_id = tokenizer.pad_token_id

lora_config = LoraConfig(
    task_type="SEQ_CLS",  # sequence classification: keeps the classification head trainable
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor for LoRA
    target_modules=["q_proj", "v_proj"],  # Layers to apply LoRA
    lora_dropout=0.1,  # Dropout rate for LoRA layers
    bias="none",  # Bias configuration
)

# Apply LoRA to the model
bert_model = get_peft_model(bert_model, lora_config)

# Device that input batches are moved to in the training/evaluation cells.
# The model itself is NOT moved with `.to(device)`: `device_map="auto"` has
# already placed the quantized weights, and moving them again is unnecessary.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading Data

In [None]:
# Input files: per-conversation metadata and the individual messages.
conv_info_path = 'conv_info.csv'
messages_path = 'messages_anonymized.csv'

# Conversation metadata, with the join key normalised to str.
conv_info_df = pd.read_csv(conv_info_path).assign(
    engagement_id=lambda frame: frame['engagement_id'].astype(str)
)

# Messages: same key normalisation, rows without text dropped,
# and missing sender names replaced by '-'.
messages_df = (
    pd.read_csv(messages_path)
    .assign(engagement_id=lambda frame: frame['engagement_id'].astype(str))
    .loc[lambda frame: frame['text'].notna()]
    .assign(name=lambda frame: frame['name'].fillna('-'))
)

In [None]:
# Keep only rows whose engagement id appears in the conversation metadata.
# `ids` is taken from conv_info_df itself, so the first filter is expected to
# keep every row; the second drops messages that have no metadata entry.
ids = conv_info_df['engagement_id']
conv_info_df = conv_info_df.loc[lambda frame: frame['engagement_id'].isin(ids)]
messages_df = messages_df.loc[lambda frame: frame['engagement_id'].isin(ids)]

In [None]:
# Create a binary label: 1 when a conversation is tagged with the target subject
# (default 'פציעה עצמית', Hebrew, reads as "self-injury"), otherwise 0.
def create_binary_label(df, subject='פציעה עצמית', subject_columns=('subject_1', 'subject_2', 'subject_3')):
    """Return a copy of ``df`` with an integer ``label`` column added.

    Args:
        df: DataFrame holding the columns listed in ``subject_columns``.
        subject: Value to look for; a row is positive when any subject
            column equals it exactly.
        subject_columns: Names of the columns that are compared to ``subject``.

    Returns:
        A new DataFrame (the input is left untouched) whose ``label`` column is
        1 for matching rows and 0 otherwise. Missing values never match.

    Raises:
        KeyError: if one of ``subject_columns`` is not present in ``df``.
    """
    # Vectorised comparison instead of a row-wise apply; also works on an empty frame.
    is_match = df[list(subject_columns)].eq(subject).any(axis=1)
    return df.assign(label=is_match.astype('int64'))

# Attach the binary `label` column to the conversation metadata.
conv_info_df = conv_info_df.pipe(create_binary_label)

## Preprocessing Data

In [None]:
# Join every message with its conversation's metadata (and therefore its label),
# keep only the help seeker's messages, then collapse each conversation into a
# single row: all of its texts joined by spaces, plus the conversation label.
merged_df = (
    messages_df
    .merge(conv_info_df, on='engagement_id')
    .loc[lambda frame: frame['seeker'] == True]
    .groupby('engagement_id')
    .agg({'text': ' '.join, 'label': 'first'})
    .reset_index()
)

# Split into train and test, stratified by label. `random_state` is fixed so the
# split, and every metric computed from it, is reproducible across kernel restarts.
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.2,
    stratify=merged_df['label'],
    random_state=42,
)

## Create Dataloaders

In [None]:
def tokenize(batch):
    """Tokenize a batch of texts, padding/truncating each one to exactly 512 tokens."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)

# Build a Dataset per split and tokenize it in chunks of 16 rows.
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=16)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True, batch_size=16)

# Expose only the model inputs and the label, as PyTorch tensors.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Training batches are shuffled; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Training the Model

In [None]:
# Sanity check: pull a single batch from the training loader, show it, and stop.
batch_iterator = iter(train_loader)
for batch in batch_iterator:
    print(batch)
    break

In [None]:
# Hand the optimizer only the parameters that require gradients, so frozen
# weights are not tracked by AdamW.
trainable_params = [param for param in bert_model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
bert_model.train()

total_steps = epochs * len(train_loader)

# The context manager closes the progress bar even if a training step raises.
with tqdm(total=total_steps, desc="Training") as progress_bar:
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Named `batch_labels` so it is not confused with the `labels`
            # list that the evaluation cell builds.
            batch_labels = batch['label'].to(device)

            # Passing `labels` makes the model return the loss alongside the logits.
            outputs = bert_model(input_ids, attention_mask=attention_mask, labels=batch_labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Show the current epoch and loss so divergence is visible while training.
            progress_bar.set_postfix(epoch=epoch + 1, loss=loss.item())
            progress_bar.update(1)

## Evaluating the Model

In [None]:
# Run the model over the test loader and collect, per example, the true label,
# the predicted class and the predicted probability of class 1.
bert_model.eval()
labels = []
preds = []
pred_probs = []

for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are only collected for scoring, so they never need to visit the device.
    label = batch['label']

    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Cast to float32 before softmax: the logits may come back in half precision,
    # which is less precise for softmax and (for bfloat16) cannot be turned into
    # NumPy values.
    logits = outputs.logits.float()
    probabilities = torch.softmax(logits, dim=-1)
    predictions = torch.argmax(logits, dim=-1)

    labels.extend(label.tolist())
    preds.extend(predictions.tolist())
    pred_probs.extend(probabilities[:, 1].tolist())

In [None]:
# Metrics on hard predictions (accuracy, precision, recall, F1, F2) plus
# ROC-AUC, which is computed from the class-1 probabilities instead.
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
roc_auc = roc_auc_score(labels, pred_probs)
f2 = fbeta_score(labels, preds, beta=2)

# One metric per line, in the same order as before.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1: {f1}',
    f'ROC-AUC: {roc_auc}',
    f'F2: {f2}',
    sep='\n',
)