In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, accuracy_score
import torch.optim as optim

from transformers.models.bert.modeling_bert import BertOnlyMLMHead


from transformers import (
    BertConfig,
    BertModel,
    AutoConfig,
    PreTrainedModel,
)


from sklearn.metrics import (
    precision_recall_fscore_support,
    fbeta_score,
    accuracy_score,
    classification_report
)

from datetime import datetime
def get_time():
    """Return the current local wall-clock time formatted as ``[HH:MM:SS]``."""
    return f"[{datetime.now():%H:%M:%S}]"

# Define some utility functions

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (plus all CUDA devices when CUDA is available) with the same value.

    Args:
        seed: Integer seed applied to each generator.
    """
    import random
    import numpy as np

    # Collect the seeding callables first, then apply the same seed to each.
    seeders = [random.seed, np.random.seed, torch.manual_seed]
    if torch.cuda.is_available():
        seeders.append(torch.cuda.manual_seed_all)

    for apply_seed in seeders:
        apply_seed(seed)

def classification_report_with_f2(y_true, y_pred, label_names=None, beta=2.0, digits=4):
    """Print and return a per-class precision / recall / F1 / F-beta table.

    Args:
        y_true: 1-D sequence of ground-truth labels.
        y_pred: 1-D sequence of predicted labels, same length as ``y_true``.
        label_names: Optional row names, one per distinct label (labels are
            taken in sorted order). Defaults to ``str(label)``.
        beta: Beta used for the F-beta column. The column is always called
            ``"f2-score"``, matching the default ``beta=2.0``.
        digits: Number of decimals the returned table is rounded to.

    Returns:
        tuple: ``(df, positive_f)`` where ``df`` is the rounded per-class table
        and ``positive_f`` is the (unrounded) F-beta of the second label in
        sorted order, i.e. label ``1`` for 0/1 labels. When only one class
        occurs in ``y_true`` and ``y_pred`` there is no second row: the score
        of that single class is returned if the class is ``1``, otherwise
        ``0.0``.
    """
    # The label set is whatever occurs in either vector, in sorted order, and is
    # passed explicitly so both sklearn calls report classes in the same order.
    labels = sorted(np.unique(np.concatenate([y_true, y_pred])))

    # Same zero_division policy for every metric: undefined scores become 0.
    zd = 0

    p, r, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, zero_division=zd
    )
    f2 = fbeta_score(y_true, y_pred, beta=beta, labels=labels, average=None, zero_division=zd)

    index = [str(l) for l in labels] if label_names is None else list(label_names)
    df = pd.DataFrame(
        {
            "precision": p,
            "recall": r,
            "f1-score": f1,
            "f2-score": f2,
            "support": support.astype(int),
        },
        index=index
    )
    print("\n", df.round(3))

    if len(labels) > 1:
        positive_f = f2[1]
    else:
        # Degenerate case: a single class in both vectors. Indexing f2[1]
        # would raise IndexError, so fall back explicitly.
        positive_f = f2[0] if labels and labels[0] == 1 else 0.0

    return df.round(digits), positive_f

# Load and prepare data

In [None]:
import os
import pickle

load_dir = "./saved_objects"


def _load_pickle(filename):
    """Unpickle ``filename`` from ``load_dir`` and return the stored object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by a trusted run of this project.
    with open(os.path.join(load_dir, filename), 'rb') as handle:
        return pickle.load(handle)


pkl_train_conv = _load_pickle("train_conv.pkl")
pkl_test_conv = _load_pickle("test_conv.pkl")
pkl_unlabeled_msgs = _load_pickle("unlabeled_msgs.pkl")
pkl_train_msgs = _load_pickle("train_msgs.pkl")
pkl_test_msgs = _load_pickle("test_msgs.pkl")
pkl_pretrain_messages_df = _load_pickle("pretrain_messages_df.pkl")


In [None]:
# Path to the Arabic lexicon CSV, relative to the notebook's working directory.
ARABIC_LEXICON_PATH = "../new_arabic_lexicon_17_07.csv"

# Category names of interest (presumably categories of the lexicon above - confirm).
needed_categories = [
    "Past suicidal history",
    "Family suicide history",
    "Suicidal ideation",
    "Hopelessness",
    "Deliberate self harm",
    "Perceived burdensomeness",
]


# Initialize the datasets, dataloaders, optimizer and model, then fine-tune the pretrained model

In [None]:

# --- Model, tokenizer and data locations ------------------------------------
model_id = "./pretrain/mlm & reg arabertv02 large (lr=7e-6)/bert_pretrained"
tokenizer_id = "aubmindlab/bert-large-arabertv02"
conversations_path, messages_path = "arabic_conversations.csv", "arabic_messages.csv"

# --- Reproducibility ---------------------------------------------------------
seed = 42
seed_everything(seed)

# --- Task and data settings --------------------------------------------------
num_labels = 2      # size of the classification head
max_length = 512    # tokenizer truncation / padding length
test_size = 0.3     # forwarded to create_dataloaders

# --- Optimisation settings ---------------------------------------------------
epochs = 30
batch_size = 16
learning_rate = 5e-6
log_every_n = 200   # print the training loss every N batches

# --- Tracking state updated by evaluate() after each epoch -------------------
max_f2_score = last_f2_value = 0   # best F2 so far / F2 at the end of training
best_model = None                  # model with the best F2 so far
best_model_epoch = 0               # epoch at which the best F2 was reached
all_evaluation_epochs = []         # one dict of evaluation metrics per epoch

# --- Output location for this run --------------------------------------------
current_run_time = datetime.now().strftime("%d-%m-%Y-_%H-%M")
save_dir = f"./BERT_saves/run - {current_run_time}"


class SaharDataset(Dataset):
    """Conversation-level dataset built from the seeker side of each conversation.

    Messages whose ``seeker`` flag is set are grouped by ``engagement_id`` and
    joined with ``". "`` into one text per conversation. Each text is paired
    with the ``gsr`` value of the matching row in ``conversations_df``.
    Conversations without a match on either side are dropped by the inner
    merge. All texts are tokenized once, in the constructor.

    Args:
        conversations_df: DataFrame with ``engagement_id`` and ``gsr`` columns.
        messages_df: DataFrame with ``engagement_id``, ``seeker`` (used as a
            boolean mask) and ``text`` columns.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors.
        max_length: Length every text is truncated / padded to.
    """

    def __init__(self, conversations_df, messages_df, tokenizer, max_length=max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Step 1: Filter messages to seekers only
        seeker_messages = messages_df[messages_df['seeker']]
        print(f"seeker_messages: {len(seeker_messages)}, unique_seeker_messages_engagment_id: {seeker_messages.engagement_id.unique().shape}")

        # Step 2: Group by engagement_id and concatenate text
        grouped = seeker_messages.groupby('engagement_id')['text'].apply(self._join_texts)
        print(f"convs: grouped:{len(grouped)}, unique_msgs_engagment_id: {messages_df.engagement_id.unique().shape}, total convs:", len(conversations_df))

        # Step 3: Merge with conversations to get labels
        merged = pd.merge(grouped, conversations_df[['engagement_id', 'gsr']], on='engagement_id')

        self.labels = merged['gsr'].tolist()

        # Step 4: Tokenize all texts now
        self.encodings = tokenizer(
            merged['text'].tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        self.texts = merged['text'].values

    @staticmethod
    def _join_texts(texts):
        """Join one conversation's messages with ". ", skipping missing texts.

        Missing values (NaN) and non-string cells would make ``str.join``
        raise ``TypeError``, so missing entries are dropped and the rest are
        coerced to ``str`` first.
        """
        return '. '.join(texts.dropna().astype(str))

    def __len__(self):
        """Number of conversations in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the token ids, attention mask, label and raw text of item ``idx``."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx],
            "text": self.texts[idx]
        }

        
def create_dataloaders(conversations_df, messages_df, tokenizer, batch_size=16, seed=42, test_size=0.2, max_length=max_length):
    """Build the train and test DataLoaders from the pre-computed conversation splits.

    The split is not computed here: it is read from the module-level
    ``pkl_train_conv`` / ``pkl_test_conv`` frames. ``conversations_df``,
    ``seed`` and ``test_size`` are accepted but not used by this function.

    Args:
        conversations_df: Unused (see above).
        messages_df: DataFrame of messages with an ``engagement_id`` column.
        tokenizer: Tokenizer handed to ``SaharDataset``.
        batch_size: Batch size of both loaders.
        seed: Unused (see above).
        test_size: Unused (see above).
        max_length: Token length handed to ``SaharDataset``.

    Returns:
        tuple: ``(train_loader, test_loader)``; only the train loader shuffles.
    """
    def _dataset_for(split_conv):
        # Keep only the messages that belong to this split's conversations.
        split_ids = set(split_conv['engagement_id'].values)
        split_msgs = messages_df[messages_df['engagement_id'].isin(split_ids)]
        return SaharDataset(split_conv, split_msgs, tokenizer, max_length=max_length)

    train_dataset = _dataset_for(pkl_train_conv)
    test_dataset = _dataset_for(pkl_test_conv)

    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(test_dataset, batch_size=batch_size),
    )

    
def evaluate(model, data_loader, device, curr_epoch):
    """Evaluate ``model`` on ``data_loader`` and update the run-level trackers.

    Called after each training epoch. Prints the epoch number and the
    per-class report, appends the metrics of the report's second row to the
    module-level ``all_evaluation_epochs`` list, and updates the module-level
    ``max_f2_score`` / ``best_model`` / ``best_model_epoch`` /
    ``last_f2_value`` trackers.

    Args:
        model: Model whose output exposes ``.logits``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the inputs are moved to before the forward pass.
        curr_epoch: 1-based number of the epoch that just finished.

    Returns:
        The F2 score returned by ``classification_report_with_f2``.
    """
    global max_f2_score
    global best_model
    global best_model_epoch
    global last_f2_value

    import copy  # local import: not part of the notebook's import cell

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so no device round-trip is needed.
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Epoch: {curr_epoch}")
    metrics_df, f2 = classification_report_with_f2(all_labels, all_preds)

    all_evaluation_epochs.append(metrics_df.iloc[1, :].to_dict())

    if max_f2_score < f2: # save best version
        max_f2_score = f2
        # Snapshot the weights: keeping a reference to `model` would make
        # `best_model` follow every later optimizer step, so the "best"
        # checkpoint would silently hold the final weights instead.
        # The copy is moved to the CPU so it does not keep device memory
        # occupied (the deepcopy itself briefly needs room for a second
        # copy of the model on the current device).
        best_model = copy.deepcopy(model).to("cpu")
        best_model_epoch = curr_epoch

    # Always track the most recent score; after the final epoch this is the
    # end-of-training F2 regardless of how many epochs the caller asked for.
    last_f2_value = f2

    return f2


def load_model(model_dir = "./training_test"):
    """Load a saved tokenizer and sequence-classification model from one directory.

    Args:
        model_dir: Directory (or model identifier) handed to both
            ``from_pretrained`` calls.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # The tokenizer is loaded first, then the model; the tuple is model-first.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return AutoModelForSequenceClassification.from_pretrained(model_dir), loaded_tokenizer


def train(
    model,
    train_loader,
    test_loader,
    optimizer,
    loss_fn,
    device,
    epochs=3,
    scheduler=None,
    log_every_n=10
):
    """Fine-tune ``model``, evaluating and checkpointing after every epoch.

    Each epoch runs one pass over ``train_loader``, then calls ``evaluate`` on
    ``test_loader`` and saves the model under the module-level ``save_dir``.
    After the last epoch, the module-level ``best_model`` (as recorded by
    ``evaluate``) is saved as well, if one was recorded.

    Args:
        model: Model whose output exposes ``.logits``.
        train_loader: Batches with ``input_ids``, ``attention_mask``, ``labels``.
        test_loader: Loader passed to ``evaluate`` after each epoch.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning the loss tensor.
        device: Device the model and the batches are moved to.
        epochs: Number of passes over ``train_loader``.
        scheduler: Optional LR scheduler, stepped once per batch when given.
        log_every_n: Print the running loss every this many batches.

    Returns:
        The same ``model`` object, in its state after the final epoch.
    """
    model.to(device)

    for epoch_num in range(1, epochs + 1):
        model.train()

        for step, batch in enumerate(train_loader, start=1):
            inputs = {key: batch[key].to(device) for key in ('input_ids', 'attention_mask')}
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            loss = loss_fn(model(**inputs).logits, labels)
            loss.backward()
            optimizer.step()

            if scheduler:
                scheduler.step()

            if step % log_every_n == 0:
                print(f"Epoch [{epoch_num}/{epochs}] Batch [{step}/{len(train_loader)}] Loss: {loss.item():.4f}")

        # End of epoch: score on the held-out loader, then checkpoint this epoch.
        f2 = evaluate(model, test_loader, device, curr_epoch=epoch_num)
        model.save_pretrained(f"{save_dir}/epoch={epoch_num}_f2={f2:.3f}__{current_run_time}")

    if best_model is not None:
        best_model.save_pretrained(f"{save_dir}/best f2={max_f2_score:.3f}_epoch={best_model_epoch}_{current_run_time}")

    return model

# ---------------------------------------------------------------------------
# Driver: load model + data, fine-tune, then save the final model, the
# tokenizer and the per-epoch evaluation table under `save_dir`.
# ---------------------------------------------------------------------------
print("Loading model and tokenizer...")

# The tokenizer is loaded from `tokenizer_id`, the classifier weights from
# `model_id`; `num_labels` sets the size of the classification head.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Loading conversations and messages...")
conversations = pd.read_csv(conversations_path)
messages = pd.read_csv(messages_path)

print("Creating dataloaders...")
# NOTE: create_dataloaders takes its train/test split from the pickled
# conversation frames (pkl_train_conv / pkl_test_conv), not from
# `conversations`, `seed` or `test_size`.
train_loader, test_loader = create_dataloaders(conversations, messages, tokenizer, batch_size=batch_size, seed=seed, test_size=test_size, max_length=max_length)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# train() steps the scheduler once per batch, so the schedule length is
# (number of epochs) x (batches per epoch).
num_training_steps = epochs * len(train_loader)
# Linear schedule with no warm-up steps.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("Starting training...\n")
model = train(
    model,
    train_loader,
    test_loader,
    optimizer,
    loss_fn,
    device,
    epochs=epochs,
    scheduler=scheduler,
    log_every_n=log_every_n
)

# Directory for the model as it stands after the last epoch; the name embeds
# the F2 value that evaluate() stored in `last_f2_value`.
train_finish_save_dir = f"{save_dir}/last_epoch={epochs}_f2={last_f2_value:.3f}"


# Save the final model together with the tokenizer in the same directory.
model.save_pretrained(train_finish_save_dir)
tokenizer.save_pretrained(train_finish_save_dir)

# One row per epoch: the metrics evaluate() appended to `all_evaluation_epochs`
# (taken from the second row of each epoch's report), numbered from 1.
all_evaluation_epochs_df = pd.DataFrame(all_evaluation_epochs)
all_evaluation_epochs_df['epoch'] = np.arange(1, len(all_evaluation_epochs_df)+1)
# Written as CSV content to a file named "eval_by_epoch" (no file extension).
all_evaluation_epochs_df[["epoch", 'precision','recall','f1-score','f2-score', 'support']].to_csv(f"{save_dir}/eval_by_epoch", index=False)
print(f"Model and tokenizer saved to {save_dir}")

