In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_pandas
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizerFast, Trainer, AutoTokenizer,TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification, BertTokenizer,AutoModelForCausalLM,BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainerCallback
from transformers import get_scheduler, DataCollatorWithPadding

import torch
import torch.nn as nn
from torch.optim import AdamW
from datasets import Dataset
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, fbeta_score, precision_score

from huggingface_hub import login
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from torch.cuda.amp import autocast, GradScaler

import evaluate

import warnings
warnings.filterwarnings("ignore")

import wandb
from bitsandbytes.optim import AdamW8bit

from sklearn.metrics import (
    precision_recall_fscore_support,
    fbeta_score,
    accuracy_score,
    classification_report
)


import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.optim import AdamW
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score

from torchmetrics.functional import f1_score, accuracy
from sklearn.metrics import precision_score, recall_score
from transformers import get_linear_schedule_with_warmup

In [None]:
# Notebook-level hyper-parameters (read as globals by later cells).
batch_size, epochs = 1, 10   # DataLoader batch size / nominal epoch count
learning_rate = 7e-6         # base learning rate
num_labels = 2               # number of target classes

In [None]:
# Log in to Weights & Biases so that later cells can log runs.
wandb.login()

# Load QLoRA model

In [None]:
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", device)

# Prefer the HF_TOKEN environment variable so the secret is never pasted into the
# notebook; the placeholder keeps the previous "edit this string" workflow working.
token = os.environ.get("HF_TOKEN", "your hugging face token here")

# get_device_capability() raises when no CUDA device exists, so only query it when
# CUDA is available; (0, 0) makes the bf16 check below fall through to float16.
cap = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
use_bf16 = cap[0] >= 8  # Ampere+ usually ok
dtype = torch.bfloat16 if use_bf16 else torch.float16

model_name = "QCRI/Fanar-1-9B-Instruct"

# 4-bit NF4 quantisation with double quantisation; compute runs in `dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

print("loading tokenizer")
# Use model_name so tokenizer and model can never point at different checkpoints.
tok = AutoTokenizer.from_pretrained(model_name, token=token)
if tok.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tok.pad_token = tok.eos_token
tok.padding_side = "right"
tok.truncation_side = "right"

print("loading model")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    num_labels=2,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=dtype,
    attn_implementation="eager",   # avoid CUDA kernel asserts
)

# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tok.pad_token_id

# define classification head to replace LM head
class ReducedLinear(nn.Module):
    """Bias-free linear head that keeps only selected rows of an existing head weight.

    The selected rows are copied into a new trainable parameter, so the head emits
    one logit per kept row instead of one per vocabulary entry.

    Args:
        in_features: Input width of the head being replaced. Not used by this class;
            kept so existing call sites stay valid.
        original_weight: 2-D weight matrix of the head being replaced, indexed as
            ``[row, hidden]``. Required.
        token_ids: Row indices to keep, in output order. The default
            ``(125596, 125594)`` are, per the original author's note, the token ids
            of '0' and '1' used by the prompt.

    Raises:
        ValueError: If ``original_weight`` is not provided.
    """

    def __init__(self, in_features, original_weight=None, token_ids=(125596, 125594)):
        super().__init__()
        if original_weight is None:
            raise ValueError("ReducedLinear requires `original_weight` (the LM head weight matrix)")
        # Advanced indexing copies the rows; detach() keeps the new parameter out of
        # any autograd graph attached to the source weight.
        self.weight = nn.Parameter(original_weight.detach()[list(token_ids), :])

    def forward(self, x):
        """Project ``x`` of shape ``[..., hidden]`` to ``[..., len(token_ids)]``."""
        return torch.matmul(x, self.weight.t())

# Swap the LM head for the reduced head, built from the current head's own weights.
model.lm_head = ReducedLinear(model.lm_head.in_features, model.lm_head.weight)

# Prepare the quantised model for k-bit training, with gradient checkpointing on.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

print("adding QLoRA adapters")
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Attention projections followed by the MLP projections.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] + ["gate_proj", "up_proj", "down_proj"],
    # The replaced head is listed so PEFT treats it as a module to save.
    modules_to_save=["lm_head"],
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# Prepare data

In [None]:
# CSV file names for conversations and messages.
conversations_path = "arabic_conversations.csv"
messages_path = "arabic_messages.csv"

# Arabic lexicon file, its phrase column and its category count.
ARABIC_LEXICON_PATH = "new_arabic_lexicon_17_07.csv"
LEXICON_ARABIC_PHRASE_COLUMN = "Arabic Phrase (Linor Translation / Approval)"
TOTAL_LEXICON_CATEGORIES = 47

# MODEL_NAME = 'UBC-NLP/MARBERTv2'

# Train/test split fraction and random seed.
test_size, seed = 0.3, 42

## Load saved dataset splits from pickles

In [None]:
import pickle, os
load_dir = "/home/swaida/master/saved_objects"


def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``load_dir`` and close the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(os.path.join(load_dir, file_name), 'rb') as fh:
        return pickle.load(fh)


pkl_train_conv = _load_pickle("train_conv.pkl")
pkl_test_conv = _load_pickle("test_conv.pkl")
pkl_unlabeled_msgs = _load_pickle("unlabeled_msgs.pkl")
pkl_train_msgs = _load_pickle("train_msgs.pkl")
pkl_test_msgs = _load_pickle("test_msgs.pkl")
pkl_pretrain_messages_df = _load_pickle("pretrain_messages_df.pkl")

# Keep only rows flagged by the boolean `seeker` column, then join each
# conversation's messages into one string per engagement_id.
seeker_messages_train = pkl_train_msgs[pkl_train_msgs['seeker']]
full_convs_train = seeker_messages_train.groupby('engagement_id')['text'].apply(lambda texts: '. '.join(texts))

seeker_messages_test = pkl_test_msgs[pkl_test_msgs['seeker']]
full_convs_test = seeker_messages_test.groupby('engagement_id')['text'].apply(lambda texts: '. '.join(texts))

In [None]:
# The union of message ids and conversation ids must be exactly as large as the
# conversation table, i.e. messages introduce no unknown engagement_id
# (assuming engagement_id is unique within pkl_train_conv).
assert len(set(pkl_train_msgs.engagement_id).union(pkl_train_conv.engagement_id)) == len(pkl_train_conv)

# Stack the three subject columns into one Series and count each subject.
all_subjects = pd.concat(
    [pkl_train_conv[col] for col in ("subject_1", "subject_2", "subject_3")]
).dropna()
all_subjects.value_counts()

In [None]:
def create_binary_label(df, positive_subject='טראומה מינית',
                        subject_columns=('subject_1', 'subject_2', 'subject_3')):
    """Add a 0/1 ``label`` column marking rows tagged with ``positive_subject``.

    A row is labelled 1 when any of ``subject_columns`` equals ``positive_subject``
    exactly, otherwise 0. Missing values never match.

    Args:
        df: DataFrame containing every column named in ``subject_columns``.
            It is modified in place.
        positive_subject: Subject string that defines the positive class.
        subject_columns: Columns to search for ``positive_subject``.

    Returns:
        The same ``df`` object, with an int64 ``label`` column added or overwritten.
    """
    # Vectorised comparison instead of a Python-level row-wise apply; it also
    # yields an int64 column for an empty frame.
    is_positive = df[list(subject_columns)].eq(positive_subject).any(axis=1)
    df['label'] = is_positive.astype('int64')
    return df

def plot_label_distribution(df_train, df_test):
    """Draw side-by-side pie charts of the ``label`` counts of two frames.

    Args:
        df_train: Frame with a ``label`` column, shown in the left pie.
        df_test: Frame with a ``label`` column, shown in the right pie.
    """
    panels = [
        ('Train Label Distribution', df_train['label'].value_counts()),
        ('Test Label Distribution', df_test['label'].value_counts()),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    for ax, (title, counts) in zip(axes, panels):
        total = counts.sum()
        # Each wedge shows the label value, its count and its share in percent.
        wedge_labels = [
            f'{label}\n{count} ({count / total * 100:.1f}%)'
            for label, count in zip(counts.index, counts)
        ]
        ax.pie(counts, labels=wedge_labels, startangle=90)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()
    
pkl_train_conv = create_binary_label(pkl_train_conv)
pkl_test_conv = create_binary_label(pkl_test_conv)

# Pool the pickled splits and redo a stratified split on the new binary label.
# Uses the notebook-level `test_size` / `seed` instead of repeating their values.
# NOTE: this cell reassigns its own inputs, so re-running it without first
# re-running the pickle-loading cell splits the already re-split data.
concated = pd.concat([pkl_train_conv, pkl_test_conv])
pkl_train_conv, pkl_test_conv = train_test_split(
    concated, test_size=test_size, stratify=concated['label'], random_state=seed
)

# A conversation in both splits would leak its messages from test into train.
assert set(pkl_train_conv['engagement_id']).isdisjoint(pkl_test_conv['engagement_id']), \
    "engagement_id appears in both train and test splits"

all_msgs = pd.concat([pkl_train_msgs, pkl_test_msgs])

# Re-assign every message to the split its conversation landed in.
pkl_train_msgs = all_msgs[all_msgs['engagement_id'].isin(pkl_train_conv['engagement_id'].unique())]
pkl_test_msgs = all_msgs[all_msgs['engagement_id'].isin(pkl_test_conv['engagement_id'].unique())]

# Rebuild the per-conversation joined text for the new splits.
seeker_messages_train = pkl_train_msgs[pkl_train_msgs['seeker']]
full_convs_train = seeker_messages_train.groupby('engagement_id')['text'].apply(lambda texts: '. '.join(texts))

seeker_messages_test = pkl_test_msgs[pkl_test_msgs['seeker']]
full_convs_test = seeker_messages_test.groupby('engagement_id')['text'].apply(lambda texts: '. '.join(texts))


plot_label_distribution(pkl_train_conv, pkl_test_conv)

In [None]:
from torch.nn.utils.rnn import pad_sequence


def insert_prompt(conv_text, without_answer=True, domain_phrases=None):
    """Wrap a conversation in the Arabic 1/0 classification prompt.

    Args:
        conv_text: Conversation text, placed after the conversation marker.
        without_answer: When True (default) return only the instruction followed
            by the conversation. When False, also append the answer marker, with
            an optional domain-knowledge section after the instruction.
        domain_phrases: Text listed under the domain-knowledge header when
            ``without_answer`` is False. If None, a module-level
            ``SUICIDE_PHRASES`` is used when one exists; otherwise the
            domain-knowledge section is left out.

    Returns:
        The assembled prompt string.
    """
    prompt = "النص التالي هو محادثة لشخص مع طبيب نفسي, هل يقول الشخص انه يتعرض لمضايقة جنسية او تحرش جنسي؟ اجب فقط 1(نعم) او 0(لا) باللغة العربية"
    # NOTE(review): this header talks about suicide / self-harm indicators while the
    # instruction above asks about sexual harassment; confirm it is intended.
    mention_domain_knowledge = "الجمل التالية هي مؤشرات لرغبة بالانتحار او ايذاء النفس:"
    # NOTE(review): possibly a typo for "المحادثة:"; left unchanged because editing
    # it changes the text fed to the model.
    the_text = "المجادثة:"
    answer = "الاجابة:"

    # Indentation inside the prompt is spelled out explicitly so it does not depend
    # on trailing whitespace in the source surviving editors and formatters.
    if without_answer:
        pad = " " * 12
        return f"{prompt}\n{pad}\n{pad}\n{pad}{the_text} {conv_text}"

    if domain_phrases is None:
        # This branch used to read an undefined global and raise NameError;
        # fall back to it only if the notebook actually defines it.
        domain_phrases = globals().get("SUICIDE_PHRASES")

    pad = " " * 8
    lines = [prompt]
    if domain_phrases is not None:
        lines += [f"{pad}{mention_domain_knowledge}", f"{pad}{domain_phrases}"]
    lines += [" " * 4, pad, f"{pad}{the_text} {conv_text}", pad, f"{pad}{answer} "]
    return "\n".join(lines)

    
def change_dataset_content_structure():
    """Join labels with conversation text and add the prompted text column.

    Reads the notebook globals ``pkl_train_conv``, ``pkl_test_conv``,
    ``full_convs_train`` and ``full_convs_test``.

    Returns:
        ``(train_df, test_df)``, each with columns ``engagement_id``, ``label``,
        ``text`` and ``text_prompt_and_conv`` (``text`` passed through
        ``insert_prompt``).
    """
    kept_columns = ['engagement_id', 'label', 'text']

    def attach_text(conv_df, joined_convs):
        # Inner join: only conversations that have joined text survive.
        merged = pd.merge(conv_df, joined_convs.reset_index(), on="engagement_id", how="inner")
        return merged[kept_columns]

    train_df = attach_text(pkl_train_conv, full_convs_train)
    test_df = attach_text(pkl_test_conv, full_convs_test)

    # Every joined conversation must have matched exactly one labelled row.
    assert len(train_df) == len(full_convs_train)
    assert len(test_df) == len(full_convs_test)

    for frame in (train_df, test_df):
        frame.loc[:, 'text_prompt_and_conv'] = frame['text'].apply(insert_prompt)

    return train_df, test_df

def tokenize(batch, dont_truncate_answer=True):
    """Tokenize one example with the notebook-level tokenizer ``tok``.

    Args:
        batch: Mapping with ``label`` plus ``text_prompt_and_conv`` (default mode)
            or ``text`` (when ``dont_truncate_answer`` is False). The list
            concatenation below assumes a single example, not a batch of examples.
        dont_truncate_answer: When True, the prompted text is truncated to 700
            tokens and the answer-marker tokens are appended afterwards, so the
            marker itself is never truncated. When False, the raw ``text`` is
            tokenized with a 1000-token limit and nothing is appended.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    if not dont_truncate_answer:
        plain = tok(batch['text'], padding=False, truncation=True, max_length=1000)
        plain["labels"] = batch["label"]
        return plain

    ANSWER = "الاجابة:"
    indent = " " * 8
    # Blank indented line, then the answer marker followed by a single space.
    answer_suffix = f"\n{indent}\n{indent}{ANSWER} "

    encoded = tok(batch['text_prompt_and_conv'], padding=False, truncation=True, max_length=700)
    suffix = tok(answer_suffix, add_special_tokens=False, padding=False, truncation=True, max_length=1000)

    encoded["labels"] = batch["label"]
    for field in ('input_ids', 'attention_mask'):
        # Plain Python lists are required: "+" below must concatenate them.
        assert isinstance(encoded[field], list) and isinstance(suffix[field], list)
        encoded[field] = encoded[field] + suffix[field]

    return encoded

def tensors_cllate_fn(batch):
    """Collate tokenized examples into padded long tensors.

    Args:
        batch: Sequence of mappings with ``input_ids``, ``attention_mask`` and
            ``labels`` entries.

    Returns:
        Dict with ``input_ids`` (padded with ``tok.pad_token_id``),
        ``attention_mask`` (padded with 0) and ``labels`` (one entry per example).
    """
    def as_long(values):
        return torch.tensor(values, dtype=torch.long)

    ids = [as_long(example["input_ids"]) for example in batch]
    masks = [as_long(example["attention_mask"]) for example in batch]
    targets = [as_long(example["labels"]) for example in batch]

    return {
        "input_ids": pad_sequence(ids, batch_first=True, padding_value=tok.pad_token_id),
        "attention_mask": pad_sequence(masks, batch_first=True, padding_value=0),
        # Stacking adds the leading batch dimension.
        "labels": torch.stack(targets),
    }

train_df, test_df = change_dataset_content_structure()  # this also calls insert_prompt

# Tokenize each row and drop the original columns. Each split removes its OWN
# columns (the test split previously referenced train_df's column list).
train_dataset = Dataset.from_pandas(train_df).map(tokenize, remove_columns=train_df.columns.to_list())
test_dataset = Dataset.from_pandas(test_df).map(tokenize, remove_columns=test_df.columns.to_list())

# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=tensors_cllate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=tensors_cllate_fn)

# Define model class

In [None]:
class FanarClassifier(pl.LightningModule):
    """LightningModule that trains a causal LM with a small head as a classifier.

    The wrapped model is called with ``input_ids`` / ``attention_mask``; the logits
    at each sequence's last attended position are used as class scores and
    trained with a class-weighted cross-entropy loss.

    Args:
        hf_model: Model whose output exposes ``.logits`` indexed as
            ``[batch, position, class]``.
        learning_rate: AdamW learning rate.
        class_weights: Per-class weights for the cross-entropy loss.
    """

    def __init__(self, hf_model, learning_rate=7e-6, class_weights=(0.0001, 0.9)):
        super().__init__()
        self.save_hyperparameters(ignore=['hf_model'])

        self.model = hf_model

        # No hard-coded device here: forward() moves the weight to wherever the
        # logits live, so the module also works on CPU or a non-default GPU.
        self.loss_fn = nn.CrossEntropyLoss(
            weight=torch.as_tensor(class_weights, dtype=torch.float)
        )
        self.learning_rate = learning_rate

        # Per-batch predictions / labels collected between validation epoch ends.
        self.val_all_preds = []
        self.val_all_labels = []

    def forward(self, batch):
        """Run the model on one collated batch.

        Args:
            batch: Mapping with ``input_ids``, ``attention_mask`` and ``labels``.

        Returns:
            Dict with ``logits`` ([B, C]), scalar ``loss`` and ``preds`` ([B]).

        Raises:
            ValueError: If the batch carries no labels.
        """
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch.get("labels")
        if labels is None:
            raise ValueError("FanarClassifier.forward requires batch['labels']")

        output_obj = self.model(input_ids=input_ids, attention_mask=attention_mask)
        all_logits = output_obj.logits

        # Pick the last attended position of every row. Plain `[:, -1, :]` reads a
        # padding position for every sequence shorter than the longest one as soon
        # as batch_size > 1 with right padding. Flipping the mask and taking the
        # first 1 also handles left padding; an all-zero mask falls back to the
        # final position.
        seq_len = attention_mask.size(1)
        last_idx = seq_len - 1 - attention_mask.long().flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(all_logits.size(0), device=all_logits.device)
        logits = all_logits[rows, last_idx.to(all_logits.device)]  # [B, C]

        labels = labels.long().to(logits.device)
        if self.loss_fn.weight.device != logits.device:
            self.loss_fn.weight = self.loss_fn.weight.to(logits.device)
        loss = self.loss_fn(logits, labels)

        preds = logits.argmax(dim=1).long()  # shape [B]
        return {"logits": logits, "loss": loss, "preds": preds}

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        out = self(batch)
        self.log("train_loss", out["loss"], on_step=True, on_epoch=True, prog_bar=True)
        return out["loss"]

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and stash predictions for epoch-level metrics."""
        out = self(batch)

        self.val_all_preds.append(out["preds"])
        self.val_all_labels.append(batch["labels"])

        self.log("val_loss", out["loss"], on_step=False, on_epoch=True, prog_bar=True)
        return out["loss"]

    def on_validation_epoch_end(self):
        """Compute, log and dump metrics over everything seen since the last call."""
        if not self.val_all_preds:
            # torch.cat raises on an empty list (e.g. zero validation batches).
            return

        all_preds = torch.cat(self.val_all_preds)
        all_labels = torch.cat(self.val_all_labels).to(all_preds.device)
        print(f"epoch preds shape: {all_preds.shape}, epoch labels shape: {all_labels.shape}")

        # Convert once for the scikit-learn metrics.
        preds_np = all_preds.cpu().numpy()
        labels_np = all_labels.cpu().numpy()

        epoch_acc = accuracy(task="binary", preds=all_preds, target=all_labels)
        precision = precision_score(labels_np, preds_np, zero_division=0)
        recall = recall_score(labels_np, preds_np, zero_division=0)
        epoch_f1 = f1_score(task="binary", preds=all_preds, target=all_labels)
        # average=None returns one score per class; index 1 is the positive class.
        epoch_f2 = fbeta_score(labels_np, preds_np, beta=2, labels=[0, 1], average=None, zero_division=0)[1]

        self.log("epoch_val_acc", epoch_acc, prog_bar=True)
        self.log("epoch_val_precision", precision, prog_bar=True)
        self.log("epoch_val_recall", recall, prog_bar=True)
        self.log("epoch_val_f1", epoch_f1, prog_bar=True)
        self.log("epoch_val_f2", epoch_f2, prog_bar=True)

        # Files are overwritten on every call, so they hold the latest validation run.
        with open("model_preds_DK.txt", "w+") as f:
            f.write(str(self.val_all_preds))

        with open("true_labels_DK.txt", "w+") as f:
            f.write(str(self.val_all_labels))

        self.val_all_preds.clear()
        self.val_all_labels.clear()

    def configure_optimizers(self):
        """Create AdamW and a linear warmup/decay schedule stepped every optimizer step."""
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        # Ask the Trainer for the real number of optimizer steps (it accounts for
        # max_epochs and gradient accumulation) instead of relying on notebook
        # globals that can disagree with the Trainer's settings.
        total_steps = int(self.trainer.estimated_stepping_batches)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=3,
            num_training_steps=total_steps,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
                "frequency": 1
            }
        }


# Train
**Evaluation metrics are logged to Weights & Biases (wandb).**

In [None]:
# WandbLogger was used below without being imported anywhere in the notebook.
from pytorch_lightning.loggers import WandbLogger

my_model = FanarClassifier(model, learning_rate=2e-5)

exp_name = "Fanar (lr=2e-5) seq_len=700"
wandb_logger = WandbLogger(project="sexual-hurt-pred", name=exp_name,
                           group="decoder", resume=True)

trainer = pl.Trainer(
    max_epochs=6,
    accumulate_grad_batches=8,
    log_every_n_steps=20,
    val_check_interval=0.25,  # validate 4x/epoch; adjust as desired
    precision="16-mixed",
    logger=wandb_logger
)

# NOTE(review): test_loader is passed as the validation loader, so the metrics
# logged during training are computed on the test split.
try:
    trainer.fit(my_model, train_loader, test_loader)
finally:
    # Close the wandb run even when training is interrupted or raises.
    wandb.finish()