<a href="https://colab.research.google.com/github/DanielHevdeli/hafifot-tiug/blob/main/BERT_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/DanielHevdeli/hafifot-tiug.git

Cloning into 'hafifot-tiug'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 85 (delta 34), reused 33 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (85/85), 7.59 MiB | 8.14 MiB/s, done.
Resolving deltas: 100% (34/34), done.


In [2]:
import pandas as pd

In [3]:
posts_df = pd.read_csv("./hafifot-tiug/data/split_data/present.csv")

In [4]:
print(f"Num of posts: {len(posts_df)}")

Num of posts: 4000


In [5]:
posts_df.head(2)

Unnamed: 0,question_id,length,date,text
0,114678,903,2016-04-10 20:05:00,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י..."
1,116349,888,2016-04-24 19:56:00,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...


In [6]:
labels_df = pd.read_csv("./hafifot-tiug/data/labels/present/dummy-gpt-4_labels.csv")

In [7]:
print(f"Num of labels: {len(labels_df)}")

Num of labels: 4000


In [8]:
labels_df.head(2)

Unnamed: 0,question_id,label
0,114678,suicidal
1,116349,non-suicidal


In [9]:
merged_df = pd.merge(posts_df[['question_id', 'text']], labels_df, on='question_id')
posts_with_labels_df = merged_df[['question_id', 'text', 'label']]
display(posts_with_labels_df.head())

Unnamed: 0,question_id,text,label
0,114678,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י...",suicidal
1,116349,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...,non-suicidal
2,108590,הגעתי לאתר הזה במקרה אחרי שקראתי טיפה מפוסטים ...,non-suicidal
3,100709,"בערך מגיל 17 אני סובלת מחרדה, כאשר חלה החרפה ק...",suicidal
4,120766,"אני תוהה כבר כמה שנים מה ללמוד בעתיד, איזה מקצ...",non-suicidal


In [10]:
print(f"posts_with_labels_df len: {len(posts_with_labels_df)}")

posts_with_labels_df len: 4000


# Create a BERT Classifier

In [11]:
import torch
from torch import nn
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from torchsummary import summary
from datasets import Dataset, DatasetDict
import numpy as np
import matplotlib.pyplot as plt

In [12]:
label2id = {"non-suicidal": 0, "suicidal": 1}
id2label = {0: "non-suicidal", 1: "suicidal"}
df = posts_with_labels_df.copy()
df["labels"] = df["label"].map(label2id)


In [13]:
df.head(2)

Unnamed: 0,question_id,text,label,labels
0,114678,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י...",suicidal,1
1,116349,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...,non-suicidal,0


In [14]:
TEST_SIZE = 0.1

In [15]:
train_df, test_df = train_test_split(df, test_size=TEST_SIZE, stratify=df["labels"], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=TEST_SIZE, stratify=train_df["labels"], random_state=42)

In [16]:
train_ds = Dataset.from_pandas(train_df[["text", "labels"]])
val_ds   = Dataset.from_pandas(val_df[["text", "labels"]])
test_ds  = Dataset.from_pandas(test_df[["text", "labels"]])

ds = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

In [29]:
MODEL_NAME = "HeNLP/LongHeRo" # "onlplab/alephbert-base"
model_short_name = MODEL_NAME.split("/")[-1]
MODEL_CONTEXT_LEN = 4096
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [18]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MODEL_CONTEXT_LEN)

ds = ds.map(tokenize, batched=True)

Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [19]:
ds = ds.remove_columns(["text"])
ds.set_format(type="torch")

In [20]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3240
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 360
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [23]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    # precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    #     labels, preds, average=None, zero_division=0
    # )
    # min accuracy
    correct_per_class = []
    for cls in [0, 1]:
        mask = (labels == cls)
        class_acc = (preds[mask] == cls).sum() / mask.sum()
        correct_per_class.append(class_acc)
    min_acc = min(correct_per_class)

    return {
        "accuracy": acc,
        "min_accuracy": min_acc,
    }

In [24]:
BATCH_SIZE = 16
EPOCHS = 10

In [25]:
training_args = TrainingArguments(
    report_to=[],  # I dont want to save my expirements to W&B or tensorboard
    output_dir=f"./hafifot-tiug/models/{model_short_name}",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="min_accuracy",
    greater_is_better=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True
)

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [28]:
trainer.train()

RuntimeError: The size of tensor a (535) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
history = trainer.state.log_history

train_logs = [h for h in trainer.state.log_history if "loss" in h and "eval_loss" not in h]
val_logs   = [h for h in trainer.state.log_history if "eval_loss" in h]

In [None]:
def extract(logs, keys):
    data = {k: [] for k in keys}
    for h in logs:
        for k in keys:
            data[k].append(h.get(k, None))
    return data

train_loss = extract(train_logs, ["loss"])
val_metrics = extract(val_logs, ["eval_loss", "eval_accuracy", "eval_min_accuracy"])


In [None]:
epochs = range(1, len(train_loss["loss"]) + 1)

plt.figure(figsize=(6,4))
plt.plot(epochs, train_loss["loss"], label="Train Loss")
plt.plot(epochs, val_metrics["eval_loss"], label="Validation Loss")
plt.xlabel("Epoch")
plt.xticks(epochs)
plt.ylabel("Loss")
plt.title("Train vs Validation Loss")
plt.legend()
plt.show()

In [None]:
metric_names = ["eval_accuracy", "eval_min_accuracy"]
num_metrics = len(metric_names)

# create a single row of subplots
fig, axes = plt.subplots(1, num_metrics, figsize=(6*num_metrics, 4))  # width scales with number of metrics

for i, metric in enumerate(metric_names):
    ax = axes[i] if num_metrics > 1 else axes
    ax.plot(epochs, val_metrics[metric], label=f"Validation {metric}")
    ax.set_xlabel("Epoch")
    ax.set_xticks(epochs)
    ax.set_ylabel(metric)
    ax.set_title(f"{metric} per Epoch")
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
def print_classification_report(texts, title):
    pred = trainer.predict(texts)
    logits = pred.predictions
    labels = pred.label_ids
    preds = logits.argmax(-1)

    print(f"\n====== CLASSIFICATION REPORT ({title}) ======\n")
    print(classification_report(labels, preds, target_names=list(id2label.values())))

In [None]:
print_classification_report(ds["train"], title="Train")

In [None]:
print_classification_report(ds["validation"], title="Validation")

In [None]:
def predict(texts):
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
      logits = model(**enc).logits
    preds = logits.argmax(dim=-1).cpu().numpy()
    return [id2label[int(p)] for p in preds]

In [None]:
print(predict([
    "אני מרגישה ממש רע היום",
    "איזה יום טוב היה לי היום"
]))