<a href="https://colab.research.google.com/github/DanielHevdeli/hafifot-tiug/blob/main/BERT_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/DanielHevdeli/hafifot-tiug.git

Cloning into 'hafifot-tiug'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 82 (delta 32), reused 33 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 7.58 MiB | 5.25 MiB/s, done.
Resolving deltas: 100% (32/32), done.


In [2]:
import pandas as pd

In [3]:
posts_df = pd.read_csv("./hafifot-tiug/data/split_data/present.csv")

In [4]:
print(f"Num of posts: {len(posts_df)}")

Num of posts: 4000


In [5]:
posts_df.head(2)

Unnamed: 0,question_id,length,date,text
0,114678,903,2016-04-10 20:05:00,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י..."
1,116349,888,2016-04-24 19:56:00,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...


In [6]:
labels_df = pd.read_csv("./hafifot-tiug/data/labels/present/dummy-gpt-4_labels.csv")

In [7]:
print(f"Num of labels: {len(labels_df)}")

Num of labels: 4000


In [8]:
labels_df.head(2)

Unnamed: 0,question_id,label
0,114678,suicidal
1,116349,non-suicidal


In [9]:
merged_df = pd.merge(posts_df[['question_id', 'text']], labels_df, on='question_id')
posts_with_labels_df = merged_df[['question_id', 'text', 'label']]
display(posts_with_labels_df.head())

Unnamed: 0,question_id,text,label
0,114678,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י...",suicidal
1,116349,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...,non-suicidal
2,108590,הגעתי לאתר הזה במקרה אחרי שקראתי טיפה מפוסטים ...,non-suicidal
3,100709,"בערך מגיל 17 אני סובלת מחרדה, כאשר חלה החרפה ק...",suicidal
4,120766,"אני תוהה כבר כמה שנים מה ללמוד בעתיד, איזה מקצ...",non-suicidal


In [10]:
print(f"posts_with_labels_df len: {len(posts_with_labels_df)}")

posts_with_labels_df len: 4000


# Create a BERT Classifier

In [11]:
import torch
from torch import nn
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from torchsummary import summary
from datasets import Dataset, DatasetDict
import numpy as np
import matplotlib.pyplot as plt

In [12]:
label2id = {"non-suicidal": 0, "suicidal": 1}
id2label = {0: "non-suicidal", 1: "suicidal"}
df = posts_with_labels_df.copy()
df["labels"] = df["label"].map(label2id)


In [13]:
df.head(2)

Unnamed: 0,question_id,text,label,labels
0,114678,"שלום , אני מאוד מקווה שתוכלו לעזור לי אני לא י...",suicidal,1
1,116349,היי כולם \nיש לי בעיה הקשורה לתספורת שלי. \nאמ...,non-suicidal,0


In [15]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["labels"], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df["labels"], random_state=42)

In [16]:
train_ds = Dataset.from_pandas(train_df[["text", "labels"]])
val_ds   = Dataset.from_pandas(val_df[["text", "labels"]])
test_ds  = Dataset.from_pandas(test_df[["text", "labels"]])

ds = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

In [17]:
MODEL_NAME = "onlplab/alephbert-base"
MAX_LEN = 256
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [18]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

ds = ds.map(tokenize, batched=True)

Map:   0%|          | 0/2560 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [19]:
ds = ds.remove_columns(["text"])
ds.set_format(type="torch")

In [20]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2560
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 640
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
})

In [21]:
NUM_LABELS = 2
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# summary(model=model, )

In [45]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    # precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    #     labels, preds, average=None, zero_division=0
    # )
    # min accuracy
    correct_per_class = []
    for cls in [0, 1]:
        mask = (labels == cls)
        class_acc = (preds[mask] == cls).sum() / mask.sum()
        correct_per_class.append(class_acc)
    min_acc = min(correct_per_class)

    return {
        "accuracy": acc,
        "min_accuracy": min_acc,
    }

In [46]:
BATCH_SIZE = 16

In [47]:
training_args = TrainingArguments(
    report_to=[],  # I dont want to save my expirements to W&B or tensorboard
    output_dir="./hafifot-tiug/models/alephbert-base",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="min_accuracy",
    greater_is_better=True,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True
)

In [48]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [50]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Min Accuracy
1,0.1575,2.791598,0.495312,0.352201
2,0.0506,2.966312,0.485938,0.305031
3,0.0279,3.317528,0.485938,0.478261


TrainOutput(global_step=480, training_loss=0.0786415696144104, metrics={'train_runtime': 152.9395, 'train_samples_per_second': 50.216, 'train_steps_per_second': 3.138, 'total_flos': 1010346452582400.0, 'train_loss': 0.0786415696144104, 'epoch': 3.0})

In [53]:
trainer.state.log_history

[{'loss': 0.1575,
  'grad_norm': 60.087913513183594,
  'learning_rate': 3.34375e-05,
  'epoch': 1.0,
  'step': 160},
 {'eval_loss': 2.791598081588745,
  'eval_accuracy': 0.4953125,
  'eval_min_accuracy': 0.3522012578616352,
  'eval_runtime': 2.2587,
  'eval_samples_per_second': 283.354,
  'eval_steps_per_second': 17.71,
  'epoch': 1.0,
  'step': 160},
 {'loss': 0.0506,
  'grad_norm': 0.11308504641056061,
  'learning_rate': 1.6770833333333332e-05,
  'epoch': 2.0,
  'step': 320},
 {'eval_loss': 2.9663116931915283,
  'eval_accuracy': 0.4859375,
  'eval_min_accuracy': 0.3050314465408805,
  'eval_runtime': 2.2388,
  'eval_samples_per_second': 285.865,
  'eval_steps_per_second': 17.867,
  'epoch': 2.0,
  'step': 320},
 {'loss': 0.0279,
  'grad_norm': 0.011397858150303364,
  'learning_rate': 1.0416666666666667e-07,
  'epoch': 3.0,
  'step': 480},
 {'eval_loss': 3.317528486251831,
  'eval_accuracy': 0.4859375,
  'eval_min_accuracy': 0.4782608695652174,
  'eval_runtime': 2.2797,
  'eval_samples

In [None]:
history = trainer.state.log_history

# separate train and eval metrics
train_metrics = [h for h in history if "loss" in h and "epoch" in h and "eval_loss" not in h]
eval_metrics  = [h for h in history if "eval_loss" in h]

def extract_metrics(metrics_list):
    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": [], "min_accuracy": [], "loss":[]}
    for h in metrics_list:
        for key in metrics.keys():
            if key in h:
                metrics[key].append(h[key])
            elif key == "loss" and "loss" in h:
                metrics[key].append(h["loss"])
            else:
                metrics[key].append(None)
    return metrics

train_plot = extract_metrics(train_metrics)
val_plot   = extract_metrics(eval_metrics)

In [None]:
plt.figure(figsize=(16,12))

all_names = ["loss", "accuracy", "precision", "recall", "f1", "min_accuracy"]

for i, m in enumerate(all_names, 1):
    plt.subplot(3,2,i)
    plt.plot(train_plot[m], label=f"train {m}")
    plt.plot(val_plot[m], label=f"val {m}")
    plt.xlabel("Epoch")
    plt.ylabel(m)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
def predict(texts):
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(model.device)
    logits = model(**enc).logits
    preds = logits.argmax(dim=-1).cpu().numpy()
    return [id2label[int(p)] for p in preds]

print(predict([
    "אני מרגישה ממש רע היום",
    "איזה יום טוב היה לי היום"
]))

In [None]:
pred = trainer.predict(ds["test"])
logits = pred.predictions
labels = pred.label_ids
preds = logits.argmax(-1)

print("\n====== CLASSIFICATION REPORT (TEST) ======\n")
print(classification_report(labels, preds, target_names=list(id2label.values())))