In [29]:
import json
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import hazm
from tqdm import tqdm
import torch
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader

# Check GPU Availability

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Text normalization function

In [21]:
normalizer = hazm.Normalizer()
def text_normalization(text):
    text = normalizer.normalize(text)

    return text

# Model and Tokenizer

In [4]:
model_id   = "sbunlp/fabert"
model_name = "FaBERT"
cache_dir  = f"./Model-FaBERT"

In [5]:
tokenizer = BertTokenizer.from_pretrained(
                                            model_id,
                                            cache_dir=cache_dir,
                                        )

# Load the model
model = BertForSequenceClassification.from_pretrained(
                                            model_id,
                                            cache_dir=cache_dir,
                                            # device_map=device,
                                            num_labels=2
                                        )

# model.to(device)  # Move the model to the GPU

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sbunlp/fabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def print_model_parameters(model):
    # Total number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    # Trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

# Example usage
print_model_parameters(model)

Total parameters: 124,442,882
Trainable parameters: 124,442,882


# Data Preparation

In [7]:
# Load JSON data
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [9]:
base_path = "/media/amin/T7 SHIELD/Projects/Persian-Informal-Formal/Datasets/ParsMap"

train_data = load_data(f"{base_path}/ParsMap-train.json")
val_data   = load_data(f"{base_path}/ParsMap-val.json")
test_data  = load_data(f"{base_path}/ParsMap-test.json")

In [10]:
print(f'#of training samples: {len(train_data.items())}')
print('\n')
print(f'#of validation samples: {len(val_data.items())}')
print('\n')
print(f'#of testing samples: {len(test_data.items())}')
print('\n')

#of training samples: 35009


#of validation samples: 7502


#of testing samples: 7503




In [11]:
# Convert data into lists for binary classification
def prepare_data(data):
    inputs, labels = [], []
    for _, value in tqdm(data.items()):
        try:
            informal_text = value['inFormalForm']
            informal_text = text_normalization(informal_text)
            inputs.append(informal_text)
            labels.append(0)  # Add 0 for inFormalForm
        except Exception as e:
            print(f"An error occurred: {e}")


        try:
            formal_text  = value['formalForm']
            formal_text = text_normalization(formal_text)
            inputs.append(formal_text)
            labels.append(1)  # Add 1 for formalForm
        except Exception as e:
            print(f"An error occurred: {e}")

    return inputs, labels

In [12]:
train_texts, train_labels = prepare_data(train_data)
val_texts,   val_labels   = prepare_data(val_data)
test_texts,  test_labels  = prepare_data(test_data)

 45%|████▌     | 15800/35009 [00:02<00:03, 6166.33it/s]

An error occurred: 'float' object has no attribute 'translate'


100%|██████████| 35009/35009 [00:05<00:00, 6045.99it/s]
100%|██████████| 7502/7502 [00:01<00:00, 6045.56it/s]
100%|██████████| 7503/7503 [00:01<00:00, 6113.16it/s]


In [13]:
# Tokenize the data

def tokenize_data(texts, labels):
    tokenized = tokenizer(
        texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
    )
    tokenized["labels"] = torch.tensor(labels)
    return tokenized

In [14]:
train_encodings = tokenize_data(train_texts, train_labels)
val_encodings   = tokenize_data(val_texts, val_labels)
test_encodings  = tokenize_data(test_texts, test_labels)

In [15]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

# Prepare the Dataset

In [16]:
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

In [22]:
train_dataset = TextDataset(train_encodings)
val_dataset   = TextDataset(val_encodings)

# Trainer Setup

In [18]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
)



In [19]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [20]:
# Train the model
trainer.train()

  0%|          | 0/6567 [00:00<?, ?it/s]

{'loss': 0.0763, 'grad_norm': 0.01840325817465782, 'learning_rate': 4.6193086645347957e-05, 'epoch': 0.23}
{'loss': 0.0359, 'grad_norm': 0.12912239134311676, 'learning_rate': 4.238617329069591e-05, 'epoch': 0.46}
{'loss': 0.0357, 'grad_norm': 4.717024803161621, 'learning_rate': 3.857925993604386e-05, 'epoch': 0.69}
{'loss': 0.0233, 'grad_norm': 17.164199829101562, 'learning_rate': 3.4772346581391806e-05, 'epoch': 0.91}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.022439230233430862, 'eval_runtime': 27.9329, 'eval_samples_per_second': 537.144, 'eval_steps_per_second': 16.79, 'epoch': 1.0}
{'loss': 0.0188, 'grad_norm': 0.044183868914842606, 'learning_rate': 3.096543322673976e-05, 'epoch': 1.14}
{'loss': 0.0094, 'grad_norm': 0.005505237728357315, 'learning_rate': 2.715851987208771e-05, 'epoch': 1.37}
{'loss': 0.008, 'grad_norm': 0.0010621192632243037, 'learning_rate': 2.3351606517435665e-05, 'epoch': 1.6}
{'loss': 0.0132, 'grad_norm': 0.002697585616260767, 'learning_rate': 1.954469316278362e-05, 'epoch': 1.83}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.021635230630636215, 'eval_runtime': 27.9863, 'eval_samples_per_second': 536.119, 'eval_steps_per_second': 16.758, 'epoch': 2.0}
{'loss': 0.0102, 'grad_norm': 0.048293791711330414, 'learning_rate': 1.5737779808131566e-05, 'epoch': 2.06}
{'loss': 0.0032, 'grad_norm': 0.0008392877643927932, 'learning_rate': 1.1930866453479519e-05, 'epoch': 2.28}
{'loss': 0.0026, 'grad_norm': 0.04795552045106888, 'learning_rate': 8.123953098827471e-06, 'epoch': 2.51}
{'loss': 0.0033, 'grad_norm': 0.0014071812620386481, 'learning_rate': 4.317039744175423e-06, 'epoch': 2.74}
{'loss': 0.0038, 'grad_norm': 0.0016569598810747266, 'learning_rate': 5.101263895233745e-07, 'epoch': 2.97}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.022546423599123955, 'eval_runtime': 27.9352, 'eval_samples_per_second': 537.1, 'eval_steps_per_second': 16.789, 'epoch': 3.0}
{'train_runtime': 2036.2132, 'train_samples_per_second': 103.158, 'train_steps_per_second': 3.225, 'train_loss': 0.018555654765215795, 'epoch': 3.0}


TrainOutput(global_step=6567, training_loss=0.018555654765215795, metrics={'train_runtime': 2036.2132, 'train_samples_per_second': 103.158, 'train_steps_per_second': 3.225, 'total_flos': 1.0794285212778e+16, 'train_loss': 0.018555654765215795, 'epoch': 3.0})

# Evaluation

In [23]:
# Create a dataset for testing
test_dataset = TextDataset(test_encodings)

In [24]:
# Predict
predictions = trainer.predict(test_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)

  0%|          | 0/469 [00:00<?, ?it/s]

In [26]:
# Generate classification report
print(classification_report(test_labels, preds.numpy(), target_names=["Mismatch", "Match"]))

              precision    recall  f1-score   support

    Mismatch       1.00      0.99      1.00      7503
       Match       0.99      1.00      1.00      7503

    accuracy                           1.00     15006
   macro avg       1.00      1.00      1.00     15006
weighted avg       1.00      1.00      1.00     15006



# Save the Model

In [27]:
model.save_pretrained("./fine_tuned_fabert")
tokenizer.save_pretrained("./fine_tuned_fabert")

('./fine_tuned_fabert/tokenizer_config.json',
 './fine_tuned_fabert/special_tokens_map.json',
 './fine_tuned_fabert/vocab.txt',
 './fine_tuned_fabert/added_tokens.json')

# Manual Training Loop

In [None]:
# Create DataLoader for training
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_loader:
        batch = {key: val.to(device) for key, val in batch.items()}  # Move batch to GPU
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            batch = {key: val.to(device) for key, val in batch.items()}  # Move batch to GPU
            outputs = model(**batch)

# Manual Evaluation Loop

In [31]:
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader):
        batch = {key: val.to(device) for key, val in batch.items()}  # Move batch to GPU
        outputs = model(**batch)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(batch["labels"].cpu().numpy())

# Calculate metrics
print(classification_report(true_labels, predictions, target_names=["Mismatch", "Match"]))

100%|██████████| 938/938 [00:29<00:00, 31.97it/s]

              precision    recall  f1-score   support

    Mismatch       1.00      0.99      1.00      7503
       Match       0.99      1.00      1.00      7503

    accuracy                           1.00     15006
   macro avg       1.00      1.00      1.00     15006
weighted avg       1.00      1.00      1.00     15006




