In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Load dataset
dataset_path = 'dataset.csv'
df = pd.read_csv(dataset_path)

# Display dataset information
print("DataFrame Head:\n", df.head())
print("Columns:\n", df.columns)
print("Missing Values:\n", df.isnull().sum())

# Ensure 'Content_1' and 'Genre' columns are present
if 'Content_1' not in df.columns or 'Genre' not in df.columns:
    raise ValueError("Required columns 'Content_1' or 'Genre' are missing in the dataframe.")

# Drop rows with missing 'Content_1' or 'Genre'
df = df.dropna(subset=['Content_1', 'Genre'])

# Reset index to ensure proper indexing
df = df.reset_index(drop=True)

# Display the first few rows after cleaning
print("Cleaned DataFrame Head:\n", df.head())

# Encode labels
label_encoder = LabelEncoder()
df['Genre_encoded'] = label_encoder.fit_transform(df['Genre'])

# Display encoded labels
print("Encoded Labels:\n", df['Genre_encoded'].head())

# Split the dataset into train, validation, and test sets
train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

# Check the splits
print(f"Training Set Size: {len(train_df)}")
print(f"Validation Set Size: {len(val_df)}")
print(f"Test Set Size: {len(test_df)}")

# Initialize ParsBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased')

# Custom dataset class for Persian summaries
class PersianMovieGenreDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts.reset_index(drop=True)  # Ensure proper indexing
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = PersianMovieGenreDataset(train_df['Content_1'], train_df['Genre_encoded'], tokenizer)
val_dataset = PersianMovieGenreDataset(val_df['Content_1'], val_df['Genre_encoded'], tokenizer)
test_dataset = PersianMovieGenreDataset(test_df['Content_1'], test_df['Genre_encoded'], tokenizer)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_df['Genre_encoded']), y=train_df['Genre_encoded'])
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Convert class weights to tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Load the ParsBERT model
model = AutoModelForSequenceClassification.from_pretrained('HooshvareLab/bert-fa-base-uncased', num_labels=len(label_encoder.classes_))
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Define custom training arguments with class weights
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Ensure both evaluation and save strategy are the same
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

# Custom Trainer class to handle class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=None  # Add your custom metrics computation here
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate(test_dataset)
print(results)

# Save the model and tokenizer
model.save_pretrained('./parsbert_movie_genre_classifier')
tokenizer.save_pretrained('./parsbert_movie_genre_classifier')


DataFrame Head:
                                                 Link            EN_title  \
0  https://www.imvbox.com/watch-persian-movie-ira...   Local Anaesthetic   
1  https://www.imvbox.com/watch-persian-movie-ira...         Disturbance   
2  https://www.imvbox.com/watch-persian-movie-ira...           Highlight   
3  https://www.imvbox.com/watch-persian-movie-ira...               Gilda   
4  https://www.imvbox.com/watch-persian-movie-ira...  Atmosphere Station   

     PENGLISH_title   PERSIAN_title  \
0  Bi Hessie Mozeie    بی‌حسی موضعی   
1         Ashoftegi        آشفته گی   
2           Haylayt         هایلایت   
3            Geelda           گیلدا   
4  Istgahe Atmosfer  ایستگاه اتمسفر   

                                           Content_1  \
0  جلال‌، دانشجوی سابق رشته فلسفه، متوجه می‌شود خ...   
1  «آشفته‌گی» رئالیستی و اجتماعی نیست. یک فیلم اس...   
2  یک تصادف اتومبیل آدم‌هایی را در تقابل با هم قر...   
3  گیلدا ماجرای زنی به نام «گیلدا» را روایت می کن...   
4  این فیلم

  return bound(*args, **kwds)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/324 [00:00<?, ?it/s]

{'loss': 3.0678, 'grad_norm': 17.206127166748047, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.09}
{'loss': 3.0929, 'grad_norm': 16.972537994384766, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.19}
{'loss': 3.033, 'grad_norm': 15.99864673614502, 'learning_rate': 3e-06, 'epoch': 0.28}
{'loss': 2.8632, 'grad_norm': 16.496543884277344, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.37}
{'loss': 2.8659, 'grad_norm': 20.564659118652344, 'learning_rate': 5e-06, 'epoch': 0.46}
{'loss': 2.7995, 'grad_norm': 18.60843276977539, 'learning_rate': 6e-06, 'epoch': 0.56}
{'loss': 2.8122, 'grad_norm': 28.743831634521484, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.65}
{'loss': 2.8848, 'grad_norm': 18.807851791381836, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.74}
{'loss': 2.7629, 'grad_norm': 30.592498779296875, 'learning_rate': 9e-06, 'epoch': 0.83}
{'loss': 2.626, 'grad_norm': 20.519529342651367, 'learning_rate': 1e-05, 'epoch': 0.93}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 2.678762912750244, 'eval_runtime': 81.0766, 'eval_samples_per_second': 1.332, 'eval_steps_per_second': 0.173, 'epoch': 1.0}


KeyError: "The `metric_for_best_model` training argument is set to 'eval_f1', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch']. Consider changing the `metric_for_best_model` via the TrainingArguments."