In [9]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import numpy as np
import os

df = pd.read_csv('dataset.csv')

# Handle class imbalance
df_majority = df[df.Genre == 'Drama']
df_minority = df[df.Genre != 'Drama']
df_majority_downsampled = resample(df_majority, replace=False, n_samples=200, random_state=42)
df_balanced = pd.concat([df_majority_downsampled, df_minority])
genre_counts = df_balanced['Genre'].value_counts()
df_filtered = df_balanced[df_balanced['Genre'].isin(genre_counts[genre_counts >= 10].index)]

class MovieDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased')
model = BertForSequenceClassification.from_pretrained('HooshvareLab/bert-fa-base-uncased', num_labels=len(genre_counts[genre_counts >= 10]))

# Encode labels
label_encoder = LabelEncoder()
df_filtered['Genre'] = label_encoder.fit_transform(df_filtered['Genre'])

# Define cross-validation
skf = StratifiedKFold(n_splits=5)

def train_and_evaluate(train_index, test_index, fold_num, device):
    train_df = df_filtered.iloc[train_index]
    test_df = df_filtered.iloc[test_index]

    train_dataset = MovieDataset(
        texts=train_df['Content_1'].tolist(),
        labels=train_df['Genre'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    test_dataset = MovieDataset(
        texts=test_df['Content_1'].tolist(),
        labels=test_df['Genre'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/fold_{fold_num}',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/fold_{fold_num}',
        logging_steps=10,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=5e-5,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to='none',
        no_cuda=False if torch.cuda.is_available() else True
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    results = trainer.evaluate(test_dataset)

    # Predict on the test set
    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = predictions.argmax(axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    # Calculate F1 score
    f1 = f1_score(labels, predictions, average='macro')

    print(f'Fold {fold_num} Accuracy: {accuracy:.4f}')
    print(f'Fold {fold_num} F1 Score: {f1:.4f}')
    print(classification_report(labels, predictions, target_names=label_encoder.classes_))

    return accuracy, f1

# Cross-validation loop
accuracies = []
f1_scores = []

# Assuming 4 GPUs
num_gpus = 4

for fold_num, (train_index, test_index) in enumerate(skf.split(df_filtered, df_filtered['Genre'])):
    device = f'cuda:{fold_num % num_gpus}' if torch.cuda.is_available() else 'cpu'
    os.environ["CUDA_VISIBLE_DEVICES"] = str(fold_num % num_gpus)
    
    accuracy, f1 = train_and_evaluate(train_index, test_index, fold_num, device)
    accuracies.append(accuracy)
    f1_scores.append(f1)

# Print overall results
print(f'Overall Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')
print(f'Overall F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Genre'] = label_encoder.fit_transform(df_filtered['Genre'])


  0%|          | 0/210 [00:00<?, ?it/s]

{'loss': 2.503, 'grad_norm': 8.951262474060059, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.24}
{'loss': 2.4079, 'grad_norm': 10.533915519714355, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.48}
{'loss': 2.3248, 'grad_norm': 8.415565490722656, 'learning_rate': 3e-06, 'epoch': 0.71}
{'loss': 2.1792, 'grad_norm': 8.105627059936523, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.95}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 2.1379523277282715, 'eval_runtime': 49.0489, 'eval_samples_per_second': 3.425, 'eval_steps_per_second': 0.224, 'epoch': 1.0}
{'loss': 2.0616, 'grad_norm': 7.097240447998047, 'learning_rate': 5e-06, 'epoch': 1.19}
{'loss': 2.0096, 'grad_norm': 5.586761951446533, 'learning_rate': 6e-06, 'epoch': 1.43}
{'loss': 1.9822, 'grad_norm': 6.465604305267334, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.67}
{'loss': 2.0788, 'grad_norm': 7.430798053741455, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.9}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 2.022489309310913, 'eval_runtime': 58.7619, 'eval_samples_per_second': 2.859, 'eval_steps_per_second': 0.187, 'epoch': 2.0}
{'loss': 1.9455, 'grad_norm': 6.477018356323242, 'learning_rate': 9e-06, 'epoch': 2.14}
{'loss': 1.7996, 'grad_norm': 6.047166347503662, 'learning_rate': 1e-05, 'epoch': 2.38}
{'loss': 1.7935, 'grad_norm': 6.790184020996094, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.62}
{'loss': 1.9439, 'grad_norm': 6.460014820098877, 'learning_rate': 1.2e-05, 'epoch': 2.86}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 2.0216903686523438, 'eval_runtime': 44.4688, 'eval_samples_per_second': 3.778, 'eval_steps_per_second': 0.247, 'epoch': 3.0}
{'loss': 1.8781, 'grad_norm': 6.207584857940674, 'learning_rate': 1.3000000000000001e-05, 'epoch': 3.1}
{'loss': 1.7274, 'grad_norm': 6.524002552032471, 'learning_rate': 1.4000000000000001e-05, 'epoch': 3.33}
{'loss': 1.8658, 'grad_norm': 8.543453216552734, 'learning_rate': 1.5e-05, 'epoch': 3.57}
{'loss': 1.7447, 'grad_norm': 7.734450340270996, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.81}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 2.0248477458953857, 'eval_runtime': 50.0144, 'eval_samples_per_second': 3.359, 'eval_steps_per_second': 0.22, 'epoch': 4.0}
{'loss': 1.6684, 'grad_norm': 10.86490249633789, 'learning_rate': 1.7000000000000003e-05, 'epoch': 4.05}
{'loss': 1.5267, 'grad_norm': 8.079270362854004, 'learning_rate': 1.8e-05, 'epoch': 4.29}
{'loss': 1.5364, 'grad_norm': 8.31192684173584, 'learning_rate': 1.9e-05, 'epoch': 4.52}
{'loss': 1.5886, 'grad_norm': 12.952197074890137, 'learning_rate': 2e-05, 'epoch': 4.76}
{'loss': 1.5557, 'grad_norm': 10.949078559875488, 'learning_rate': 2.1e-05, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 1.9958999156951904, 'eval_runtime': 39.7383, 'eval_samples_per_second': 4.228, 'eval_steps_per_second': 0.277, 'epoch': 5.0}
{'train_runtime': 2970.3065, 'train_samples_per_second': 1.13, 'train_steps_per_second': 0.071, 'train_loss': 1.9105500947861445, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Fold 0 Accuracy: 0.3214
Fold 0 F1 Score: 0.0876
                          precision    recall  f1-score   support

                  Action       0.22      0.13      0.17        15
               Adventure       0.00      0.00      0.00        10
       Arts & Literature       0.00      0.00      0.00         7
                  Comedy       0.35      0.74      0.47        53
                   Crime       1.00      0.07      0.13        14
    Culture & Traditions       0.00      0.00      0.00         5
                   Drama       0.26      0.30      0.28        40
                  Family       0.00      0.00      0.00         5
                 History       0.00      0.00      0.00         3
Human Interest & Society       0.00      0.00      0.00         9
                 Romance       0.00      0.00      0.00         3
                     War       0.00      0.00      0.00         4

                accuracy                           0.32       168
               macro avg  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  0%|          | 0/210 [00:00<?, ?it/s]

{'loss': 1.5845, 'grad_norm': 7.738523006439209, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.24}
{'loss': 1.5958, 'grad_norm': 9.2791109085083, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.48}
{'loss': 1.4725, 'grad_norm': 11.066917419433594, 'learning_rate': 3e-06, 'epoch': 0.71}
{'loss': 1.3508, 'grad_norm': 6.8538055419921875, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.95}


  0%|          | 0/11 [00:00<?, ?it/s]

KeyboardInterrupt: 