In [4]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset

# Load the dataset and drop rows that cannot be used for supervised training.
# Rows without a `type` are dropped as well as rows without text: the genre
# column is fed to LabelEncoder below, which cannot encode a column that mixes
# strings with missing values.
data = pd.read_csv('filtered_data_.csv')
# .copy() makes the filtered frame independent before a column is added to it.
data = data.dropna(subset=['review_text', 'type']).copy()

# Encode the target labels (categories/genres) as integer ids
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['type'])

# Check the mapping from original categories to label ids.
# A class's position in classes_ is its encoded id, so the mapping can be read
# off directly instead of running a second transform() pass.
label_mapping = {genre: label_id for label_id, genre in enumerate(label_encoder.classes_)}
print(label_mapping)

# Sample a smaller subset of the data
data_sampled = data.sample(frac=0.1, random_state=42)  # Use 10% of the data

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the reviews
def tokenize_function(text):
    """Encode `text` with the module-level `tokenizer`.

    The call requests truncation, padding with the 'max_length' strategy, and
    PyTorch tensors ('pt') as the return type.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **encoding_options)

# Dataset wrapper consumed by the Trainer: tokenizes each review on access
# and returns its label as a torch.long tensor
class ReviewsDataset(Dataset):
    """Map-style dataset pairing review texts with their integer labels.

    Reviews are tokenized when an item is requested rather than up front.
    Both `reviews` and `labels` are read positionally through `.iloc`.
    """

    def __init__(self, reviews, labels):
        self.reviews, self.labels = reviews, labels

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the model inputs and label for the review at position `idx`."""
        encoded = tokenize_function(self.reviews.iloc[idx])

        # squeeze(0) removes the leading axis of the tokenizer output so each
        # item is returned without it.
        token_ids = encoded['input_ids'].squeeze(0)
        mask = encoded['attention_mask'].squeeze(0)

        # The label is wrapped as a torch.long tensor.
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return {'input_ids': token_ids, 'attention_mask': mask, 'labels': target}
    
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data_sampled['review_text'],
    data_sampled['label'],
    test_size=0.2,
    random_state=42,
)

# Create the training and testing datasets using the custom Dataset class
train_dataset = ReviewsDataset(X_train, y_train)
test_dataset = ReviewsDataset(X_test, y_test)

# Load pre-trained BERT model with a classification head.
# The head is sized from the encoder's full class list rather than from the
# labels that happen to appear in the sample: label ids were assigned by the
# encoder, so if the sample misses a genre, counting unique sampled labels
# would produce a head too small for the largest label id.
num_labels = len(label_encoder.classes_)  # Number of genres known to the encoder
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    evaluation_strategy='epoch',      # evaluate during training
    per_device_train_batch_size=8,    # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    num_train_epochs=3,               # number of training epochs
    weight_decay=0.01,                # strength of weight decay
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object that unpacks into ``(logits, labels)``. The logits are
            reduced with an argmax over axis 1 to obtain predicted class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    logits, labels = pred

    # as_tensor accepts arrays and tensors alike without the extra copy that
    # torch.tensor() makes; the result is converted to a NumPy array so
    # scikit-learn receives a plain ndarray.
    predictions = torch.as_tensor(logits).detach().argmax(dim=1).cpu().numpy()

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the
    # value the default setting also produces) without emitting a warning on
    # every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Wire the model, its training arguments, both datasets and the metric
# function into the Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Fine-tune the model, then run one more evaluation pass on the test dataset
trainer.train()
results = trainer.evaluate()

print(f"Evaluation results: {results}")

{'children': 0, 'comics_graphics': 1, 'fantasy_paranormal': 2, 'history_biography': 3, 'mystery_thriller_crime': 4, 'poetry': 5, 'romance': 6, 'young_adult': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.1424968242645264, 'eval_accuracy': 0.58875, 'eval_f1': 0.5750773140505129, 'eval_precision': 0.6081498281139622, 'eval_recall': 0.58875, 'eval_runtime': 1559.9385, 'eval_samples_per_second': 0.513, 'eval_steps_per_second': 0.032, 'epoch': 1.0}
{'loss': 1.3574, 'grad_norm': 20.08745574951172, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.128549337387085, 'eval_accuracy': 0.61875, 'eval_f1': 0.626794372425924, 'eval_precision': 0.6528190632928826, 'eval_recall': 0.61875, 'eval_runtime': 1404.6505, 'eval_samples_per_second': 0.57, 'eval_steps_per_second': 0.036, 'epoch': 2.0}
{'loss': 0.7288, 'grad_norm': 23.136972427368164, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.2547473907470703, 'eval_accuracy': 0.60625, 'eval_f1': 0.609941430014343, 'eval_precision': 0.6159098158657481, 'eval_recall': 0.60625, 'eval_runtime': 1433.6207, 'eval_samples_per_second': 0.558, 'eval_steps_per_second': 0.035, 'epoch': 3.0}
{'train_runtime': 62429.4307, 'train_samples_per_second': 0.154, 'train_steps_per_second': 0.019, 'train_loss': 0.9554873402913412, 'epoch': 3.0}


  0%|          | 0/50 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 1.2547473907470703, 'eval_accuracy': 0.60625, 'eval_f1': 0.609941430014343, 'eval_precision': 0.6159098158657481, 'eval_recall': 0.60625, 'eval_runtime': 1454.0888, 'eval_samples_per_second': 0.55, 'eval_steps_per_second': 0.034, 'epoch': 3.0}


In [6]:
import pandas as pd

# Pull the four headline metrics out of the evaluation results; the results
# dict stores each of them under an 'eval_' prefix.
metrics = ['accuracy', 'f1', 'precision', 'recall']
scores = []
for metric_name in metrics:
    scores.append(results[f'eval_{metric_name}'])

# Arrange metric names and scores side by side as a two-column table
metrics_df = pd.DataFrame(list(zip(metrics, scores)), columns=["Metric", "Score"])

# Display the table
print(metrics_df)


      Metric     Score
0   accuracy  0.606250
1         f1  0.609941
2  precision  0.615910
3     recall  0.606250


In [7]:
# Example of new reviews
new_reviews = ["This book was full of fantasy and magic.",
               "The story was thrilling and action-packed."]

# Tokenize the new reviews and move the encoded batch to the model's device.
# The model may have been placed on an accelerator during training, and a
# forward pass fails if inputs and weights live on different devices.
new_inputs = tokenizer(new_reviews, padding='max_length', truncation=True, return_tensors='pt')
new_inputs = new_inputs.to(model.device)

# Predict the genre.
# eval() switches off dropout so repeated runs give the same prediction, and
# no_grad() skips autograd bookkeeping that inference does not need.
model.eval()
with torch.no_grad():
    outputs = model(**new_inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# Convert predictions back to genre labels; the ids are handed to
# scikit-learn as a CPU NumPy array because it cannot read accelerator tensors.
predicted_genres = label_encoder.inverse_transform(predictions.cpu().numpy())

# Display the predicted genres
for review, genre in zip(new_reviews, predicted_genres):
    print(f"Review: {review} -> Predicted Genre: {genre}")

Review: This book was full of fantasy and magic. -> Predicted Genre: fantasy_paranormal
Review: The story was thrilling and action-packed. -> Predicted Genre: comics_graphics
