In [1]:
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,confusion_matrix
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
# Load the dataset
data = pd.read_csv('filtered_data_equalRating_dist.csv')
data = data.dropna(subset=['review_text'])

# Define genres and sample 150 reviews per genre for balance
dfs = []
generos = ['children','comics_graphics','fantasy_paranormal','history_biography','mystery_thriller_crime','poetry','romance','young_adult']

for genero in generos:
    dfs.append(data[data['type'] == genero].sample(n=150, random_state=1))
df_sampled = pd.concat(dfs).reset_index(drop=True)

# Encode the target labels (categories/genres)
label_encoder = LabelEncoder()
df_sampled['label'] = label_encoder.fit_transform(df_sampled['type'])

# Check the mapping from labels to original categories
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function for reviews
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')

# Dataset class for BERT inputs
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        tokenized_review = tokenize_function(self.reviews.iloc[idx])
        
        input_ids = tokenized_review['input_ids'].squeeze(0)
        attention_mask = tokenized_review['attention_mask'].squeeze(0)

        label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }
    
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_sampled['review_text'], df_sampled['label'], test_size=0.2, random_state=42)

# Create the training and testing datasets using the custom Dataset class
train_dataset = ReviewsDataset(X_train, y_train)
test_dataset = ReviewsDataset(X_test, y_test)

# Load pre-trained BERT model with a classification head
num_labels = len(df_sampled['label'].unique())  
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

{'children': 0, 'comics_graphics': 1, 'fantasy_paranormal': 2, 'history_biography': 3, 'mystery_thriller_crime': 4, 'poetry': 5, 'romance': 6, 'young_adult': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    evaluation_strategy='epoch',      # evaluate during training
    per_device_train_batch_size=8,    # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    num_train_epochs=3,               # number of training epochs
    weight_decay=0.01,                # strength of weight decay
)

# Metric computation function
def compute_metrics(pred):
    logits, labels = pred
    logits = torch.tensor(logits)  
    predictions = torch.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics = compute_metrics
)

# Train the model
trainer.train()

results = trainer.evaluate()
print(f"Evaluation results: {results}")

# Specify a directory to save the model
model_save_path = './saved_model'

# Save the model
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

{'children': 0, 'comics_graphics': 1, 'fantasy_paranormal': 2, 'history_biography': 3, 'mystery_thriller_crime': 4, 'poetry': 5, 'romance': 6, 'young_adult': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/360 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.794196605682373, 'eval_accuracy': 0.36666666666666664, 'eval_f1': 0.3348565141806734, 'eval_precision': 0.48654493376297897, 'eval_recall': 0.36666666666666664, 'eval_runtime': 999.6598, 'eval_samples_per_second': 0.24, 'eval_steps_per_second': 0.015, 'epoch': 1.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 1.4939755201339722, 'eval_accuracy': 0.4708333333333333, 'eval_f1': 0.4663887206782616, 'eval_precision': 0.48587698412698416, 'eval_recall': 0.4708333333333333, 'eval_runtime': 774.8525, 'eval_samples_per_second': 0.31, 'eval_steps_per_second': 0.019, 'epoch': 2.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 1.4710792303085327, 'eval_accuracy': 0.44166666666666665, 'eval_f1': 0.4406458548331614, 'eval_precision': 0.455843253968254, 'eval_recall': 0.44166666666666665, 'eval_runtime': 523.1339, 'eval_samples_per_second': 0.459, 'eval_steps_per_second': 0.029, 'epoch': 3.0}
{'train_runtime': 40697.8772, 'train_samples_per_second': 0.071, 'train_steps_per_second': 0.009, 'train_loss': 1.5699705335828993, 'epoch': 3.0}


  0%|          | 0/15 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 1.4710792303085327, 'eval_accuracy': 0.44166666666666665, 'eval_f1': 0.4406458548331614, 'eval_precision': 0.455843253968254, 'eval_recall': 0.44166666666666665, 'eval_runtime': 514.1665, 'eval_samples_per_second': 0.467, 'eval_steps_per_second': 0.029, 'epoch': 3.0}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [None]:
# Load model and tokenizer from saved directory
model_save_path = './saved_model'
model = BertForSequenceClassification.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)

In [8]:
import pandas as pd


# Get predictions and true labels from the test set
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, pred_labels)

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()


# Extract evaluation metrics
metrics = ['accuracy', 'f1', 'precision', 'recall']
scores = [results[f'eval_{metric}'] for metric in metrics]

# Create a DataFrame to display metrics
metrics_df = pd.DataFrame({
    "Metric": metrics,
    "Score": scores
})

# Display the table
print(metrics_df)


NameError: name 'df_sampled' is not defined

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm  # for progress bar
import string
from nltk.corpus import stopwords

# Load NLTK stopwords (you may need to download the stopwords dataset first)
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Assume `model` and `tokenizer` are already loaded with BERT model and tokenizer
def get_aggregate_attention_weights(dataset, top_n=30):
    token_importance = {}
    
    for text in tqdm(dataset):
        inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
        input_ids = inputs['input_ids']
        outputs = model(input_ids, output_attentions=True)
        attentions = outputs.attentions
        last_layer_attention = attentions[-1].mean(dim=1).squeeze().detach().cpu().numpy()

        tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
        word_importance = last_layer_attention.sum(axis=0)

        current_word = ""
        current_importance = 0.0
        
        for token, importance in zip(tokens, word_importance):
            if token.startswith("##"):
                current_word += token[2:]
                current_importance += importance
            else:
                if current_word:
                    if current_word not in stop_words and current_word not in punctuation:
                        if current_word in token_importance:
                            token_importance[current_word] += current_importance
                        else:
                            token_importance[current_word] = current_importance
                
                current_word = token
                current_importance = importance

    most_informative_tokens = sorted(token_importance.items(), key=lambda x: x[1], reverse=True)
    print(f"Top {top_n} most informative words across the dataset (via attention):")
    for token, importance in most_informative_tokens[:top_n]:
        print(f"{token}: {importance:.4f}")

# Get the top 30 most informative words across the dataset
sample_reviews = df_sampled['review_text'].tolist()
get_aggregate_attention_weights(sample_reviews, top_n=30)


100%|██████████| 1200/1200 [01:36<00:00, 12.41it/s]

Top 30 most informative words across the dataset (via attention):
book: 1499.0237
[CLS]: 943.2516
read: 654.4401
story: 544.9651
one: 333.4082
characters: 306.5946
books: 304.4423
love: 265.8392
series: 263.4558
poetry: 242.0753
like: 233.4048
poems: 207.1577
reading: 203.2136
graphic: 186.1834
character: 179.2026
first: 167.2079
novel: 160.6282
writing: 149.8067
author: 144.9358
good: 141.9385
really: 140.4432
plot: 139.3752
would: 136.9780
romance: 136.0568
l: 135.9449
poem: 133.2470
de: 127.5320
little: 121.5794
life: 120.1989
fy: 118.5675





In [33]:
# Example of new reviews
new_reviews = ["This book was full of fantasy and magic.", 
               "The story was thrilling and action-packed."]

# Tokenize the new reviews
new_inputs = tokenizer(new_reviews, padding='max_length', truncation=True, return_tensors='pt')

# Predict the genre
outputs = model(**new_inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# Convert predictions back to genre labels
predicted_genres = label_encoder.inverse_transform(predictions)

# Display the predicted genres
for review, genre in zip(new_reviews, predicted_genres):
    print(f"Review: {review} -> Predicted Genre: {genre}")

Review: This book was full of fantasy and magic. -> Predicted Genre: fantasy_paranormal
Review: The story was thrilling and action-packed. -> Predicted Genre: fantasy_paranormal
