In [23]:
#Importing necessary libraries
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd
import tensorflow as tf
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, LeakyReLU
from sklearn.model_selection import train_test_split 
# Read the reviews CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='filtered_data_.csv')

#Pre-Processing the text 
def cleaning(df, stop_words):
    """Normalise the ``review_text`` column of ``df`` ahead of tokenisation.

    Steps, in order: lower-case every whitespace-separated token, strip all
    digit characters, drop tokens found in ``stop_words``, then lemmatise each
    remaining token with ``Word(...).lemmatize()``.

    Args:
        df: DataFrame with a ``review_text`` column of strings. The column is
            overwritten in place (the caller's frame is modified).
        stop_words: Container of tokens to drop; only ``in`` is used on it.

    Returns:
        The same ``df`` object that was passed in.
    """
    reviews = df['review_text']
    # Put every character in lower case (split/join also collapses whitespace)
    reviews = reviews.apply(lambda review: ' '.join(token.lower() for token in review.split()))
    # Remove digits. regex=True is explicit because the pandas default changed
    # to a literal match, under which the pattern r'\d' never matches a digit.
    reviews = reviews.str.replace(r'\d', '', regex=True)
    # Remove stop words
    reviews = reviews.apply(
        lambda review: ' '.join(token for token in review.split() if token not in stop_words)
    )
    # Lemmatization
    reviews = reviews.apply(
        lambda review: ' '.join(Word(token).lemmatize() for token in review.split())
    )
    df['review_text'] = reviews
    return df

# Drop rows with no review text: cleaning() calls string methods on every value
data = data.dropna(subset=['review_text'])
# stopwords.words() returns a list; a set makes each per-token membership
# test in cleaning() constant-time instead of a linear scan
stop_words = set(stopwords.words('english'))
data_cleaned = cleaning(data, stop_words)

# Fit a word-index vocabulary on the cleaned reviews and encode each review
# as a sequence of integer word ids (num_words=500 caps the ids that are kept)
review_texts = data_cleaned['review_text'].values
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(review_texts)
# pad_sequences stacks the variable-length sequences into one 2-D array
X = pad_sequences(tokenizer.texts_to_sequences(review_texts))

# 'type' is the target column (genre/category); encode it as integer class ids
le = LabelEncoder()
y = le.fit_transform(data_cleaned['type'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Building
model = Sequential()
# input_dim 500 matches Tokenizer(num_words=500); each word id maps to a 120-d vector
model.add(Embedding(500, 120, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352))
model.add(LeakyReLU())
model.add(Dense(len(le.classes_), activation='softmax'))  # Output layer with number of classes
# Sparse loss because y holds integer class ids from LabelEncoder, not one-hot rows
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" in the cell output)
model.summary()

# Model Training
# NOTE(review): pad_sequences above has no maxlen, so the LSTM unrolls over the
# longest review in the data; consider capping the length if epochs are too slow.
model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)

# Model Testing
model.evaluate(X_test, y_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmlim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jmlim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


None
Epoch 1/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m875s[0m 873ms/step - accuracy: 0.2869 - loss: 1.8627
Epoch 2/20
[1m 439/1000[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m8:03[0m 861ms/step - accuracy: 0.4333 - loss: 1.5354

KeyboardInterrupt: 

In [3]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset

# Load the dataset and drop rows that have no review text
data = pd.read_csv('filtered_data_.csv')
data = data.dropna(subset=['review_text'])

# Encode the target labels (categories/genres) as integer ids
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['type'])

# Show the mapping from original category name to encoded integer label
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

# Fraction of rows kept for this run. 0.001 keeps 0.1% of the data (not 10%);
# raise it, e.g. to 0.1, for a run that has enough examples per class.
SAMPLE_FRACTION = 0.001
data_sampled = data.sample(frac=SAMPLE_FRACTION, random_state=42)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the reviews
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The call asks for padding to the maximum length, truncation of
    over-long input, and PyTorch tensors in the result.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **encode_options)

# Dataset wrapper that tokenizes each review on access
# and returns its label as a torch.long tensor
class ReviewsDataset(Dataset):
    """Map-style dataset pairing raw review strings with integer labels.

    Each review is tokenized on access through the module-level
    ``tokenize_function``; nothing is pre-computed in ``__init__``.
    """

    def __init__(self, reviews, labels):
        # Both containers are read positionally with ``.iloc`` in __getitem__
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        encoding = tokenize_function(self.reviews.iloc[idx])

        # squeeze(0) drops the leading size-1 dimension from each encoded tensor
        sample = {
            field: encoding[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        # Labels are created as torch.long
        sample['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return sample
    
# Split the sampled reviews and labels into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_sampled['review_text'], data_sampled['label'], test_size=0.2, random_state=42)

# Create the training and testing datasets using the custom Dataset class
train_dataset = ReviewsDataset(X_train, y_train)
test_dataset = ReviewsDataset(X_test, y_test)

# Load pre-trained BERT model with a classification head.
# The head is sized from the encoder's full class list rather than from the
# sample: a small sample can miss a class, which would leave encoded label ids
# outside the head's output range and out of step with label_encoder.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    evaluation_strategy='epoch',      # evaluate during training
    per_device_train_batch_size=8,    # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    num_train_epochs=3,               # number of training epochs
    weight_decay=0.01,                # strength of weight decay
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: A pair that unpacks to ``(logits, labels)``; ``logits`` is
            converted with ``torch.tensor`` and arg-maxed over axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = pred
    logits = torch.tensor(logits)  # Convert logits to tensor if they are not already
    predictions = torch.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores a class with no predicted (or no true) samples as 0
    # without raising UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Wire the model, training arguments, datasets and metric callback into a Trainer
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Fine-tune the model
trainer.train()

# Evaluate once more after training and report the metrics
results = trainer.evaluate()
print(f"Evaluation results: {results}")

{'children': 0, 'comics_graphics': 1, 'fantasy_paranormal': 2, 'history_biography': 3, 'mystery_thriller_crime': 4, 'poetry': 5, 'romance': 6, 'young_adult': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.011652708053589, 'eval_accuracy': 0.125, 'eval_f1': 0.027777777777777776, 'eval_precision': 0.015625, 'eval_recall': 0.125, 'eval_runtime': 12.8552, 'eval_samples_per_second': 0.622, 'eval_steps_per_second': 0.078, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.0787405967712402, 'eval_accuracy': 0.125, 'eval_f1': 0.027777777777777776, 'eval_precision': 0.015625, 'eval_recall': 0.125, 'eval_runtime': 13.5706, 'eval_samples_per_second': 0.59, 'eval_steps_per_second': 0.074, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.082810640335083, 'eval_accuracy': 0.125, 'eval_f1': 0.027777777777777776, 'eval_precision': 0.015625, 'eval_recall': 0.125, 'eval_runtime': 14.3358, 'eval_samples_per_second': 0.558, 'eval_steps_per_second': 0.07, 'epoch': 3.0}
{'train_runtime': 554.6337, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 1.9265201886494954, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 2.082810640335083, 'eval_accuracy': 0.125, 'eval_f1': 0.027777777777777776, 'eval_precision': 0.015625, 'eval_recall': 0.125, 'eval_runtime': 14.4892, 'eval_samples_per_second': 0.552, 'eval_steps_per_second': 0.069, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Example of new reviews
new_reviews = ["This book was full of fantasy and magic.",
               "The story was thrilling and action-packed."]

# Tokenize the new reviews
new_inputs = tokenizer(new_reviews, padding='max_length', truncation=True, return_tensors='pt')

# Run the forward pass on whatever device the model's weights live on;
# feeding CPU tensors to a model on another device raises a device-mismatch error
device = next(model.parameters()).device
model_inputs = {name: tensor.to(device) for name, tensor in new_inputs.items()}

# Predict the genre. eval() disables dropout so predictions are deterministic;
# no_grad() skips autograd bookkeeping that inference does not need.
model.eval()
with torch.no_grad():
    outputs = model(**model_inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# Convert predictions back to genre labels (hand scikit-learn a CPU NumPy array)
predicted_genres = label_encoder.inverse_transform(predictions.cpu().numpy())

# Display the predicted genres
for review, genre in zip(new_reviews, predicted_genres):
    print(f"Review: {review} -> Predicted Genre: {genre}")

Review: This book was full of fantasy and magic. -> Predicted Genre: fantasy_paranormal
Review: The story was thrilling and action-packed. -> Predicted Genre: fantasy_paranormal
