In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np


# Load the Goodreads export and drop the "Unnamed: 0" column
# (presumably a saved row index -- confirm against the CSV).
data = pd.read_csv("goodreads_data.csv")
data = data.drop(columns="Unnamed: 0")
# Remove rows containing any missing value, then remove exact duplicate rows.
data = data.dropna()
data = data.drop_duplicates()



# One checkpoint name drives both the tokenizer and the classifier so they
# always share the same vocabulary.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two output labels: recommendation is treated as binary classification.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)



# Upper bound on tokens per title; anything longer is truncated.
max_length = 128
# Book names are the only text fed to the model.
input_texts = data['Book']

# Encode all titles in a single call. Sequences are padded to the longest
# encoded title in the batch and returned as PyTorch tensors.
input_encodings = tokenizer.batch_encode_plus(
    input_texts.to_list(),
    max_length=max_length,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)



# Prepare input tensors
input_ids = input_encodings['input_ids']
attention_mask = input_encodings['attention_mask']
# Build the labels from the underlying NumPy array, not from the Series itself:
# `data` keeps its original index (with gaps) after dropna()/drop_duplicates(),
# and positional access on such a Series is resolved by label, which can raise
# KeyError inside torch.tensor(). The explicit long dtype makes the values
# class indices, which is what the classification loss expects.
# Assumes a 'Label' column holding 0/1 relevance flags -- confirm it exists.
labels = torch.tensor(data['Label'].to_numpy(), dtype=torch.long)

# Create a Torch dataset; all three tensors are aligned on their first dimension.
dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, labels)

# Split the dataset into training (80%) and validation (remaining rows) sets
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

# Create data loaders; only the training loader is shuffled
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)



# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Not used below: the loss is read from the model output when labels are passed.
loss_fn = torch.nn.CrossEntropyLoss()

# Number of full passes over the training set.
num_epochs = 5

for epoch in range(num_epochs):
    # Training pass: accumulate the per-batch loss and update the weights.
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass: same forward computation, no gradients and no updates.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    # Report the mean loss per batch for both passes.
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss / len(train_loader):.4f} - Val Loss: {val_loss / len(val_loader):.4f}")



    
def _embed_titles(titles, batch_size=64):
    """Return one L2-normalised embedding row per title.

    Titles are tokenised with the same settings used for training, passed
    through ``model`` with hidden states enabled, and the last hidden layer is
    mean-pooled over the non-padding tokens.
    """
    vectors = []
    with torch.no_grad():
        for start in range(0, len(titles), batch_size):
            encoding = tokenizer.batch_encode_plus(
                titles[start:start + batch_size],
                truncation=True,
                padding='longest',
                max_length=max_length,
                return_tensors='pt'
            )
            ids = encoding['input_ids'].to(device)
            mask = encoding['attention_mask'].to(device)
            outputs = model(ids, attention_mask=mask, output_hidden_states=True)
            hidden = outputs.hidden_states[-1]
            # Zero out padding positions so they do not dilute the average.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
            # Unit-length rows turn a dot product into cosine similarity.
            vectors.append(torch.nn.functional.normalize(pooled, dim=1))
    return torch.cat(vectors)


def get_recommendations(book_name, top_k=5):
    """Return up to ``top_k`` titles from ``data['Book']`` closest to ``book_name``.

    The classifier head only produces two logits (the model is loaded with
    ``num_labels=2``), so its output cannot rank books: asking ``topk`` for
    more than two entries raises, and the class indices are not row positions
    in ``data``. Titles are therefore ranked by cosine similarity between
    mean-pooled hidden-state embeddings of the query and of every distinct
    title in the dataset.

    Args:
        book_name: Title to find recommendations for.
        top_k: Maximum number of titles to return.

    Returns:
        A list of titles, most similar first. The query title itself is
        excluded (compared case-insensitively). The list is shorter than
        ``top_k`` when fewer candidates exist, and empty when ``top_k`` is not
        positive.

    Note:
        Every call re-embeds the whole title list; cache the candidate
        embeddings if this is called repeatedly.
    """
    model.eval()

    query_key = str(book_name).strip().lower()
    # dict.fromkeys removes repeated titles while keeping dataset order.
    candidates = [
        title
        for title in dict.fromkeys(data["Book"].tolist())
        if str(title).strip().lower() != query_key
    ]

    # topk raises when k exceeds the number of scored items.
    k = min(top_k, len(candidates))
    if k <= 0:
        return []

    query_vector = _embed_titles([book_name])[0]
    candidate_vectors = _embed_titles(candidates)
    similarity = candidate_vectors @ query_vector

    _, best_positions = torch.topk(similarity, k)
    return [candidates[position] for position in best_positions.tolist()]



# Example query: replace "Book Name" with the title to get recommendations for.
book_name = "Book Name"
recommended_books = get_recommendations(book_name=book_name)
print(f"Recommended books for {book_name}: {recommended_books}")


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]