In [12]:
#authored by Tim (taken from 3/17/24)

In [1]:
!pip install transformers



In [13]:
import torch
import transformers as ppb
import pandas as pd
import numpy as np


# Install and init BERT model.

In [3]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Process Inputs

In [4]:
import pandas as pd
from ast import literal_eval

books_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv', index_col=[0], converters={"genres": literal_eval})
books_df = books_df[:1000]

In [5]:
books_df['short_description'] = books_df.description.str.slice(0,512)

In [6]:
# TODO 1: @Swathi read the dataset from a Bucket or Google Drive
df = books_df.short_description

In [7]:
tokenized = df.apply((lambda x: tokenizer.encode(str(x), add_special_tokens=True)))

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

# Embed Text

In [9]:
device = torch.device('cuda')

torch.cuda.set_per_process_memory_fraction(0.70, torch.cuda.device(0))

input_ids = torch.tensor(padded).to(device)
attention_mask = torch.tensor(attention_mask).to(device)
model = model.to(device)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

OutOfMemoryError: CUDA out of memory. Tried to allocate 504.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 973.62 MiB is free. Including non-PyTorch memory, this process has 1014.00 MiB memory in use. Of the allocated memory 927.64 MiB is allocated by PyTorch, and 50.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
embeddings = last_hidden_states[0].cpu().data[:,0,:].numpy()

In [None]:
df.shape, embeddings.shape

((1000,), (1000, 768))

# Save Embeddings

In [None]:
embeddings_pd_df = pd.DataFrame({'book_id': range(df.shape[0])})
embeddings_pd_df['title'] = books_df['title']
embeddings_pd_df['embedding'] = embeddings.tolist()

In [None]:
# TODO 2: @Swathi save embeddings_pd_df to another bucket in JSON format
embeddings_pd_df['embedding'] = embeddings_pd_df['embedding'].apply(lambda x: np.array(x))

In [None]:
embeddings_pd_df.head()

Unnamed: 0,book_id,title,embedding
0,0,"The Hunger Games (The Hunger Games, #1)","[0.11946984380483627, 0.057541050016880035, 0...."
1,1,Harry Potter and the Sorcerer's Stone (Harry P...,"[-0.3747820258140564, -0.2721218168735504, 0.1..."
2,2,"Twilight (Twilight, #1)","[0.12502743303775787, -0.7245563268661499, -0...."
3,3,To Kill a Mockingbird,"[-0.4378758668899536, -0.29153937101364136, -0..."
4,4,The Great Gatsby,"[-0.4059891104698181, -0.47040677070617676, -0..."


# Train KNN

In [None]:
from sklearn.neighbors import NearestNeighbors

X = np.array(embeddings_pd_df['embedding'].values.tolist())
k = 5
nn_model = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(X)

# Query KNN

In [None]:
query = np.array([X[i] + np.random.normal(loc=0.0, scale=1.0, size=X[0].shape) for i in range(2)])

In [None]:
distances, indices = nn_model.kneighbors(query)

In [None]:
i = 0
for book_rec in indices:
    i += 1
    j = 0
    print(f"Top {k} recommendations for query {i}:")
    for index in book_rec:
        j += 1
        print(f"\t{j}. {embeddings_pd_df.iloc[index]['title']}")

Top 5 recommendations for query 1:
	1. The Hunger Games (The Hunger Games, #1)
	2. Graceling (Graceling Realm, #1)
	3. The Client
	4. The Pelican Brief
	5. A Court of Thorns and Roses (A Court of Thorns and Roses, #1)
Top 5 recommendations for query 2:
	1. Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
	2. A Discovery of Witches (All Souls Trilogy, #1)
	3. The Iron King (The Iron Fey, #1)
	4. City of Heavenly Fire (The Mortal Instruments, #6)
	5. Fruits Basket, Vol. 1
