# Fetch Results From MongoDB

In [9]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import pandas as pd

load_dotenv('/.env.local')


MONGO_URI = os.getenv("MONGODB_URI")
client = MongoClient(MONGO_URI)

db = client['lib']  
books_collection = db['books']
book_data_cursor = books_collection.find()

book_data_list = [{key: str(value) for key, value in book.items()} for book in book_data_cursor]
books_df = pd.DataFrame(book_data_list)

print(books_df.head(10))

                        _id                    title            author  \
0  670c5f6c5d5129e0921e7209            The Awakening       Kate Chopin   
1  670c5f6c5d5129e0921e7206      It Ends At Midnight      Harriet Tyce   
2  670c5f6c5d5129e0921e7234       The Midnight Feast        Lucy Foley   
3  670c5f6c5d5129e0921e720b              Above Water     Trish Kearney   
4  670c5f6c5d5129e0921e7210             Fall at Once       Nora Everly   
5  670c5f6c5d5129e0921e7208  The John Lennon Letters       John Lennon   
6  670c5f6c5d5129e0921e723e            Lady Midnight   Cassandra Clare   
7  670c5f6c5d5129e0921e7242      Middle of the Night       Riley Sager   
8  670c5f6c5d5129e0921e7228        Hello, Old Friend  Elizabeth Bedlam   
9  670c5f6c5d5129e0921e723a              The Pairing   Casey McQuiston   

      length                                              genre  \
0  208 pages  ['fiction', 'classics', 'reflective', 'slow-pa...   
1  320 pages  ['fiction', 'thriller', 'dark', 'myst

# Generate Prompt Completions For OpenAI Fine Tuning

Training Data With Descriptions

In [None]:
import json

db = client['lib']
books_collection = db['books'] 

def generate_prompt_completion(book):

    title = book.get('title', 'Unknown Title')
    author = book.get('author', 'Unknown Author')
    genres = ", ".join(book.get('genre', []))  
    pace_data = book.get('reviewData', {}).get('pace', {})
    themes_data = book.get('reviewData', {}).get('theme', {})
    
    genre_str = genres if genres else "any genre"
    pace_str = "fast-paced" if pace_data.get('fast', 0) > 10 else "medium-paced" if pace_data.get('medium', 0) > 10 else "slow-paced"
    theme_str = ", ".join([theme for theme, count in themes_data.items() if count > 10]) if themes_data else "any theme"
    
    prompt = f"I enjoy {theme_str} books that are {pace_str} and fall into {genre_str}."
    
    description = book.get('description', 'No description available.')
    completion = f"{title} by {author}. {description}"
    
    return {
        "prompt": prompt,
        "completion": completion
    }

book_data_cursor = books_collection.find()

prompt_completion_list = []

for book in book_data_cursor:
    prompt_completion = generate_prompt_completion(book)
    prompt_completion_list.append(prompt_completion)

with open("prompt_completion_pairs.jsonl", "w") as jsonl_file:
    for pair in prompt_completion_list:
        jsonl_file.write(json.dumps(pair) + "\n")

print(f"Generated {len(prompt_completion_list)} prompt-completion pairs.")

Training Data And Validation Data Generator

In [None]:
import json
import random
import os

db = client['lib']
books_collection = db['books'] 

def generate_messages(book):
    title = book.get('title', 'Unknown Title')
    author = book.get('author', 'Unknown Author')
    genres = ", ".join(book.get('genre', []))
    pace_data = book.get('reviewData', {}).get('pace', {})
    themes_data = book.get('reviewData', {}).get('theme', {})
    
    genre_str = genres if genres else "any genre"
    pace_str = "fast-paced" if pace_data.get('fast', 0) > 10 else "medium-paced" if pace_data.get('medium', 0) > 10 else "slow-paced"
    theme_str = ", ".join([theme for theme, count in themes_data.items() if count > 10]) if themes_data else "any theme"
    
    user_content = f"I enjoy {theme_str} books that are {pace_str} and fall into {genre_str}."
    
    assistant_content = f"{title} by {author}."
    
    return {
        "messages": [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ]
    }

book_data_cursor = books_collection.find()

message_list = []

for book in book_data_cursor:
    messages = generate_messages(book)
    message_list.append(messages)

random.shuffle(message_list)

split_index = int(0.8 * len(message_list))
training_data = message_list[:split_index]
validation_data = message_list[split_index:]

output_folder = "src/data/trained_data"  

os.makedirs(output_folder, exist_ok=True)

training_file_path = os.path.join(output_folder, "training_data_no_desc.jsonl")
with open(training_file_path, "w") as training_file:
    for pair in training_data:
        training_file.write(json.dumps(pair) + "\n")

validation_file_path = os.path.join(output_folder, "validation_data_no_desc.jsonl")
with open(validation_file_path, "w") as validation_file:
    for pair in validation_data:
        validation_file.write(json.dumps(pair) + "\n")

print(f"Training data saved to {training_file_path}")
print(f"Validation data saved to {validation_file_path}")

# Importing Model From HuggingFace

## SBERT Model 
- load data from mongo into a pd data frame
- parse book data 
- load sbert model and create text embeddings
- write user query
- calc cos sim to find similar books

In [10]:
from pymongo import MongoClient
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

MONGO_URI = os.getenv("MONGODB_URI")
client = MongoClient(MONGO_URI)

db = client['lib']  
books_collection = db['books']
book_data_cursor = books_collection.find()

# Convert MongoDB cursor to DataFrame for easy manipulation
book_data_list = [{key: str(value) for key, value in book.items()} for book in book_data_cursor]
books_df = pd.DataFrame(book_data_list)

# Display a sample of the data
print(books_df.head(10))


  from tqdm.autonotebook import tqdm, trange


                        _id                    title            author  \
0  670c5f6c5d5129e0921e7209            The Awakening       Kate Chopin   
1  670c5f6c5d5129e0921e7206      It Ends At Midnight      Harriet Tyce   
2  670c5f6c5d5129e0921e7234       The Midnight Feast        Lucy Foley   
3  670c5f6c5d5129e0921e720b              Above Water     Trish Kearney   
4  670c5f6c5d5129e0921e7210             Fall at Once       Nora Everly   
5  670c5f6c5d5129e0921e7208  The John Lennon Letters       John Lennon   
6  670c5f6c5d5129e0921e723e            Lady Midnight   Cassandra Clare   
7  670c5f6c5d5129e0921e7242      Middle of the Night       Riley Sager   
8  670c5f6c5d5129e0921e7228        Hello, Old Friend  Elizabeth Bedlam   
9  670c5f6c5d5129e0921e723a              The Pairing   Casey McQuiston   

      length                                              genre  \
0  208 pages  ['fiction', 'classics', 'reflective', 'slow-pa...   
1  320 pages  ['fiction', 'thriller', 'dark', 'myst

In [30]:
import json

# Function to parse and format book data
def create_book_text(book):
    genres = ", ".join(book.get('genre', []))

    # Parse reviewData to ensure it's a dictionary
    try:
        review_data = json.loads(book.get('reviewData', '{}').replace("'", "\""))
        themes_data = review_data.get('theme', {})
        pace_data = review_data.get('pace', {})
    except (json.JSONDecodeError, AttributeError):
        themes_data = {}
        pace_data = {}

    theme_str = ", ".join([theme for theme, count in themes_data.items() if int(count) > 10]) if themes_data else "any theme"
    pace_str = "fast-paced" if int(pace_data.get('fast', 0)) > 10 else "medium-paced" if int(pace_data.get('medium', 0)) > 10 else "slow-paced"
    
    return f"{book['title']} by {book['author']}. Genre: {genres}. Themes: {theme_str}. Pace: {pace_str}. Description: {book['description']}."

# Apply formatting to each book in the DataFrame
books_df['text'] = books_df.apply(create_book_text, axis=1)


In [54]:
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the SBERT model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize an empty list to store embeddings
book_embeddings = []

# Loop through each book text with a progress bar
for text in tqdm(books_df['text'].tolist(), desc="Generating Embeddings"):
    embedding = model.encode(text)
    book_embeddings.append(embedding)

# Convert list to an array
book_embeddings = np.array(book_embeddings)


Generating Embeddings: 100%|██████████| 109023/109023 [16:22<00:00, 110.98it/s]


In [84]:
# Example user input
likes = "harry potter"
dislikes = "Sorcerer's Stone"

# Generate embeddings for likes and dislikes
like_embedding = model.encode([likes])
dislike_embedding = model.encode([dislikes])

from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity of each book to the "likes" embedding
positive_similarities = cosine_similarity(like_embedding, book_embeddings).flatten()

# Calculate similarity of each book to the "dislikes" embedding
negative_similarities = cosine_similarity(dislike_embedding, book_embeddings).flatten()

# Adjust scores: subtract similarity to disliked books from similarity to liked books
final_scores = positive_similarities - (0.5 * negative_similarities)  # Adjust the weight (0.5) based on preference




In [85]:
# Get indices of the top 5 recommendations
top_indices = final_scores.argsort()[::-1][:5]

# Display the recommendations with scores
print("Top Book Recommendations:")
for idx in top_indices:
    title = books_df.iloc[idx]['title']
    author = books_df.iloc[idx]['author']
    score = final_scores[idx]
    print(f"Title: {title}, Author: {author}, Score: {score:.4f}")


Top Book Recommendations:
Title: Harry Potter and the Chamber of Secrets (Enhanced Edition), Author: J.K. Rowling, Score: 0.3959
Title: Harry Potter y la Piedra Filosofal, Author: J.K. Rowling, Score: 0.3925
Title: Harry Potter and the Chamber of Secrets, Author: J.K. Rowling, Score: 0.3917
Title: Harry Potter and the Chamber of Secrets, Author: J.K. Rowling, Score: 0.3917
Title: Harry Potter and the Order of the Phoenix, Author: J.K. Rowling, Score: 0.3846
