In [None]:
# import pandas as pd
# import numpy as np

# book_df = pd.read_csv("data/books.csv")
# book_df['book_id'] = book_df['book_id'].astype('UInt32')
# book_df['review_count'] = book_df['review_count'].astype('UInt32')
# book_df['num_pages'] = book_df['num_pages'].astype('UInt16')
# book_df['year'] = book_df['year'].astype('Int16')
# star_cols = [col for col in book_df.columns if col.endswith('star')]
# for col in star_cols:
#     book_df[col] = book_df[col].astype('UInt32')

# test = book_df[~book_df['similar_books'].isna()]
# test = test[~test['genres'].isna()]
# test.to_csv('data/books.csv', index=False)

In [None]:
# Load and prep data
import pandas as pd
import numpy as np

# Scraped catalogue: cast the count-like columns to pandas nullable integer dtypes.
books_df = pd.read_csv("data/books.csv")
numeric_map = {
    'book_id': 'UInt32',
    'review_count': 'UInt32',
    'num_pages': 'UInt16',
    'year': 'Int16',
}
books_df = books_df.astype(numeric_map)

# Every column whose name ends in 'star' is cast to nullable UInt32 as well.
star_cols = [c for c in books_df.columns if c.endswith('star')]
books_df = books_df.astype({star_col: 'UInt32' for star_col in star_cols})
# books_df['rating_count'] = books_df[star_cols].sum(axis=1)


# Todo: Fix the crawler to split langs with "|" by default
# Until then, turn ";"-separated language strings into "|"-separated ones;
# non-string entries (e.g. missing values) pass through untouched.
books_df['lang'] = books_df['lang'].map(
    lambda x: "|".join(part.strip() for part in x.split(";")) if isinstance(x, str) else x
)
# Flatten descriptions to one line: collapse double newlines, then turn the
# remaining newlines into spaces.
books_df['description'] = (
    books_df['description']
    .str.replace('\n\n', '\n')
    .str.replace('\n', ' ')
)

# Goodreads export
goodreads_df = pd.read_csv('data/goodreads_library_export.csv').astype(
    {'book_id': 'UInt32', 'my_rating': 'UInt8'}
)

# Attach the personal rating to each catalogue row; a rating of 0 is treated as missing.
books_df = books_df.merge(goodreads_df[['book_id', 'my_rating']], how='left', on='book_id')
books_df['my_rating'] = books_df['my_rating'].replace(0, np.nan)

# books_df = books_df[~books_df['genres'].isna()].reset_index(drop=True)
# Keep only books that list similar books, and renumber the rows from 0.
books_df = books_df[books_df['similar_books'].notna()].reset_index(drop=True)

In [None]:
# Prep embedding strings
def format_string_for_embedding(items, kind=None, truncate=0):
    """Render a list of strings as an English enumeration for the embedding text.

    Args:
        items: List of strings. Anything that is not a non-empty list yields "".
        kind: Optional label. When given, the result is prefixed with the
            capitalised label, pluralised with "s" when there is more than one
            item, followed by ": ".
        truncate: When ``len(items) > truncate > 1`` the first ``truncate`` items
            are listed, followed by ", and " plus the item at index ``truncate``
            (so ``truncate + 1`` items appear). Any other value disables truncation.

    Returns:
        The formatted string, e.g. "Genres: a, b, and c", or "" for empty input.
    """
    if not isinstance(items, list) or not items:
        return ""

    count = len(items)
    if count == 1:
        body = items[0]
    else:
        if count > truncate > 1:
            # Truncated form always uses the serial comma.
            head, last, joiner = items[:truncate], items[truncate], ", and "
        else:
            # Two items read "a and b"; three or more get the serial comma.
            head, last = items[:-1], items[-1]
            joiner = ", and " if count > 2 else " and "
        body = f"{', '.join(head)}{joiner}{last}"

    if not kind:
        return f"{body}"
    plural = "s" if count > 1 else ""
    return f"{kind.capitalize()}{plural}: {body}"

# Authors: split the '|'-separated field and render it as prose (truncate=4).
books_df['authors_post'] = (
    books_df['authors']
    .str.split('|')
    .apply(lambda names: format_string_for_embedding(names, truncate=4))
)

# Genres: same treatment, prefixed with "Genre:" / "Genres:".
books_df['genres_post'] = (
    books_df['genres']
    .str.split('|')
    .apply(lambda tags: format_string_for_embedding(tags, kind='genre'))
)

# books_df['lang_post'] = books_df['lang'].str.split('|')
# books_df['lang_post'] = books_df['lang_post'].apply(lambda x:format_string_for_embedding(x, kind='language'))

# Description: wrap string descriptions in a one-element list so the formatter
# adds the "Description: " prefix; non-string values become "".
books_df['desc_post'] = books_df['description'].map(
    lambda desc: format_string_for_embedding(
        [desc] if isinstance(desc, str) else [], kind='description'
    )
)

def join_embedding_parts(title, authors, genres, desc):
    """Assemble the text that is embedded for one book.

    The title line is always present; the author, genre and description parts
    are appended only when they are truthy. Author and genre lines end with a
    newline, the description does not.

    Args:
        title: Book title, rendered as "Book: <title>".
        authors: Pre-formatted author string, rendered as "Written by: <authors>".
        genres: Pre-formatted genre string, used verbatim.
        desc: Pre-formatted description string, used verbatim.

    Returns:
        The concatenated text.
    """
    parts = [f"Book: {title}\n"]
    if authors:
        parts.append(f"Written by: {authors}\n")
    if genres:
        parts.append(f"{genres}\n")
    if desc:
        parts.append(f"{desc}")
    return "".join(parts)

# One embedding text per row of books_df, in row order.
embedding_strings = list(
    map(
        join_embedding_parts,
        books_df['title'],
        books_df['authors_post'],
        books_df['genres_post'],
        books_df['desc_post'],
    )
)

In [None]:
# # Genre embeddings
# import ollama
# from tqdm import tqdm
# import torch

# OLLAMA_MODEL = "qwen3-embedding:0.6b"
# MIN_DIMENSIONS = 32
# all_genres = books_df['genres'].str.split('|').explode().str.strip().unique()
# all_genres = [g for g in all_genres if isinstance(g, str) and len(g) > 0]

# my_rating_count = (~books_df['my_rating'].isna()).sum()
# sent_embedding_dimensions = MIN_DIMENSIONS
# while sent_embedding_dimensions*2 < my_rating_count:
#     sent_embedding_dimensions*=2

# BATCH_SIZE = 64
# genre_vectors = {}
# for i in tqdm(range(0, len(all_genres), BATCH_SIZE)):
#     batch = all_genres[i : i + BATCH_SIZE]
#     response = ollama.embed(model=OLLAMA_MODEL, input=batch)
#     for genre, vector in zip(batch, response['embeddings']):
#         genre_vectors[genre] = np.array(vector)[:sent_embedding_dimensions]

In [None]:
# Embed sentences
import ollama
from tqdm import tqdm
import torch

OLLAMA_MODEL = "qwen3-embedding:0.6b"
MIN_DIMENSIONS = 32

# Number of books that carry a personal rating.
my_rating_count = books_df['my_rating'].notna().sum()

# Start at MIN_DIMENSIONS and keep doubling while twice the current size is
# still below the number of rated books.
sent_embedding_dimensions = MIN_DIMENSIONS
while my_rating_count > 2 * sent_embedding_dimensions:
    sent_embedding_dimensions = sent_embedding_dimensions * 2

def get_batch_embeddings(text_list, model, batch_size=64):
    """Embed a list of texts with Ollama, one request per batch.

    Args:
        text_list: Sequence of strings to embed.
        model: Ollama model name passed to ``ollama.embed``.
        batch_size: Number of texts sent per request.

    Returns:
        ``np.array`` built from the concatenated ``response['embeddings']``
        lists, in the order of ``text_list``.
    """
    collected = []
    batch_starts = range(0, len(text_list), batch_size)
    for start in tqdm(batch_starts):
        chunk = text_list[start:start + batch_size]
        collected += ollama.embed(model=model, input=chunk)['embeddings']
    return np.array(collected)

# Embed every book text and hold the result as a float32 tensor
# (one row per entry of embedding_strings).
embeddings = torch.tensor(
    get_batch_embeddings(embedding_strings, OLLAMA_MODEL),
    dtype=torch.float32,
)

In [None]:
# Build adjacency matrix
# Map each book_id to its row position in books_df.
id_to_idx = {book_id: position for position, book_id in enumerate(books_df['book_id'])}

# Collect both directions of every "similar book" link whose target is in the catalogue.
edge_indices = []
for book_id, similar in tqdm(
    zip(books_df['book_id'], books_df['similar_books']), total=len(books_df)
):
    if pd.isna(similar):
        continue
    current_idx = id_to_idx[book_id]
    # Entries are '|'-separated; the id is the part before the first ':'.
    for item in similar.split('|'):
        try:
            target_id = int(item.split(':')[0])
        except ValueError:
            # Skip entries whose id part is not an integer.
            continue
        # Dict lookup instead of scanning the whole book_id column per neighbour.
        target_idx = id_to_idx.get(target_id)
        if target_idx is None:
            continue
        edge_indices.append([current_idx, target_idx])
        edge_indices.append([target_idx, current_idx])

# Shape (2, num_edges); an empty graph still gets a well-formed (2, 0) tensor.
if not edge_indices:
    edge_index = torch.tensor([[], []], dtype=torch.long)
else:
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()


In [None]:
# Multiplying embeddings with adjacency matrix
from torch_geometric.utils import add_self_loops
from torch_geometric.nn.conv.gcn_conv import gcn_norm

# Add self loops, normalise the edge weights with gcn_norm, and pack the
# result into a sparse (num_books x num_books) matrix.
edge_index_with_loops, _ = add_self_loops(edge_index, num_nodes=embeddings.shape[0])
edge_index_norm, edge_weight_norm = gcn_norm(
    edge_index_with_loops,
    num_nodes=embeddings.shape[0],
)
adj_matrix = torch.sparse_coo_tensor(
    edge_index_norm,
    edge_weight_norm,
    (embeddings.shape[0], embeddings.shape[0]),
)

# Each propagation step multiplies the current embeddings by the normalised adjacency.
num_propagations = 1
final_embeddings = embeddings
for _ in range(num_propagations):
    final_embeddings = torch.sparse.mm(adj_matrix, final_embeddings)
final_embeddings = final_embeddings.numpy()

In [None]:
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel

# Global score
# C is the catalogue-wide mean rating; m is the 10th percentile of rating_count.
# NOTE(review): 'rating_count' is not created in the visible cells (its
# computation in the load cell is commented out) - confirm the CSV provides it.
C = books_df['avg_rating'].mean()
m = books_df['rating_count'].quantile(0.10)


def weighted_rating(x, m=m, C=C):
    """Blend a book's average rating with the catalogue mean C.

    The book's own rating gets weight v / (v + m) and C gets m / (v + m),
    where v is the row's rating_count. A row with v == 0 returns C.
    """
    votes = float(x['rating_count'])
    rating = float(x['avg_rating'])
    if votes == 0:
        return C
    total = votes + m
    return (votes / total * rating) + (m / total * C)


books_df['global_score'] = books_df.apply(weighted_rating, axis=1)

# Dimensionality Reduction
# Target vector for supervised UMAP: the personal rating, with -1 for unrated books.
# Cast to float before filling: my_rating is loaded as an unsigned nullable
# integer column, which cannot represent -1, and UMAP gets a plain ndarray.
# NOTE(review): confirm that UMAP treats -1 as "unlabelled" with
# target_metric='l1' rather than as an ordinary target value.
y_umap = books_df['my_rating'].astype('float64').fillna(-1).to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(final_embeddings)
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=sent_embedding_dimensions//2,
    metric='cosine', 
    target_metric='l1',
    target_weight=0.5,
    random_state=42
)
X_reduced = reducer.fit_transform(X_scaled, y=y_umap)

# Gaussian Process Regression
# Train only on rows that carry an explicit (non-missing, positive) personal rating.
train_mask = books_df['my_rating'].notna() & (books_df['my_rating'] > 0)
if train_mask.sum() < 5:
    print("Warning: Not enough ratings for GPR. Need at least 5.")
    # No personal model could be fitted. Create the prediction columns as NaN so
    # the scoring and display code below does not fail with a KeyError.
    books_df['pred_rating'] = np.nan
    books_df['uncertainty'] = np.nan
else:
    # Hand NumPy a plain boolean mask and float labels; the pandas objects may be
    # backed by nullable extension arrays.
    X_train = X_reduced[train_mask.to_numpy(dtype=bool)]
    y_train = books_df.loc[train_mask, 'my_rating'].to_numpy(dtype=float)
    kernel = Matern(nu=1.5) + WhiteKernel(noise_level=0.1)
    gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True, n_restarts_optimizer=2)
    gpr.fit(X_train, y_train)
    # Predict for every book, rated or not.
    means, stds = gpr.predict(X_reduced, return_std=True)
    books_df['pred_rating'] = means
    books_df['uncertainty'] = stds


# Conservative personal score: predicted rating minus half the predictive std.
books_df['safe_personal_score'] = books_df['pred_rating'] - (books_df['uncertainty'] * 0.5)
# Equal-weight blend of personal and global score. Rows without a prediction
# (the too-few-ratings case above) fall back to the global score alone.
books_df['final_score'] = (
    (0.5 * books_df['safe_personal_score']) + (0.5 * books_df['global_score'])
).where(books_df['pred_rating'].notna(), books_df['global_score'])
books_df[~train_mask][['book_id', 'title', 'my_rating', 'final_score', 'pred_rating', 'uncertainty', 'genres']].sort_values(by='final_score', ascending=False)

In [None]:
# Books outside the training mask, best blended score first.
books_df.loc[
    ~train_mask,
    ['book_id', 'title', 'my_rating', 'final_score', 'pred_rating', 'uncertainty', 'genres'],
].sort_values('final_score', ascending=False)

In [None]:
# from node2vec import Node2Vec
# import networkx as nx

# G = nx.Graph()
# G.add_nodes_from(books_df['book_id'].tolist())

# for idx, row in books_df.iterrows():
#     if pd.isna(row['similar_books']): continue
#     for item in row['similar_books'].split('|'):
#         try:
#             target_id = item.split(':')[0]
#             if target_id in books_df['book_id'].values:
#                 G.add_edge(row['book_id'], target_id, weight=1.0)
#         except ValueError: continue
        
# node2vec = Node2Vec(G, dimensions=64, walk_length=80, num_walks=100, workers=8, quiet=True)
# node2vec = Node2Vec(
#     G, 
#     dimensions=max(MIN_DIMENSIONS, sent_embedding_dimensions//2),
#     walk_length=len(books_df)//50,
#     num_walks=100,
#     p=1.0,
#     q=0.5,
#     workers=8, 
#     quiet=True
# )
# model_n2v = node2vec.fit(window=10, min_count=1)

# graph_embeddings = np.array([
#     model_n2v.wv[bid] if bid in model_n2v.wv else np.zeros(64) 
#     for bid in books_df['book_id']
# ])

In [None]:
import networkx as nx

# Undirected similarity graph with one node per catalogue book.
G = nx.Graph()
G.add_nodes_from(books_df['book_id'].tolist())

# Build edges, casting the id part of each similar-book entry to int.
edge_list = []
for book_id, similar in zip(books_df['book_id'], books_df['similar_books']):
    if pd.isna(similar):
        continue

    # Entries are '|'-separated; the id is the part before the first ':'.
    for item in similar.split('|'):
        try:
            target_id = int(item.split(':')[0])
        except ValueError:
            # Skip malformed ids, as the adjacency-matrix cell does.
            continue

        # Every catalogue id is already a node of G, so graph membership replaces
        # a scan of the whole book_id column per neighbour.
        if target_id in G:
            edge_list.append((book_id, target_id))

G.add_edges_from(edge_list)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Node features (genre embeddings), one row per book.
# NOTE(review): `genre_embeddings` is not defined in any visible cell - confirm
# where it comes from before running this on a fresh kernel.
node_features = torch.tensor(np.stack(genre_embeddings.values), dtype=torch.float)

# Row position of every book_id.
node_mapping = {
    book_id: position for position, book_id in enumerate(books_df['book_id'].tolist())
}

# Both directions of every graph edge whose endpoints are known books.
edge_indices = [
    pair
    for u, v in G.edges()
    if u in node_mapping and v in node_mapping
    for pair in (
        [node_mapping[u], node_mapping[v]],
        [node_mapping[v], node_mapping[u]],  # Undirected
    )
]

# Shape (2, num_edges); an empty edge list still yields a (2, 0) tensor.
if edge_indices:
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
else:
    edge_index = torch.tensor([[], []], dtype=torch.long)

# Define GraphSAGE Model
class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers producing L2-normalised node embeddings."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two layers: in -> hidden and hidden -> out channels."""
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run conv1 -> ReLU -> conv2 and L2-normalise each output row.

        Args:
            x: Node feature tensor passed to the first layer.
            edge_index: Edge index tensor passed to both layers.

        Returns:
            The output of the second layer, normalised with p=2 along dim 1.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        out = self.conv2(hidden, edge_index)
        return F.normalize(out, p=2, dim=1)

# Generate Embeddings
# The model is used straight after construction; no training step appears in
# this cell, so the output depends on the layers' initial weights. Seed torch
# first so that a re-run reproduces the same embeddings.
torch.manual_seed(42)
model = GraphSAGE(
    in_channels=node_features.shape[1],
    hidden_channels=sent_embedding_dimensions,
    out_channels=sent_embedding_dimensions,
)
model.eval()

# Inference only: no gradients are needed.
with torch.no_grad():
    graph_embeddings_tensor = model(node_features, edge_index)

final_embeddings = graph_embeddings_tensor.numpy()
print("Embeddings shape:", final_embeddings.shape)

In [None]:
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel
import umap

# C is the catalogue-wide mean rating; m is the 10th percentile of review_count,
# used as the minimum-votes term.
# NOTE(review): the earlier scoring cell weights by 'rating_count' instead of
# 'review_count' - confirm which column is intended.
C = books_df['avg_rating'].mean()
m = books_df['review_count'].quantile(0.10)


def weighted_rating(x, m=m, C=C):
    """Blend a book's average rating with the catalogue mean C.

    The book's own rating gets weight v / (v + m) and C gets m / (v + m),
    where v is the row's review_count. A row with v == 0 returns C.
    """
    v, R = x['review_count'], x['avg_rating']
    if v == 0:
        return C
    denom = v + m
    return (v / denom * R) + (m / denom * C)


books_df['global_score'] = books_df.apply(weighted_rating, axis=1)

# PART C: The Personal Score (UMAP + GPR)

# 1. Semi-Supervised UMAP
# Target array: the personal rating, with -1 standing in for unrated books.
# Cast to float before filling: my_rating is loaded as an unsigned nullable
# integer column, which cannot represent -1, and UMAP gets a plain ndarray.
# NOTE(review): confirm that UMAP treats -1 as "unlabelled" with
# target_metric='l1' rather than as an ordinary target value.
y_umap = books_df['my_rating'].astype('float64').fillna(-1).to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(final_embeddings)

reducer = umap.UMAP(
    n_neighbors=15,
    n_components=sent_embedding_dimensions,
    metric='cosine', 
    target_metric='l1', # Use L1 for the ratings
    # target_weight=0.5,  # Balance structural shape vs rating shape
    random_state=42
)

print("Running UMAP...")
X_reduced = reducer.fit_transform(X_scaled, y=y_umap)

# 2. Gaussian Process Regression
# Training set: books with an explicit (non-missing, positive) personal rating.
# Prediction set: every book.
train_mask = books_df['my_rating'].notna() & (books_df['my_rating'] > 0)
# Hand NumPy a plain boolean mask and float labels; the pandas objects may be
# backed by nullable extension arrays.
X_train = X_reduced[train_mask.to_numpy(dtype=bool)]
y_train = books_df.loc[train_mask, 'my_rating'].to_numpy(dtype=float)

# Kernel: Matern term plus a WhiteKernel noise term.
kernel = Matern(nu=1.5) + WhiteKernel(noise_level=0.1)
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)

print("Fitting GPR...")
gpr.fit(X_train, y_train)

# Predict on ALL books
means, stds = gpr.predict(X_reduced, return_std=True)

books_df['pred_rating'] = means
books_df['uncertainty'] = stds

# PART D: Final Hybrid Scoring

# Conservative personal score: the prediction minus half its standard deviation,
# e.g. a 5.0 prediction with std=1.0 counts as 4.5.
books_df['safe_personal_score'] = books_df['pred_rating'] - (0.5 * books_df['uncertainty'])

# Hybrid: 70% personal score, 30% global score.
books_df['final_score'] = (
    (0.7 * books_df['safe_personal_score'])
    + (0.3 * books_df['global_score'])
)

# Recommendations: everything outside the training mask, best score first.
recs = books_df.loc[~train_mask].sort_values(by='final_score', ascending=False)

# Export the top 20.
recs.loc[
    :, ['title', 'authors', 'global_score', 'pred_rating', 'uncertainty', 'final_score']
].head(20).to_csv("data/final_recommendations.csv", index=False)
print("Done. Saved top 20 to data/final_recommendations.csv")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(title, embeddings, df, top_k=5):
    """Return the ``top_k`` books most similar to ``title`` by cosine similarity.

    Args:
        title: Exact title to look up in ``df['title']``; the first match is used.
        embeddings: 2-D array-like whose rows are aligned positionally with
            the rows of ``df``.
        df: DataFrame with at least 'title', 'authors' and 'genres' columns.
        top_k: Maximum number of recommendations to return.

    Returns:
        DataFrame with the 'title', 'authors' and 'genres' of the most similar
        books, most similar first. The query book itself is never included.

    Raises:
        IndexError: If no row of ``df`` has the given title.
    """
    # Use the row *position* of the match: embeddings and df.iloc are both
    # positional, while df.index labels need not be 0..n-1.
    matches = np.flatnonzero(df['title'].eq(title).to_numpy(dtype=bool, na_value=False))
    if matches.size == 0:
        raise IndexError(f"No book titled {title!r} found")
    query_pos = int(matches[0])

    sim_scores = cosine_similarity([embeddings[query_pos]], embeddings).flatten()

    # Rank by descending similarity and drop the query explicitly, rather than
    # assuming it is always the single best match.
    ranked = sim_scores.argsort()[::-1]
    top_indices = ranked[ranked != query_pos][:top_k]

    return df.iloc[top_indices][['title', 'authors', 'genres']]

# Test it out (final_embeddings rows are aligned with books_df)
# print(get_recommendations("The Way of Kings", final_embeddings, books_df))

In [None]:
# Cell-local imports: no visible cell imports `ast` or `datetime`, both used below.
import ast
from datetime import datetime

# NOTE(review): the column names used in this cell ('author', 'num_reviews',
# 'num_ratings', '5 stars', literal-list genres, ...) differ from the ones the
# earlier cells read from data/books.csv - confirm which file version it targets.
goodreads_export = pd.read_csv('data/goodreads_library_export.csv')
# goodreads_export = pd.read_csv('data/20-01-2025_goodreads_library_export.csv')
# goodreads_export = goodreads_export[['Book Id', 'Author', 'My Rating', 'Number of Pages', 'Original Publication Year']]
# goodreads_export = goodreads_export.rename(columns={'Book Id':'book_id',
#                                                     'Author': 'author',
#                                                     'My Rating': 'my_rating',
#                                                     'Number of Pages': 'num_pages',
#                                                     'Original Publication Year': 'year'})
# Page counts more than one standard deviation below the mean are treated as missing.
threshold = (goodreads_export['number_of_pages'].mean() - goodreads_export['number_of_pages'].std())
goodreads_export.loc[goodreads_export['number_of_pages'] < threshold, 'number_of_pages'] = np.nan

book_df = pd.read_csv('data/books.csv')
# book_df = pd.read_csv('data/01-2025_goodreads_scraped.csv')
df = goodreads_export.merge(book_df, on='book_id')

# Resolve competing columns: prefer the export's value (_x), fall back to the
# scraped value (_y), then drop both suffixed copies.
df['author'] = df['author_x'].fillna(df['author_y'])
df['num_pages'] = df['num_pages_x'].fillna(df['num_pages_y'])
df['year'] = df['year_x'].fillna(df['year_y'])
df = df.drop(columns=['author_x', 'author_y', 'num_pages_x', 'num_pages_y', 'year_x', 'year_y'])

# Genres are parsed from Python-literal strings; missing values become [].
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# Missing year / page counts are imputed with the column mean; missing review counts with 0.
df['year'] = df['year'].fillna(df['year'].mean()).round().astype(int)
df['num_pages'] = df['num_pages'].fillna(df['num_pages'].mean()).round().astype(int)
df['num_reviews'] = df['num_reviews'].fillna(0).round().astype(int)
# A rating of 0 is treated as missing.
df['my_rating'] = df['my_rating'].replace(0,np.nan)

df['age'] = datetime.now().year - df['year']
# Mean star rating recomputed from the per-star counts.
df['average_rating'] = ((df['5 stars'] * 5) + (df['4 stars'] * 4) + (df['3 stars'] * 3) + (df['2 stars'] * 2) + df['1 star']) / df['num_ratings']
df = df[['book_id', 'title', 'author', 'year', 'age', 'series', 'num_pages', 'genres', 'num_ratings', 'num_reviews', 'my_rating', 'average_rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star']]

In [None]:
def fit_quadratic(row):
    """Fit a degree-2 polynomial to five values taken at x = 1..5.

    Args:
        row: Five values, ordered from x = 1 to x = 5.

    Returns:
        pd.Series ``[a, b, c]`` with the coefficients of ``a*x**2 + b*x + c``
        as returned by ``np.polyfit`` (highest power first).
    """
    star_levels = np.arange(1, 6)
    coeffs = np.polyfit(star_levels, row, 2)
    return pd.Series(coeffs)

# Share of all ratings that fall on each star level (1..5).
for stars, count_col in enumerate(['1 star', '2 stars', '3 stars', '4 stars', '5 stars'], start=1):
    df[f'{stars}_star_percentage'] = df[count_col] / df['num_ratings']

# Quadratic model coefficients of each book's star distribution.
coefficients = df[[f'{stars}_star_percentage' for stars in range(1, 6)]].apply(fit_quadratic, axis=1)
df['a'] = coefficients[0]
df['b'] = coefficients[1]
df['c'] = coefficients[2]

# Pre-processing columns for rankings
df['num_ratings_ln'] = np.log1p(df['num_ratings'])
df['num_pages_ln'] = np.log1p(df['num_pages'])
# Shift each coefficient so its minimum is 0, then rescale into [1, 2].
for coeff, shifted_col in (('a', '2a_shifted'), ('b', 'b_shifted'), ('c', 'c_shifted')):
    df[shifted_col] = df[coeff] - df[coeff].min()
    df[shifted_col] = df[shifted_col] * (1 / df[shifted_col].max()) + 1

# Types of rankings
# The average rating is pulled toward the overall mean, more strongly when
# log1p(num_ratings) is small.
df['num_adjusted_rating'] = (
    df['average_rating']
    - (df['average_rating'] - df['average_rating'].mean()) / df['num_ratings_ln']
)
df['coeff_2a_rating'] = (df['num_adjusted_rating'] * df['2a_shifted'])
df['coeff_b_rating'] = (df['num_adjusted_rating']) / (df['b_shifted'])
df['coeff_c_rating'] = (df['num_adjusted_rating'] * df['c_shifted'])
df['joined_rating'] = (df['num_adjusted_rating'] * df['c_shifted'] * df['2a_shifted']) / df['b_shifted']
df['final_rating'] = (
    df['joined_rating']
    - (df['joined_rating'] - df['joined_rating'].mean()) / df['num_ratings_ln']
)

# Page-adjusted variants: each ranking divided by log1p(num_pages).
for ranking in ('num_adjusted', 'coeff_2a', 'coeff_b', 'coeff_c', 'joined'):
    df[f'{ranking}_page_rating'] = df[f'{ranking}_rating'] / df['num_pages_ln']
df['final_page_rating'] = (
    df['joined_page_rating']
    - (df['joined_page_rating'] - df['joined_page_rating'].mean()) / df['num_ratings_ln']
)

In [None]:
# Columns included in the correlation matrix, grouped by origin.
numeric_cols = [
    # book attributes
    'age', 'num_pages', 'num_pages_ln', 'num_ratings', 'num_ratings_ln', 'num_reviews',
    'my_rating', 'average_rating',
    # raw star counts and their shares
    '1 star', '2 stars', '3 stars', '4 stars', '5 stars',
    '1_star_percentage', '2_star_percentage', '3_star_percentage',
    '4_star_percentage', '5_star_percentage',
    # quadratic fit coefficients
    'a', 'b', 'c',
    # rankings
    'num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating', 'coeff_c_rating',
    'joined_rating', 'final_rating',
    # page-adjusted rankings
    'num_adjusted_page_rating', 'coeff_2a_page_rating', 'coeff_b_page_rating',
    'coeff_c_page_rating', 'joined_page_rating', 'final_page_rating',
]
corr_df = df[numeric_cols].corr()

# Annotated heatmap of the pairwise correlations.
# NOTE(review): `plt` and `sns` are not imported in any visible cell - confirm.
plt.figure(figsize=(20, 15))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Rank all books by the page-adjusted final rating and renumber the rows.
fresh = (
    df.sort_values(by='final_page_rating', ascending=False)
    .reset_index()
    .drop(columns='index')
)
# Keep only the books without a personal rating.
fresh = fresh.loc[fresh['my_rating'].isna()]
# Show the ones tagged with the chosen genre.
# Other genres to try: Fiction, Nonfiction, Memoir, Classics, History, Politics, Philosophy, Business
fresh[['Fiction' in genre_list for genre_list in fresh['genres']]]