In [None]:
import ast
from datetime import datetime

import numpy as np
import ollama
import pandas as pd
from tqdm import tqdm

# Load Data
# Name of the embedding model passed to ollama.embed further down.
OLLAMA_MODEL = "qwen3-embedding:0.6b"

# Personal library export: book_id is cast to str so it compares equal to the
# string book_id on the catalogue side of the join below.
goodreads_df = pd.read_csv('data/goodreads_library_export.csv')
goodreads_df = goodreads_df.assign(book_id=goodreads_df['book_id'].astype(str))

# Book catalogue: nullable Int32 keeps missing years as <NA> instead of forcing floats.
book_df = pd.read_csv("data/books.csv")
book_df = book_df.astype({'year': 'Int32', 'book_id': str})

# Left join: every catalogue row is kept; my_rating is NaN where the book is
# absent from the personal export.
book_df = pd.merge(
    book_df,
    goodreads_df[['book_id', 'my_rating']],
    how='left',
    on='book_id',
)

# Sentence embeddings
# Collect every distinct, non-empty genre label from the pipe-separated `genres`
# column. The isinstance check drops the NaN produced by rows with no genres.
exploded_genres = book_df['genres'].str.split('|').explode().str.strip()
all_genres = [g for g in exploded_genres.unique() if isinstance(g, str) and len(g) > 0]

# Number of genre strings sent to the embedding model per request.
BATCH_SIZE = 64

# Maps genre label -> embedding vector (np.ndarray).
genre_embeddings = {}
for i in tqdm(range(0, len(all_genres), BATCH_SIZE), desc="Embedding genres"):
    batch = all_genres[i : i + BATCH_SIZE]
    response = ollama.embed(model=OLLAMA_MODEL, input=batch)
    vectors = response['embeddings']
    # zip() would silently drop genres if the response were short; fail loudly instead.
    if len(vectors) != len(batch):
        raise ValueError(
            f"Expected {len(batch)} embeddings for batch starting at {i}, got {len(vectors)}"
        )
    for genre, vector in zip(batch, vectors):
        genre_embeddings[genre] = np.array(vector)

# Show a compact summary instead of dumping every vector into the cell output.
f"{len(genre_embeddings)} genres embedded"

In [None]:
from node2vec import Node2Vec
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel
import networkx as nx
import umap


# 2. Graph Embeddings (Structure)
# Dimensionality of the node2vec vectors; also used for the zero-vector fallback below.
GRAPH_DIM = 64

G = nx.Graph()
G.add_nodes_from(book_df['book_id'].tolist())

# Built once: a set gives constant-time membership tests, instead of rescanning
# the whole book_id column for every candidate edge.
known_book_ids = set(book_df['book_id'])

# Edge Type A: "Similar Books" (from Goodreads)
# Each entry of `similar_books` is split on '|', and the part before the first ':'
# is taken as the target book id. Only targets present in book_df become edges.
for source_id, similar_books in zip(book_df['book_id'], book_df['similar_books']):
    if pd.isna(similar_books):
        continue
    for item in similar_books.split('|'):
        target_id = item.split(':')[0]
        if target_id in known_book_ids:
            G.add_edge(source_id, target_id, weight=1.0)

# # Edge Type B: Series (Strong connection)
# # Connecting books in the same series strengthens the graph significantly
# series_groups = book_df[book_df['series'].notna()].groupby('series')['book_id'].apply(list)
# for books in series_groups:
#     for i in range(len(books)-1):
#         G.add_edge(books[i], books[i+1], weight=10.0) # High weight for series

node2vec = Node2Vec(G, dimensions=GRAPH_DIM, walk_length=10, num_walks=50, workers=4, quiet=True)
model_n2v = node2vec.fit(window=10, min_count=1, batch_words=4)

# One row per book, in book_df order; books missing from the model get a zero vector.
graph_embeddings = np.array([
    model_n2v.wv[bid] if bid in model_n2v.wv else np.zeros(GRAPH_DIM)
    for bid in book_df['book_id']
])

# 3. Concatenate
# Each embedding block is L2-normalised row-wise, then the blocks are placed
# side by side so every book gets one combined feature vector.
# NOTE(review): `content_embeddings` is not assigned in any cell visible here, so
# this only runs if it already exists in the kernel — confirm where it is built.
final_embeddings = np.concatenate(
    [normalize(block, norm='l2') for block in (content_embeddings, graph_embeddings)],
    axis=1,
)

# PART B: The Global Score (Pop-Rank)

# C: mean average rating over all books, used as the prior every book is shrunk towards.
C = book_df['avg_rating'].mean()
m = book_df['review_count'].quantile(0.10) # 10th percentile as minimum votes

def weighted_rating(x, m=m, C=C):
    """Return a review-count-weighted blend of a book's rating and the global mean.

    Computes (v / (v + m)) * R + (m / (v + m)) * C, where v is the row's
    `review_count` and R its `avg_rating`.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing 'review_count' and 'avg_rating'.
    m : float
        Weight given to the prior; defaults to the value of `m` at definition time.
    C : float
        Prior mean rating; defaults to the value of `C` at definition time.

    Returns
    -------
    float
        The blended score, or C when the row carries no usable evidence
        (zero or missing review count, or missing average rating).
    """
    v = x['review_count']
    R = x['avg_rating']
    # Missing values are checked first: comparing pd.NA with 0 is ambiguous in an
    # `if`, and a NaN in either field would otherwise turn the whole score into NaN.
    if pd.isna(v) or pd.isna(R) or v == 0:
        return C
    return (v/(v+m) * R) + (m/(v+m) * C)

book_df['global_score'] = book_df.apply(weighted_rating, axis=1)

# PART C: The Personal Score (UMAP + GPR)

# 1. Semi-Supervised UMAP
# We need a target array in which -1 signals "unlabeled" to UMAP. UMAP applies that
# masking only for a categorical target: with a continuous target metric such as
# 'l1', -1 is an ordinary value and all unrated books are pulled together as if they
# shared a rating of -1. The ratings are therefore passed as integer class labels.
# Anything that is not a positive rating (NaN or 0) is marked -1, matching the
# `my_rating > 0` rule used to pick the regressor's training rows.
# Trade-off: classes are unordered, so UMAP no longer treats 4 as closer to 5 than to 1.
rating_labels = book_df['my_rating']
is_rated = rating_labels.notna() & (rating_labels > 0)
y_umap = rating_labels.where(is_rated, -1).round().astype(int).values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(final_embeddings)

reducer = umap.UMAP(
    n_neighbors=15,
    n_components=10,             # Reduce to 10 dims for the regressor
    metric='cosine',
    target_metric='categorical', # Required for -1 to be treated as "unlabeled"
    target_weight=0.5,           # Balance structural shape vs rating shape
    random_state=42
)

print("Running UMAP...")
X_reduced = reducer.fit_transform(X_scaled, y=y_umap)

# 2. Gaussian Process Regression
# Training rows are the books that carry a positive personal rating; the fitted
# model is then evaluated on every row, rated or not.
train_mask = (book_df['my_rating'] > 0) & book_df['my_rating'].notna()
X_train, y_train = X_reduced[train_mask], book_df.loc[train_mask, 'my_rating'].values

# Kernel: Matern handles irregularities better than RBF. WhiteKernel handles noise.
kernel = Matern(nu=1.5) + WhiteKernel(noise_level=0.1)
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)

print("Fitting GPR...")
gpr.fit(X_train, y_train)

# Predict on ALL books: posterior mean plus its standard deviation per row.
means, stds = gpr.predict(X_reduced, return_std=True)
book_df = book_df.assign(pred_rating=means, uncertainty=stds)

# PART D: Final Hybrid Scoring

# safe_personal_score: prediction minus half its standard deviation, so an
#   uncertain 5.0 (std=1.0) is treated as 4.5.
# final_score: 70% personal taste, 30% global quality.
book_df = book_df.assign(
    safe_personal_score=lambda d: d['pred_rating'] - (d['uncertainty'] * 0.5),
    final_score=lambda d: (0.7 * d['safe_personal_score']) + (0.3 * d['global_score']),
)

# Keep only rows outside the training set (i.e. not yet rated), best score first.
recs = book_df.loc[~train_mask].sort_values(by='final_score', ascending=False)

# Export the 20 highest-scoring recommendations.
export_columns = ['title', 'authors', 'global_score', 'pred_rating', 'uncertainty', 'final_score']
recs.head(20)[export_columns].to_csv("data/final_recommendations.csv", index=False)
print("Done. Saved top 20 to data/final_recommendations.csv")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(title, embeddings, df, top_k=5):
    """Return the `top_k` books whose embeddings are most similar to `title`'s.

    Parameters
    ----------
    title : str
        Exact title to look up in df['title']; the first match is used.
    embeddings : array-like
        One embedding per row of `df`, in the same row order.
    df : pandas.DataFrame
        Must provide 'title', 'authors' and 'genres' columns.
    top_k : int, default 5
        Number of neighbours to return; values below 1 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        The 'title', 'authors' and 'genres' of the nearest books by cosine
        similarity, most similar first, never including the query row itself.

    Raises
    ------
    IndexError
        If no row of `df` has the requested title.
    """
    # Work with row positions, not index labels: `embeddings` and `.iloc` are both
    # positional, so a non-default df index must not leak into the lookup.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No book titled {title!r} found in df")
    query_pos = int(matches[0])

    sim_scores = cosine_similarity([embeddings[query_pos]], embeddings).flatten()

    # Rank by descending similarity, then drop the query row explicitly rather than
    # assuming it is the single top hit (a tie would otherwise drop a real neighbour).
    ranked = np.argsort(-sim_scores, kind='stable')
    top_indices = ranked[ranked != query_pos][:max(top_k, 0)]

    return df.iloc[top_indices][['title', 'authors', 'genres']]

# Test it out
# print(get_recommendations("The Way of Kings", final_embeddings, df))

In [None]:
# Re-read the Goodreads library export and display it for inspection.
# This rebinds `goodreads_df`, so any earlier in-memory changes to it are replaced
# by the freshly parsed CSV.
goodreads_df = pd.read_csv(filepath_or_buffer='data/goodreads_library_export.csv')
goodreads_df

In [None]:
# Source column -> short name used in the rest of the notebook. The key order is
# also the column order of the resulting frame.
column_renames = {
    'Book Id': 'book_id',
    'Author': 'author',
    'My Rating': 'my_rating',
    'Number of Pages': 'num_pages',
    'Original Publication Year': 'year',
}

# NOTE(review): `new_export_path` is not assigned in any cell visible here — it has
# to exist in the kernel before this cell runs.
goodreads_export = (
    pd.read_csv(new_export_path)
    # goodreads_export = pd.read_csv('data/20-01-2025_goodreads_library_export.csv')
    # Fall back to the edition's year when the original publication year is missing.
    .assign(**{
        'Original Publication Year':
            lambda d: d['Original Publication Year'].fillna(d['Year Published'])
    })
    .loc[:, list(column_renames)]
    .rename(columns=column_renames)
)

# Page counts more than one standard deviation below the mean are blanked out (NaN).
threshold = goodreads_export['num_pages'].mean() - goodreads_export['num_pages'].std()
too_short = goodreads_export['num_pages'] < threshold
goodreads_export.loc[too_short, 'num_pages'] = np.nan

# Load the scraped book data and join it to the personal export on book_id.
# NOTE(review): `this_months_scrape_path` is not assigned in any cell visible here —
# it has to exist in the kernel before this cell runs.
book_df = pd.read_csv(this_months_scrape_path)
# book_df = pd.read_csv('data/01-2025_goodreads_scraped.csv')
df = goodreads_export.merge(book_df, on='book_id')

# Resolve competing columns: prefer the export's value (_x) and fall back to the
# scraped value (_y), then drop the suffixed originals.
for shared in ('author', 'num_pages', 'year'):
    df[shared] = df[f'{shared}_x'].fillna(df[f'{shared}_y'])
df = df.drop(columns=['author_x', 'author_y', 'num_pages_x', 'num_pages_y', 'year_x', 'year_y'])

# `genres` is parsed from its string form into a Python object; missing -> empty list.
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# Remaining gaps: mean-impute year and page count, treat missing review counts as 0.
df['year'] = df['year'].fillna(df['year'].mean()).round().astype(int)
df['num_pages'] = df['num_pages'].fillna(df['num_pages'].mean()).round().astype(int)
df['num_reviews'] = df['num_reviews'].fillna(0).round().astype(int)
# A rating of 0 is converted to NaN so it is handled as missing from here on.
df['my_rating'] = df['my_rating'].replace(0,np.nan)

df['age'] = datetime.now().year - df['year']
# Mean star rating recomputed from the per-star counts.
df['average_rating'] = ((df['5 stars'] * 5) + (df['4 stars'] * 4) + (df['3 stars'] * 3) + (df['2 stars'] * 2) + df['1 star']) / df['num_ratings']
# .copy() makes the selection an independent frame, so the columns added to `df`
# in later cells do not trigger SettingWithCopyWarning.
df = df[['book_id', 'title', 'author', 'year', 'age', 'series', 'num_pages', 'genres', 'num_ratings', 'num_reviews', 'my_rating', 'average_rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star']].copy()

In [None]:
def fit_quadratic(row):
    """Fit y = a*x**2 + b*x + c to five values placed at x = 1, 2, 3, 4, 5.

    Parameters
    ----------
    row : array-like of 5 numbers
        The y-values, in order, for x = 1..5.

    Returns
    -------
    pandas.Series
        [a, b, c] (highest power first) under integer labels 0, 1, 2.
        All three are NaN when `row` contains a NaN or infinite value.
    """
    x = np.array([1, 2, 3, 4, 5])
    y = np.asarray(row, dtype=float)
    # A least-squares fit on non-finite data is meaningless and can raise inside
    # the solver; return an explicit "no fit" result instead.
    if not np.isfinite(y).all():
        return pd.Series([np.nan, np.nan, np.nan])
    a, b, c = np.polyfit(x, y, 2)
    return pd.Series([a, b, c])

# Calculating quadratic modeling coefficients
# Share column -> the raw count column it is derived from (count / num_ratings).
star_share_columns = {
    '1_star_percentage': '1 star',
    '2_star_percentage': '2 stars',
    '3_star_percentage': '3 stars',
    '4_star_percentage': '4 stars',
    '5_star_percentage': '5 stars',
}
for share_column, count_column in star_share_columns.items():
    df[share_column] = df[count_column] / df['num_ratings']

# One quadratic fit per book over its five star shares; the three fitted
# coefficients come back as columns 0, 1 and 2.
coefficients = df[list(star_share_columns)].apply(fit_quadratic, axis=1)
for position, coefficient_name in enumerate(('a', 'b', 'c')):
    df[coefficient_name] = coefficients[position]

# Pre-processing columns for rankings
def _shift_and_rescale(values):
    """Subtract the minimum, divide by the resulting maximum, then add 1."""
    shifted = values - values.min()
    return shifted * (1 / shifted.max()) + 1

# log(1 + x) versions of the two count-like columns.
for raw_column, log_column in (('num_ratings', 'num_ratings_ln'), ('num_pages', 'num_pages_ln')):
    df[log_column] = np.log1p(df[raw_column])

# Each fitted coefficient gets a rescaled companion column.
for coefficient, shifted_column in (('a', '2a_shifted'), ('b', 'b_shifted'), ('c', 'c_shifted')):
    df[shifted_column] = _shift_and_rescale(df[coefficient])

# Types of rankings
def _pull_toward_mean(scores, log_votes):
    """Move each score toward the column mean by (score - mean) / log_votes."""
    return scores - (scores - scores.mean()) / log_votes

# Average rating pulled toward the overall mean, scaled by log(1 + num_ratings).
df['num_adjusted_rating'] = _pull_toward_mean(df['average_rating'], df['num_ratings_ln'])

# Variants that fold in one rescaled quadratic coefficient each, then all three.
df['coeff_2a_rating'] = df['num_adjusted_rating'] * df['2a_shifted']
df['coeff_b_rating'] = df['num_adjusted_rating'] / df['b_shifted']
df['coeff_c_rating'] = df['num_adjusted_rating'] * df['c_shifted']
df['joined_rating'] = df['num_adjusted_rating'] * df['c_shifted'] * df['2a_shifted'] / df['b_shifted']
df['final_rating'] = _pull_toward_mean(df['joined_rating'], df['num_ratings_ln'])

# Page-normalised counterparts: each rating divided by log(1 + num_pages).
page_rating_columns = {
    'num_adjusted_rating': 'num_adjusted_page_rating',
    'coeff_2a_rating': 'coeff_2a_page_rating',
    'coeff_b_rating': 'coeff_b_page_rating',
    'coeff_c_rating': 'coeff_c_page_rating',
    'joined_rating': 'joined_page_rating',
}
for rating_column, page_column in page_rating_columns.items():
    df[page_column] = df[rating_column] / df['num_pages_ln']
df['final_page_rating'] = _pull_toward_mean(df['joined_page_rating'], df['num_ratings_ln'])

In [None]:
# Columns included in the pairwise correlation matrix.
numeric_cols = [
    'age', 'num_pages', 'num_pages_ln', 'num_ratings', 'num_ratings_ln', 'num_reviews',
    'my_rating', 'average_rating',
    '1 star', '2 stars', '3 stars', '4 stars', '5 stars',
    '1_star_percentage', '2_star_percentage', '3_star_percentage', '4_star_percentage', '5_star_percentage',
    'a', 'b', 'c',
    'num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating', 'coeff_c_rating',
    'joined_rating', 'final_rating',
    'num_adjusted_page_rating', 'coeff_2a_page_rating', 'coeff_b_page_rating', 'coeff_c_page_rating',
    'joined_page_rating', 'final_page_rating',
]
corr_df = df.loc[:, numeric_cols].corr()

# Annotated heatmap of the correlation matrix.
# NOTE(review): `plt` and `sns` are not imported in any cell visible here — confirm
# matplotlib.pyplot / seaborn are imported before this cell runs.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Rank every book by final_page_rating (best first), renumber the rows, and keep
# only the ones without a personal rating.
ranked_books = df.sort_values(by='final_page_rating', ascending=False).reset_index().drop(columns='index')
fresh = ranked_books[ranked_books['my_rating'].isna()]

# Display the unrated books tagged with the chosen genre.
# Other genres to try: Fiction, Nonfiction, Memoir, Classics, History, Politics, Philosophy, Business
has_genre = ['Fiction' in genre_list for genre_list in fresh['genres']]
fresh[has_genre]

In [None]:
# Scratch cell: load books_data.csv from the working directory and display it.
test = pd.read_csv(filepath_or_buffer='books_data.csv')
test