In [None]:
# Load and prep data
import pandas as pd
import numpy as np

# Scraped book metadata. The per-star count columns are summed into a total
# number of ratings per book.
books_df = pd.read_csv("data/books.csv")
# NOTE(review): endswith('star') would skip plural names such as '5 stars'
# (the spelling used by the legacy cells further down) — confirm the column
# naming in data/books.csv so that rating_count really covers all five buckets.
star_cols = [c for c in books_df.columns if c.endswith('star')]
books_df['rating_count'] = books_df[star_cols].sum(axis=1)

# Todo: Fix the crawler to split langs with "|" by default
# Languages arrive ";"-separated; trim each entry and re-join with "|".
# Non-string values (e.g. NaN) are passed through untouched.
books_df['lang'] = [
    "|".join(item.strip() for item in x.split(";")) if isinstance(x, str) else x
    for x in books_df['lang']
]

# Flatten descriptions to a single line. One regex pass collapses every run of
# whitespace (newlines, tabs, repeated spaces) into a single space; the previous
# chain of literal replacements left double spaces behind for longer runs.
books_df['description'] = books_df['description'].str.replace(r'\s+', ' ', regex=True)

# Personal ratings from the Goodreads export; a rating of 0 is turned into a
# missing value before merging.
goodreads_df = pd.read_csv('data/goodreads_library_export.csv')
goodreads_df['my_rating'] = goodreads_df['my_rating'].astype('UInt8')
goodreads_df['my_rating'] = goodreads_df['my_rating'].replace(0, np.nan)

# Left merge keeps every scraped book; books absent from the export get a
# missing my_rating.
books_df = books_df.merge(goodreads_df[['book_id', 'my_rating']], on='book_id', how='left')
# books_df = books_df[~books_df['similar_books'].isna()].reset_index(drop=True)

In [None]:
# Prep embedding strings
def format_string_for_embedding(items, kind=None, truncate=0):
    """Render a list of strings as an English enumeration, optionally labelled.

    Args:
        items: List of strings to enumerate. Any value that is not a non-empty
            list (e.g. NaN, None, []) yields an empty string.
        kind: Optional label. When given, the result is prefixed with the
            capitalised label, pluralised with "s" if more than one item is
            rendered, followed by ": ".
        truncate: Maximum number of items to render. Values <= 0 keep every
            item. Previously ``truncate=N`` emitted N + 1 items.

    Returns:
        "a" for one item, "a and b" for two, "a, b, and c" for three or more,
        with the optional "Kind(s): " prefix; "" for empty or non-list input.
    """
    if not isinstance(items, list) or len(items) == 0:
        return ""

    # Keep at most `truncate` leading items; 0 or negative disables truncation.
    shown = items[:truncate] if truncate > 0 else items
    n = len(shown)
    if n == 1:
        res = shown[0]
    else:
        # Oxford comma only once there are three or more items.
        res = f"{', '.join(shown[:-1])}{',' if n > 2 else ''} and {shown[-1]}"

    prefix = f"{kind.capitalize()}{'s' if n > 1 else ''}: " if kind else ""
    return f"{prefix}{res}"

# Authors: split the "|"-separated field and render it as an enumeration
# (unlabelled, with truncate=4).
books_df['authors_post'] = (
    books_df['authors']
    .str.split('|')
    .apply(lambda names: format_string_for_embedding(names, truncate=4))
)

# Genres: same treatment, labelled "Genre"/"Genres".
books_df['genres_post'] = (
    books_df['genres']
    .str.split('|')
    .apply(lambda names: format_string_for_embedding(names, kind='genre'))
)

# Description: a string becomes a one-element list (rendered as
# "Description: ..."); anything else becomes an empty list and renders as "".
books_df['desc_post'] = books_df['description'].apply(
    lambda desc: format_string_for_embedding(
        [desc] if isinstance(desc, str) else [],
        kind='description',
    )
)

def join_embedding_parts(title, authors, genres, desc):
    """Assemble the text that is sent to the embedding model for one book.

    The title line is always present; the author, genre and description
    segments are appended only when their (pre-formatted) value is truthy.
    Every segment except the description ends with a newline.
    """
    segments = [f"Book: {title}\n"]
    if authors:
        segments.append(f"Written by: {authors}\n")
    if genres:
        segments.append(f"{genres}\n")
    if desc:
        segments.append(f"{desc}")
    return "".join(segments)

# One embedding string per book, built row-wise from the formatted parts.
books_df['embedding_input'] = list(map(
    join_embedding_parts,
    books_df['title'],
    books_df['authors_post'],
    books_df['genres_post'],
    books_df['desc_post'],
))
# Lookup from book_id to its embedding string.
id_to_string = books_df.set_index('book_id')['embedding_input']

In [None]:
# Embed sentences
import os
import ollama
from tqdm import tqdm
import torch
from sklearn.preprocessing import Normalizer

# Model selection: PARAMS is interpolated into both the Ollama model tag and
# the cache file name, so each model size gets its own embeddings CSV.
PARAMS = 8
OLLAMA_MODEL = f"qwen3-embedding:{PARAMS}b"
MIN_DIMENSIONS = 32

# Pick the embedding width from the number of rated books: start at
# MIN_DIMENSIONS and keep doubling while twice the value is still below the
# number of rated rows.
train_size = (~books_df['my_rating'].isna()).sum()
mrl_dimensions = MIN_DIMENSIONS
while mrl_dimensions*2 < train_size:
    mrl_dimensions*=2

# Load cached embeddings if present. Column labels are read back from CSV as
# strings, so they are cast to int; rows whose first component is missing are
# dropped and will be re-embedded below.
embeddings_path = f'data/{PARAMS}b_embeddings.csv'
if os.path.exists(embeddings_path):
    embeddings = pd.read_csv(embeddings_path).set_index('book_id')
    embeddings.columns = embeddings.columns.astype(int)
    embeddings = pd.DataFrame(embeddings).dropna(subset=[0])
else:
    embeddings = pd.DataFrame()

# Embed only the books that are not in the cache yet, in batches of 128, then
# append them to the cache and rewrite the CSV once all batches have finished.
current_ids = books_df['book_id'].values
missing_ids = [idx for idx in current_ids if idx not in embeddings.index]
if missing_ids:
    missing_strings = id_to_string.loc[missing_ids].tolist()
    batch_size = 128
    new_embeddings = []
    for i in tqdm(range(0, len(missing_strings), batch_size)):
        batch = missing_strings[i : i + batch_size]
        response = ollama.embed(model=OLLAMA_MODEL, input=batch)
        new_embeddings.extend(response['embeddings'])

    new_embeddings = pd.DataFrame(new_embeddings, index=missing_ids)
    new_embeddings.index.name = 'book_id'
    embeddings = pd.concat([embeddings, new_embeddings])
    embeddings.to_csv(embeddings_path)
    # Temporaries are only released on this branch (current_ids stays defined
    # when nothing was missing).
    del new_embeddings, missing_strings, missing_ids, current_ids

# Reorder rows to match books_df, keep the leading mrl_dimensions*2 components,
# L2-normalise each row and convert to a float32 tensor. From here on
# `embeddings` is a torch tensor, no longer a DataFrame.
embeddings = embeddings.loc[books_df['book_id']].values
embeddings = embeddings[:, :mrl_dimensions*2] # qwen-3 embedding supports MRL, so a prefix slice is used as a shorter embedding
norm = Normalizer(norm='l2')
embeddings = norm.fit_transform(embeddings)
embeddings = torch.tensor(embeddings, dtype=torch.float32)

In [None]:
# Build adjacency matrix
from torch_geometric.utils import add_self_loops
from torch_geometric.nn.conv.gcn_conv import gcn_norm

# Row position of every book_id; used both to place the current book and to
# resolve (and filter) the ids listed in `similar_books`.
id_to_idx = {book_id: i for i, book_id in enumerate(books_df['book_id'])}

# Collect undirected edges between each book and those of its "similar books"
# that are present in books_df. Entries are "|"-separated and the target id is
# the part before the first ":".
edge_indices = []
for book_id, similar in tqdm(zip(books_df['book_id'], books_df['similar_books']), total=len(books_df)):
    if pd.isna(similar):
        continue
    current_idx = id_to_idx[book_id]
    for item in similar.split('|'):
        try:
            target_id = int(item.split(':')[0])
        except ValueError:
            # Malformed entry (non-numeric id): skip it.
            continue
        # Dict lookup instead of scanning the whole book_id column per entry.
        target_idx = id_to_idx.get(target_id)
        if target_idx is None:
            continue
        # Both directions are stored. A pair that lists each other therefore
        # appears more than once in edge_indices.
        edge_indices.append([current_idx, target_idx])
        edge_indices.append([target_idx, current_idx])

if not edge_indices:
    edge_index = torch.tensor([[], []], dtype=torch.long)
else:
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()

# Add self loops, apply torch_geometric's GCN normalisation and store the
# result as a sparse (num_books x num_books) matrix used for propagation.
edge_index_with_loops, _ = add_self_loops(edge_index, num_nodes=embeddings.size(0))
edge_index_norm, edge_weight_norm = gcn_norm(edge_index_with_loops, num_nodes=embeddings.size(0))
adj_matrix = torch.sparse_coo_tensor(edge_index_norm, edge_weight_norm, (embeddings.size(0), embeddings.size(0)))

In [None]:
import nevergrad as ng
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.metrics import ndcg_score, mean_squared_error
from scipy.stats import spearmanr

def objective(num_propagations,
              knn_neighbors,
              brr_alpha_1, brr_alpha_2, brr_lambda_1, brr_lambda_2, brr_uncertainty_penalty,
              svr_C, svr_epsilon,
              knn_weight, brr_weight
              ):
    """Cross-validated loss of the KNN + BayesianRidge + SVR ensemble.

    Reads the notebook globals ``precomputed_embeddings``, ``my_ratings_mask``
    and ``my_ratings``.

    Args:
        num_propagations: Index into ``precomputed_embeddings``.
        knn_neighbors: ``n_neighbors`` for the cosine, distance-weighted KNN.
        brr_alpha_1, brr_alpha_2, brr_lambda_1, brr_lambda_2: BayesianRidge
            hyper-parameters of the same names.
        brr_uncertainty_penalty: Multiple of the BayesianRidge predictive std
            subtracted from its mean prediction.
        svr_C, svr_epsilon: RBF SVR hyper-parameters.
        knn_weight: Ensemble weight of the KNN prediction.
        brr_weight: Fraction of the remaining ``1 - knn_weight`` given to
            BayesianRidge; SVR receives the rest.

    Returns:
        ``mse + (1 - ndcg) + (1 - spearman)`` computed on the out-of-fold
        predictions (clipped to [1, 5]) of a 5-fold split; lower is better.
    """
    all_embeddings = precomputed_embeddings[num_propagations]
    X = all_embeddings[my_ratings_mask]
    y = my_ratings

    # Resolve the ensemble weights once, before the fold loop. Rescaling
    # `brr_weight` in place inside the loop compounded the factor on every
    # fold, so each fold was blended with different weights.
    remaining_weight = 1 - knn_weight
    brr_share = brr_weight * remaining_weight
    svr_share = remaining_weight - brr_share

    y_trues = []
    y_preds = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        knn = KNeighborsRegressor(
            n_neighbors=knn_neighbors,
            metric='cosine',
            weights='distance',
            n_jobs=-1,
            )
        knn.fit(X_train, y_train)
        knn_pred = knn.predict(X_test)

        brr = BayesianRidge(
            alpha_1=brr_alpha_1,
            alpha_2=brr_alpha_2,
            lambda_1=brr_lambda_1,
            lambda_2=brr_lambda_2,
            compute_score=True
            )
        brr.fit(X_train, y_train)
        brr_mu, brr_std = brr.predict(X_test, return_std=True)
        # Penalise uncertain predictions by shifting them down.
        brr_pred = brr_mu - (brr_uncertainty_penalty * brr_std)

        svr = SVR(
            kernel='rbf',
            gamma='scale',
            C=svr_C,
            epsilon=svr_epsilon
            )
        svr.fit(X_train, y_train)
        svr_pred = svr.predict(X_test)

        final_pred = (knn_weight * knn_pred) + (brr_share * brr_pred) + (svr_share * svr_pred)

        y_trues.append(y_test)
        y_preds.append(final_pred)

    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    y_preds = np.clip(y_preds, 1, 5)

    mse = mean_squared_error(y_trues, y_preds)
    ndcg = ndcg_score([y_trues], [y_preds])
    # Spearman is undefined for constant predictions; treat that (and a NaN
    # result) as zero correlation.
    if np.std(y_preds) < 1e-9:
        spearman = 0.0
    else:
        spearman, _ = spearmanr(y_trues, y_preds)
        if np.isnan(spearman): spearman = 0.0

    return mse + (1.0 - ndcg) + (1.0 - spearman)

# Precompute the embeddings after 0..MAX_PROPAGATIONS multiplications by the
# normalised adjacency matrix. Each stage is L2-normalised row-wise and stored
# as a numpy array; index k holds the result after k propagations.
MAX_PROPAGATIONS = 2
propagated = embeddings.clone()
norm_l2 = Normalizer(norm='l2')
precomputed_embeddings = [norm_l2.transform(propagated.numpy())]

for _ in range(MAX_PROPAGATIONS):
    propagated = torch.sparse.mm(adj_matrix, propagated)
    precomputed_embeddings.append(norm_l2.transform(propagated.numpy()))
del propagated

# Training rows are the books that carry a personal rating.
my_ratings_mask = ~books_df['my_rating'].isna()
my_ratings = books_df.loc[my_ratings_mask, 'my_rating'].values

# Search space handed to nevergrad; keyword names match objective()'s parameters.
# NOTE(review): num_propagations starts at 1 here, so precomputed_embeddings[0]
# (no propagation) is never tried, although the recorded runs report
# 'num_propagations': 0 — confirm which lower bound is intended.
parametrization = ng.p.Instrumentation(
    num_propagations = ng.p.Scalar(lower=1, upper=MAX_PROPAGATIONS).set_integer_casting(),

    knn_neighbors = ng.p.Scalar(lower=3, upper=mrl_dimensions//2).set_integer_casting(),
    brr_alpha_1=ng.p.Scalar(lower=1e-7, upper=1e-3),
    brr_alpha_2=ng.p.Scalar(lower=1e-7, upper=1e-3),
    brr_lambda_1=ng.p.Log(lower=1e-6, upper=1e-1),
    brr_lambda_2=ng.p.Log(lower=1e-6, upper=1e-1),
    brr_uncertainty_penalty=ng.p.Scalar(lower=0, upper=2.0),
    svr_C = ng.p.Log(lower=1e-3, upper=1e2),
    svr_epsilon = ng.p.Scalar(lower=0.0, upper=1.0),

    knn_weight=ng.p.Scalar(lower=0, upper=1), 
    brr_weight=ng.p.Scalar(lower=0, upper=1), 
)

# Manual ask/tell loop so the progress bar can show the best loss seen so far
# (the description is refreshed every 10th iteration).
BUDGET = 300
optimizer = ng.optimizers.NGOpt(parametrization=parametrization, budget=BUDGET)
best_loss = float('inf')
with tqdm(total=BUDGET) as pbar:
    for i in range(BUDGET):
        x = optimizer.ask()
        loss = objective(*x.args, **x.kwargs)
        optimizer.tell(x, loss)
        if loss < best_loss:
            best_loss = loss
        pbar.update(1)
        if i % 10 == 0:
            pbar.set_description(f"Best Loss: {best_loss:.4f}")

# Keyword arguments of the optimizer's recommended candidate.
best_params = optimizer.provide_recommendation().kwargs

In [None]:
# Results pasted from earlier optimisation runs, one bare string literal per
# configuration (model size in billions of parameters, "MRL sliced" vs "full"
# embeddings). The literals are no-op expression statements; only the trailing
# `best_params` is displayed by the cell.
"""8 MRL sliced
Best Loss: 1.5714: 100%|██████████| 300/300 [11:49<00:00,  2.37s/it]
{'num_propagations': 0,
 'knn_neighbors': 17,
 'brr_alpha_1': 0.0006421576956932416,
 'brr_alpha_2': 0.0004509358497670652,
 'brr_lambda_1': 8.24637589893822e-06,
 'brr_lambda_2': 0.0001968595842801841,
 'brr_uncertainty_penalty': 1.3679433872638418,
 'svr_C': 0.004026523432673821,
 'svr_epsilon': 0.3118081208045217,
 'knn_weight': 0.5805897543886746,
 'brr_weight': 0.7793002684038448}"""

"""8 full
Best Loss: 1.6065: 100%|██████████| 300/300 [16:57<00:00,  3.39s/it]
 {'num_propagations': 0,
 'knn_neighbors': 36,
 'brr_alpha_1': 0.0006880608281295565,
 'brr_alpha_2': 0.0003014766980578955,
 'brr_lambda_1': 1.3770534826263817e-05,
 'brr_lambda_2': 0.005746291033999427,
 'brr_uncertainty_penalty': 1.8570930538472121,
 'svr_C': 0.03383255010384741,
 'svr_epsilon': 0.44291277801367485,
 'knn_weight': 0.5848794696920626,
 'brr_weight': 0.5411997331122358}"""

"""0.6 MRL sliced
Best Loss: 1.5748: 100%|██████████| 300/300 [10:29<00:00,  2.10s/it]
{'num_propagations': 0,
 'knn_neighbors': 37,
 'brr_alpha_1': 0.0002978589454364143,
 'brr_alpha_2': 0.0005679816409316726,
 'brr_lambda_1': 0.0003994602308099561,
 'brr_lambda_2': 0.003659493431608586,
 'brr_uncertainty_penalty': 1.3409360588302992,
 'svr_C': 0.009413204523151376,
 'svr_epsilon': 0.5861865063902226,
 'knn_weight': 0.6849389456290866,
 'brr_weight': 0.7766436222538672}"""

"""0.6 full
Best Loss: 1.5926: 100%|██████████| 300/300 [12:22<00:00,  2.48s/it]
{'num_propagations': 0,
 'knn_neighbors': 55,
 'brr_alpha_1': 0.0007517429857762481,
 'brr_alpha_2': 0.0003941767071639492,
 'brr_lambda_1': 8.604754211823616e-06,
 'brr_lambda_2': 0.008083208576838542,
 'brr_uncertainty_penalty': 1.048016018935488,
 'svr_C': 0.013400702917698843,
 'svr_epsilon': 0.4149283525703265,
 'knn_weight': 0.6826121601964188,
 'brr_weight': 0.8414845392757134}"""
best_params

In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
import time

# Refit the three models on every rated book with the tuned hyper-parameters
# and predict a rating for all books (rated and unrated).
all_embeddings = precomputed_embeddings[best_params['num_propagations']]

X_train = all_embeddings[my_ratings_mask]
y_train = my_ratings

knn = KNeighborsRegressor(
    n_neighbors=best_params['knn_neighbors'],
    metric='cosine', 
    weights='distance',
    n_jobs=-1,
    )
knn.fit(X_train, y_train)
knn_pred = knn.predict(all_embeddings)

brr = BayesianRidge(
    alpha_1=best_params['brr_alpha_1'],
    alpha_2=best_params['brr_alpha_2'],
    lambda_1=best_params['brr_lambda_1'],
    lambda_2=best_params['brr_lambda_2'],
    compute_score=True
    )
brr.fit(X_train, y_train)
brr_mu, brr_std = brr.predict(all_embeddings, return_std=True)
# Mean prediction shifted down by a multiple of the predictive std.
brr_pred = brr_mu - (best_params['brr_uncertainty_penalty'] * brr_std)

svr = SVR(
    kernel='rbf',
    gamma='scale',
    C=best_params['svr_C'], 
    epsilon=best_params['svr_epsilon']
    ) 
svr.fit(X_train, y_train)
svr_pred = svr.predict(all_embeddings)

knn_weight = best_params['knn_weight']
brr_weight = best_params['brr_weight']

# Ensemble weights: KNN takes knn_weight; the remainder is split between
# BayesianRidge (brr_weight fraction of the remainder) and SVR (the rest).
# The blended prediction is not clipped to [1, 5] here.
remaining_weight = 1 - knn_weight
brr_weight *= remaining_weight
svr_weight = remaining_weight - brr_weight
final_pred = (knn_weight * knn_pred) + (brr_weight * brr_pred) + (svr_weight * svr_pred)

# Superseded two-model (KNN + BayesianRidge) variant with per-model timing,
# kept commented out for reference.
# start = time.time()
# knn = KNeighborsRegressor(
#     n_neighbors=best_params['knn_neighbors'],
#     metric='cosine', 
#     weights='distance',
#     n_jobs=-1,
#     )
# knn.fit(X_train, y_train)
# knn_pred = knn.predict(all_embeddings)
# print(f'KNN: {time.time() - start}')

# start = time.time()
# brr = BayesianRidge(
#     alpha_1 = best_params['brr_alpha_1'],
#     alpha_2 = best_params['brr_alpha_2'],
#     lambda_1 = best_params['brr_lambda_1'],
#     lambda_2 = best_params['brr_lambda_2'],
#     )
# brr.fit(X_train, y_train)
# brr_pred = brr.predict(all_embeddings)
# print(f'BRR: {time.time() - start}')
# 
# brr_weight = 1 - best_params['knn_weight']
# final_pred = ((best_params['knn_weight'] * knn_pred) + (brr_weight * brr_pred))

# Store per-model and blended predictions next to the book metadata.
books_df['knn_rating'] = knn_pred
books_df['brr_rating'] = brr_pred
books_df['svr_rating'] = svr_pred
books_df['final_rating'] = final_pred
# books_df.sort_values(by='final_rating', ascending=False)

# Show the title plus every column whose name ends in 'rating', best
# predicted rating first.
cols = ['title'] + [col for col in books_df.columns if col.endswith('rating')]
books_df[cols].sort_values(by='final_rating', ascending=False)

In [None]:
# Ranking display kept next to the recorded "0.6 MRL sliced" run. The string
# literal is a no-op note; the table shows whatever `final_rating` currently
# holds in books_df.
"""0.6 MRL sliced
Best Loss: 1.5748: 100%|██████████| 300/300 [10:29<00:00,  2.10s/it]
{'num_propagations': 0,
 'knn_neighbors': 37,
 'brr_alpha_1': 0.0002978589454364143,
 'brr_alpha_2': 0.0005679816409316726,
 'brr_lambda_1': 0.0003994602308099561,
 'brr_lambda_2': 0.003659493431608586,
 'brr_uncertainty_penalty': 1.3409360588302992,
 'svr_C': 0.009413204523151376,
 'svr_epsilon': 0.5861865063902226,
 'knn_weight': 0.6849389456290866,
 'brr_weight': 0.7766436222538672}"""
cols = ['title'] + [col for col in books_df.columns if col.endswith('rating')]
books_df[cols].sort_values(by='final_rating', ascending=False)

In [None]:
# Ranking display kept next to the recorded "0.6 full" run. The string literal
# is a no-op note; the table shows whatever `final_rating` currently holds in
# books_df.
"""0.6 full
Best Loss: 1.5926: 100%|██████████| 300/300 [12:22<00:00,  2.48s/it]
{'num_propagations': 0,
 'knn_neighbors': 55,
 'brr_alpha_1': 0.0007517429857762481,
 'brr_alpha_2': 0.0003941767071639492,
 'brr_lambda_1': 8.604754211823616e-06,
 'brr_lambda_2': 0.008083208576838542,
 'brr_uncertainty_penalty': 1.048016018935488,
 'svr_C': 0.013400702917698843,
 'svr_epsilon': 0.4149283525703265,
 'knn_weight': 0.6826121601964188,
 'brr_weight': 0.8414845392757134}"""
cols = ['title'] + [col for col in books_df.columns if col.endswith('rating')]
books_df[cols].sort_values(by='final_rating', ascending=False)

In [None]:
# Ranking display kept next to the recorded "8 MRL sliced" run. The string
# literal is a no-op note; the table shows whatever `final_rating` currently
# holds in books_df.
"""8 MRL sliced
Best Loss: 1.5714: 100%|██████████| 300/300 [11:49<00:00,  2.37s/it]
{'num_propagations': 0,
 'knn_neighbors': 17,
 'brr_alpha_1': 0.0006421576956932416,
 'brr_alpha_2': 0.0004509358497670652,
 'brr_lambda_1': 8.24637589893822e-06,
 'brr_lambda_2': 0.0001968595842801841,
 'brr_uncertainty_penalty': 1.3679433872638418,
 'svr_C': 0.004026523432673821,
 'svr_epsilon': 0.3118081208045217,
 'knn_weight': 0.5805897543886746,
 'brr_weight': 0.7793002684038448}"""
cols = ['title'] + [col for col in books_df.columns if col.endswith('rating')]
books_df[cols].sort_values(by='final_rating', ascending=False)

In [None]:
# Ranking display kept next to the recorded "8 full" run. The string literal
# is a no-op note; the table shows whatever `final_rating` currently holds in
# books_df.
"""8 full
Best Loss: 1.6065: 100%|██████████| 300/300 [16:57<00:00,  3.39s/it]
 {'num_propagations': 0,
 'knn_neighbors': 36,
 'brr_alpha_1': 0.0006880608281295565,
 'brr_alpha_2': 0.0003014766980578955,
 'brr_lambda_1': 1.3770534826263817e-05,
 'brr_lambda_2': 0.005746291033999427,
 'brr_uncertainty_penalty': 1.8570930538472121,
 'svr_C': 0.03383255010384741,
 'svr_epsilon': 0.44291277801367485,
 'knn_weight': 0.5848794696920626,
 'brr_weight': 0.5411997331122358}"""
cols = ['title'] + [col for col in books_df.columns if col.endswith('rating')]
books_df[cols].sort_values(by='final_rating', ascending=False)

In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so this line raises
# NameError — presumably a deliberate guard that halts "Run All" before the
# experimental cells below; confirm, and prefer an explicit raise if so.
stop
import nevergrad as ng
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import ndcg_score, mean_squared_error
from scipy.stats import spearmanr


def objective(num_propagations,
              pls_n_components,
              knn_neighbors, knn_leaf_size,
              svr_C, svr_epsilon,
              brr_alpha_1, brr_alpha_2, brr_lambda_1, brr_lambda_2,
              ensemble_logits,
              ):
    """Cross-validated loss of the KNN + SVR + BayesianRidge + PLS ensemble.

    Reads the notebook globals ``precomputed_embeddings``, ``my_ratings_mask``
    and ``my_ratings``. The four ensemble weights are the softmax of
    ``[0.0, *ensemble_logits]`` in the order KNN, SVR, BayesianRidge, PLS.

    Returns ``mse + (1 - spearman) + (1 - ndcg)`` over the out-of-fold
    predictions (clipped to [1, 5]) of a 10-fold split; lower is better.
    """
    features = precomputed_embeddings[num_propagations][my_ratings_mask]
    targets = my_ratings

    # Softmax over the logits, with the first (KNN) logit pinned to 0.
    # The weights do not depend on the fold, so they are derived once.
    logits = np.concatenate(([0.0], ensemble_logits))
    exp_vals = np.exp(logits - np.max(logits))
    knn_weight, svr_weight, brr_weight, pls_weight = exp_vals / np.sum(exp_vals)

    fold_truths, fold_preds = [], []
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_idx, test_idx in splitter.split(features):
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = targets[train_idx], targets[test_idx]

        knn = KNeighborsRegressor(
            n_neighbors=knn_neighbors,
            leaf_size=knn_leaf_size,
            metric='cosine',
            weights='distance',
            n_jobs=-1,
        )
        knn_pred = knn.fit(X_train, y_train).predict(X_test)

        svr = SVR(kernel='rbf', C=svr_C, epsilon=svr_epsilon)
        svr_pred = svr.fit(X_train, y_train).predict(X_test)

        brr = BayesianRidge(
            alpha_1=brr_alpha_1,
            alpha_2=brr_alpha_2,
            lambda_1=brr_lambda_1,
            lambda_2=brr_lambda_2,
            compute_score=True,
        )
        brr_pred = brr.fit(X_train, y_train).predict(X_test)

        # PLS cannot use more components than there are training rows minus one.
        pls = PLSRegression(
            n_components=min(pls_n_components, len(X_train) - 1),
            scale=False,
        )
        pls_pred = pls.fit(X_train, y_train).predict(X_test).flatten()

        blended = (knn_weight * knn_pred) + (svr_weight * svr_pred) + (brr_weight * brr_pred) + (pls_weight * pls_pred)

        fold_truths.append(y_test)
        fold_preds.append(blended)

    y_trues = np.concatenate(fold_truths)
    y_preds = np.clip(np.concatenate(fold_preds), 1, 5)

    mse = mean_squared_error(y_trues, y_preds)
    ndcg = ndcg_score([y_trues], [y_preds])

    # Constant predictions (or a NaN coefficient) count as zero correlation.
    spearman = 0.0
    if not np.std(y_preds) < 1e-9:
        rho, _ = spearmanr(y_trues, y_preds)
        spearman = 0.0 if np.isnan(rho) else rho

    return mse + (1.0 - spearman) + (1.0 - ndcg)

# Precompute the embeddings after 0..MAX_PROPAGATIONS multiplications by the
# normalised adjacency matrix; each stage is L2-normalised row-wise and stored
# as a numpy array. This overwrites the `precomputed_embeddings` built earlier.
MAX_PROPAGATIONS = 3
propagated = embeddings.clone()
norm_l2 = Normalizer(norm='l2')
precomputed_embeddings = [norm_l2.transform(propagated.numpy())]

for _ in range(MAX_PROPAGATIONS):
    propagated = torch.sparse.mm(adj_matrix, propagated)
    precomputed_embeddings.append(norm_l2.transform(propagated.numpy()))
del propagated

# Training rows are the books that carry a personal rating.
my_ratings_mask = ~books_df['my_rating'].isna()
my_ratings = books_df.loc[my_ratings_mask, 'my_rating'].values

# Search space for the four-model objective; keyword names match its
# parameters. `ensemble_logits` is a free 3-vector that the objective turns
# into ensemble weights.
parametrization = ng.p.Instrumentation(
    num_propagations = ng.p.Scalar(lower=0, upper=MAX_PROPAGATIONS).set_integer_casting(),

    knn_neighbors = ng.p.Scalar(lower=3, upper=mrl_dimensions//2).set_integer_casting(),
    knn_leaf_size = ng.p.Scalar(lower=3, upper=mrl_dimensions//2).set_integer_casting(),
    pls_n_components = ng.p.Scalar(lower=1, upper=mrl_dimensions//2).set_integer_casting(),
    svr_C = ng.p.Log(lower=1e-3, upper=1e3),
    svr_epsilon = ng.p.Scalar(lower=0.0, upper=1.0),
    brr_alpha_1=ng.p.Scalar(lower=1e-7, upper=1e-3),
    brr_alpha_2=ng.p.Scalar(lower=1e-7, upper=1e-3),
    brr_lambda_1=ng.p.Log(lower=1e-6, upper=1e-1),
    brr_lambda_2=ng.p.Log(lower=1e-6, upper=1e-1),

    ensemble_logits = ng.p.Array(shape=(3,)), 
)

# Manual ask/tell loop so the progress bar can show the best loss seen so far
# (the description is refreshed every 10th iteration).
BUDGET = 300
optimizer = ng.optimizers.NGOpt(parametrization=parametrization, budget=BUDGET)
best_loss = float('inf')
with tqdm(total=BUDGET) as pbar:
    for i in range(BUDGET):
        x = optimizer.ask()
        loss = objective(*x.args, **x.kwargs)
        optimizer.tell(x, loss)
        if loss < best_loss:
            best_loss = loss
        pbar.update(1)
        if i % 10 == 0:
            pbar.set_description(f"Best Loss: {best_loss:.4f}")

# Keyword arguments of the optimizer's recommended candidate.
best_params = optimizer.provide_recommendation().kwargs

In [None]:
# Global score
# Prior for the weighted score: C is the mean average rating across all books,
# m is the 10th percentile of rating counts.
C = books_df['avg_rating'].mean()
m = books_df['rating_count'].quantile(0.10)


def weighted_rating(x, m=m, C=C):
    """Blend a book's average rating with the global mean.

    The book's own rating gets weight v / (v + m) and the global mean C gets
    m / (v + m), where v is the book's rating count. Books without any
    ratings receive C directly.
    """
    votes = float(x['rating_count'])
    rating = float(x['avg_rating'])
    if votes == 0:
        return C
    total = votes + m
    return (votes / total * rating) + (m / total * C)


books_df['global_score'] = books_df.apply(weighted_rating, axis=1)

# old

In [None]:
# Legacy pipeline: merge the Goodreads export with the scraped book data.
# `ast` and `datetime` are used below but were never imported in the notebook,
# so they are imported here to keep the cell runnable on a fresh kernel.
import ast
from datetime import datetime

goodreads_export = pd.read_csv('data/goodreads_library_export.csv')
# goodreads_export = pd.read_csv('data/20-01-2025_goodreads_library_export.csv')
# goodreads_export = goodreads_export[['Book Id', 'Author', 'My Rating', 'Number of Pages', 'Original Publication Year']]
# goodreads_export = goodreads_export.rename(columns={'Book Id':'book_id',
#                                                     'Author': 'author',
#                                                     'My Rating': 'my_rating',
#                                                     'Number of Pages': 'num_pages',
#                                                     'Original Publication Year': 'year'})

# Page counts more than one standard deviation below the mean are treated as
# missing.
threshold = (goodreads_export['number_of_pages'].mean() - goodreads_export['number_of_pages'].std())
goodreads_export.loc[goodreads_export['number_of_pages'] < threshold, 'number_of_pages'] = np.nan

book_df = pd.read_csv('data/books.csv')
# book_df = pd.read_csv('data/01-2025_goodreads_scraped.csv')
df = goodreads_export.merge(book_df, on='book_id')

# Drop competing columns: prefer the export's value (_x), fall back to the
# scraped one (_y).
# NOTE(review): the _x/_y suffixes only exist if both inputs carry 'author',
# 'num_pages' and 'year' columns — verify against the current CSV schemas
# (the threshold above uses 'number_of_pages', not 'num_pages').
df['author'] = df['author_x'].fillna(df['author_y'])
df['num_pages'] = df['num_pages_x'].fillna(df['num_pages_y'])
df['year'] = df['year_x'].fillna(df['year_y'])
df = df.drop(columns=['author_x', 'author_y', 'num_pages_x', 'num_pages_y', 'year_x', 'year_y'])

# Genres are stored as a Python-literal list string; missing values become [].
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# Impute missing year / page count with the column mean, missing review
# counts with 0, and treat a personal rating of 0 as "not rated".
df['year'] = df['year'].fillna(df['year'].mean()).round().astype(int)
df['num_pages'] = df['num_pages'].fillna(df['num_pages'].mean()).round().astype(int)
df['num_reviews'] = df['num_reviews'].fillna(0).round().astype(int)
df['my_rating'] = df['my_rating'].replace(0,np.nan)

df['age'] = datetime.now().year - df['year']
# Average rating recomputed from the star histogram.
df['average_rating'] = ((df['5 stars'] * 5) + (df['4 stars'] * 4) + (df['3 stars'] * 3) + (df['2 stars'] * 2) + df['1 star']) / df['num_ratings']
df = df[['book_id', 'title', 'author', 'year', 'age', 'series', 'num_pages', 'genres', 'num_ratings', 'num_reviews', 'my_rating', 'average_rating', '5 stars', '4 stars', '3 stars', '2 stars', '1 star']]

In [None]:
def fit_quadratic(row):
    """Fit a parabola through five values placed at x = 1..5.

    Returns a Series (index 0, 1, 2) holding the coefficients a, b, c of
    a*x**2 + b*x + c, highest degree first.
    """
    star_positions = np.arange(1, 6)
    return pd.Series(np.polyfit(star_positions, row, 2))

# Calculating quadratic modelling coefficients: each star bucket as a share of
# all ratings, then a parabola fitted through the five shares per book.
_star_to_pct = [
    ('1 star', '1_star_percentage'),
    ('2 stars', '2_star_percentage'),
    ('3 stars', '3_star_percentage'),
    ('4 stars', '4_star_percentage'),
    ('5 stars', '5_star_percentage'),
]
for _count_col, _pct_col in _star_to_pct:
    df[_pct_col] = df[_count_col] / df['num_ratings']
coefficients = df[[_pct_col for _, _pct_col in _star_to_pct]].apply(fit_quadratic, axis=1)
df['a'], df['b'], df['c'] = coefficients[0], coefficients[1], coefficients[2]

# Pre-processing columns for rankings: log-scaled counts, and each coefficient
# shifted so its minimum is 0, scaled by 1/max and offset by 1.
df['num_ratings_ln'] = np.log1p(df['num_ratings'])
df['num_pages_ln'] = np.log1p(df['num_pages'])
for _coef_col, _shifted_col in (('a', '2a_shifted'), ('b', 'b_shifted'), ('c', 'c_shifted')):
    _offset = df[_coef_col] - df[_coef_col].min()
    df[_shifted_col] = _offset * (1 / _offset.max()) + 1


def _pull_toward_mean(scores, log_counts):
    """Move each score toward the column mean by (score - mean) / log_counts."""
    return scores - (scores - scores.mean()) / log_counts


# Types of rankings
df['num_adjusted_rating'] = _pull_toward_mean(df['average_rating'], df['num_ratings_ln'])
df['coeff_2a_rating'] = df['num_adjusted_rating'] * df['2a_shifted']
df['coeff_b_rating'] = df['num_adjusted_rating'] / df['b_shifted']
df['coeff_c_rating'] = df['num_adjusted_rating'] * df['c_shifted']
df['joined_rating'] = (df['num_adjusted_rating'] * df['c_shifted'] * df['2a_shifted']) / df['b_shifted']
df['final_rating'] = _pull_toward_mean(df['joined_rating'], df['num_ratings_ln'])

# Page-normalised variants: every ranking divided by the log page count.
for _name in ('num_adjusted', 'coeff_2a', 'coeff_b', 'coeff_c', 'joined'):
    df[f'{_name}_page_rating'] = df[f'{_name}_rating'] / df['num_pages_ln']
df['final_page_rating'] = _pull_toward_mean(df['joined_page_rating'], df['num_ratings_ln'])

In [None]:
# Correlation heatmap across every numeric feature and ranking column.
# NOTE(review): `plt` and `sns` are not imported in any visible cell — confirm
# matplotlib.pyplot / seaborn are imported before this cell runs.
numeric_cols = [
    'age', 'num_pages', 'num_pages_ln', 'num_ratings', 'num_ratings_ln', 'num_reviews',
    'my_rating', 'average_rating',
    '1 star', '2 stars', '3 stars', '4 stars', '5 stars',
    '1_star_percentage', '2_star_percentage', '3_star_percentage',
    '4_star_percentage', '5_star_percentage',
    'a', 'b', 'c',
    'num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating', 'coeff_c_rating',
    'joined_rating', 'final_rating',
    'num_adjusted_page_rating', 'coeff_2a_page_rating', 'coeff_b_page_rating',
    'coeff_c_page_rating', 'joined_page_rating', 'final_page_rating',
]
corr_df = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Unrated books ordered by the page-normalised final ranking, best first.
fresh = df.sort_values(by='final_page_rating', ascending=False).reset_index(drop=True)
fresh = fresh[fresh['my_rating'].isna()]

# Keep only books tagged with the chosen genre.
# Other options: Fiction, Nonfiction, Memoir, Classics, History, Politics, Philosophy, Business
_is_fiction = ['Fiction' in genre_list for genre_list in fresh['genres']]
fresh[_is_fiction]