In [None]:
# Load and prep data
import pandas as pd
import numpy as np

books_df = pd.read_csv("data/books.csv")

# Total rating count = row-wise sum over every column whose name ends in "star".
star_cols = [c for c in books_df.columns if c.endswith('star')]
books_df['rating_count'] = books_df[star_cols].sum(axis=1)

# TODO: Fix the crawler to split langs with "|" by default
# Until then, re-join ";"-separated language strings with "|"; non-string values pass through.
books_df['lang'] = [
    "|".join(item.strip() for item in x.split(";")) if isinstance(x, str) else x
    for x in books_df['lang']
]

# Flatten descriptions to one line: newlines become spaces, then every run of
# spaces (of any length) collapses to a single space.
books_df['description'] = (
    books_df['description']
    .str.replace('\n', ' ', regex=False)
    .str.replace(r' {2,}', ' ', regex=True)
)

goodreads_df = pd.read_csv('data/goodreads_library_export.csv')
# A rating of 0 is treated as missing. This has to happen on goodreads_df:
# books_df only receives its 'my_rating' column through the merge below.
goodreads_df['my_rating'] = goodreads_df['my_rating'].replace(0, np.nan).astype('UInt8')

books_df = books_df.merge(goodreads_df[['book_id', 'my_rating']], on='book_id', how='left')
# Keep only books that have a similar-books entry; later cells build graph edges from it.
books_df = books_df[~books_df['similar_books'].isna()].reset_index(drop=True)

In [None]:
# Prep embedding strings
def format_string_for_embedding(items, kind=None, truncate=0):
    """Render a list of strings as an English enumeration.

    Args:
        items: List of strings. Anything that is not a non-empty list
            (e.g. NaN from a failed split) yields an empty string.
        kind: Optional label. When given, the result is prefixed with the
            capitalised label, pluralised with "s" if more than one item is
            rendered, followed by ": ".
        truncate: Maximum number of items to render. 0 (the default) or a
            negative value keeps every item.

    Returns:
        "a", "a and b", or "a, b, and c", optionally prefixed with the label.
    """
    if not isinstance(items, list) or len(items) == 0:
        return ""

    # Cut first, then format: the rendered list never exceeds `truncate` items.
    if truncate > 0:
        items = items[:truncate]

    n = len(items)
    if n == 1:
        res = items[0]
    elif n == 2:
        res = f"{items[0]} and {items[1]}"
    else:
        res = f"{', '.join(items[:-1])}, and {items[-1]}"

    prefix = f"{kind.capitalize()}{'s' if n > 1 else ''}: " if kind else ""
    return f"{prefix}{res}"

# Authors: split on "|" and render as an enumeration (truncate=4, no label).
author_lists = books_df['authors'].str.split('|')
books_df['authors_post'] = author_lists.apply(
    lambda names: format_string_for_embedding(names, truncate=4)
)

# Genres: split on "|" and render with a "genre" label.
genre_lists = books_df['genres'].str.split('|')
books_df['genres_post'] = genre_lists.apply(
    lambda names: format_string_for_embedding(names, kind='genre')
)

# Description: wrap string values in a one-element list (non-strings become an
# empty list) so the same formatter can attach the "description" label.
description_lists = books_df['description'].map(
    lambda text: [text] if isinstance(text, str) else []
)
books_df['desc_post'] = description_lists.apply(
    lambda parts: format_string_for_embedding(parts, kind='description')
)

def join_embedding_parts(title, authors, genres, desc):
    """Assemble the text that is sent to the embedding model for one book.

    The title line is always present; the author, genre and description parts
    are appended only when they are truthy. Every part except the description
    ends with a newline.
    """
    pieces = [f"Book: {title}\n"]
    if authors:
        pieces.append(f"Written by: {authors}\n")
    if genres:
        pieces.append(f"{genres}\n")
    if desc:
        pieces.append(f"{desc}")
    return "".join(pieces)

# One embedding text per row, built from the title and the three *_post columns.
embedding_part_columns = ('title', 'authors_post', 'genres_post', 'desc_post')
books_df['embedding_input'] = list(
    map(join_embedding_parts, *(books_df[col] for col in embedding_part_columns))
)

# Series indexed by book_id giving that book's embedding text.
id_to_string = books_df.set_index('book_id')['embedding_input']

In [None]:
# Embed sentences
import os
import ollama
from tqdm import tqdm
import torch

PARAMS = 0.6
OLLAMA_MODEL = f"qwen3-embedding:{PARAMS}b"
MIN_DIMENSIONS = 32

my_rating_count = (~books_df['my_rating'].isna()).sum()

# Embeddings are cached on disk, one row per book_id, so only new books are embedded.
embeddings_path = f'data/{PARAMS}b_embeddings.csv'
if os.path.exists(embeddings_path):
    embeddings = pd.read_csv(embeddings_path).set_index('book_id')
    # read_csv returns string column labels ("0", "1", ...) while freshly built
    # vectors get integer labels. Without relabelling, the concat below would
    # align on the union of both label sets and pad everything with NaN.
    embeddings.columns = range(embeddings.shape[1])
else:
    embeddings = pd.DataFrame()

current_ids = books_df['book_id'].values
missing_ids = [idx for idx in current_ids if idx not in embeddings.index]
if missing_ids:
    missing_strings = id_to_string.loc[missing_ids].tolist()
    batch_size = 128
    new_embeddings = []
    for i in tqdm(range(0, len(missing_strings), batch_size)):
        batch = missing_strings[i : i + batch_size]
        response = ollama.embed(model=OLLAMA_MODEL, input=batch)
        new_embeddings.extend(response['embeddings'])

    new_embeddings = pd.DataFrame(new_embeddings, index=missing_ids)
    new_embeddings.index.name = 'book_id'
    # Skip the concat when there is no cache yet (nothing to append to).
    if embeddings.empty:
        embeddings = new_embeddings
    else:
        embeddings = pd.concat([embeddings, new_embeddings])
    embeddings.to_csv(embeddings_path)
    del new_embeddings, missing_strings, missing_ids, current_ids

# Reorder rows to match books_df and convert to a float32 tensor.
embeddings = embeddings.loc[books_df['book_id']].values
embeddings = torch.tensor(embeddings, dtype=torch.float32)

In [None]:
# Build adjacency matrix
from torch_geometric.utils import add_self_loops
from torch_geometric.nn.conv.gcn_conv import gcn_norm

# Map each book_id to its row position in books_df (and therefore in `embeddings`).
id_to_idx = {book_id: i for i, book_id in enumerate(books_df['book_id'])}

edge_indices = []
for book_id, similar in tqdm(
    zip(books_df['book_id'], books_df['similar_books']), total=len(books_df)
):
    if pd.isna(similar):
        continue
    current_idx = id_to_idx[book_id]
    # Entries are "|"-separated; the target id is the text before the first ":".
    for item in similar.split('|'):
        try:
            target_id = int(item.split(':')[0])
        except ValueError:
            # Malformed entry: skip it instead of aborting the whole build.
            continue
        # Dict lookup replaces a linear scan of the book_id column per entry.
        target_idx = id_to_idx.get(target_id)
        if target_idx is None:
            continue
        # Add both directions so the graph is symmetric.
        edge_indices.append([current_idx, target_idx])
        edge_indices.append([target_idx, current_idx])

if not edge_indices:
    edge_index = torch.tensor([[], []], dtype=torch.long)
else:
    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()

# GCN normalization to balance message passing
edge_index_with_loops, _ = add_self_loops(edge_index, num_nodes=embeddings.size(0))
edge_index_norm, edge_weight_norm = gcn_norm(edge_index_with_loops, num_nodes=embeddings.size(0))
adj_matrix = torch.sparse_coo_tensor(edge_index_norm, edge_weight_norm, (embeddings.size(0), embeddings.size(0)))

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import ndcg_score, mean_squared_error
from scipy.stats import spearmanr
import nevergrad as ng


def objective(num_propagations, 
              pls_n_components, 
              knn_neighbors, knn_leaf_size,
              knn_weight, pls_weight,
              ):
    """Leave-one-out loss of a KNN + PLS blend on the rated books.

    Reads the notebook globals `precomputed_embeddings`, `my_ratings_mask`
    and `my_ratings`.

    Args:
        num_propagations: Index into `precomputed_embeddings`.
        pls_n_components: Requested PLS components; clamped per fold to what
            the training data supports.
        knn_neighbors: `n_neighbors` for the KNN regressor.
        knn_leaf_size: `leaf_size` for the KNN regressor.
        knn_weight: Blend weight of the KNN prediction.
        pls_weight: Blend weight of the PLS prediction.

    Returns:
        mse + (1 - spearman) + (1 - ndcg) over the held-out predictions;
        lower is better.
    """
    all_embeddings = precomputed_embeddings[num_propagations]
    my_ratings_embeddings = all_embeddings[my_ratings_mask]

    # Small epsilon keeps the blend defined when both weights are 0.
    total_w = knn_weight + pls_weight + 1e-6

    loo = LeaveOneOut()
    y_trues = []
    y_preds = []
    for train_idx, test_idx in loo.split(my_ratings_embeddings):
        X_train, X_test = my_ratings_embeddings[train_idx], my_ratings_embeddings[test_idx]
        y_train, y_test = my_ratings[train_idx], my_ratings[test_idx]

        knn = KNeighborsRegressor(
            n_neighbors=knn_neighbors,
            leaf_size=knn_leaf_size,
            metric='cosine', 
            weights='distance',
            n_jobs=-1,
            )
        knn.fit(X_train, y_train)
        knn_pred = knn.predict(X_test)

        # PLSRegression rejects n_components above the sample or feature count,
        # and the search space allows values up to the number of rated books.
        n_comps = min(pls_n_components, len(X_train) - 1, X_train.shape[1])
        pls = PLSRegression(
            n_components=n_comps,
            scale=False,
            )
        pls.fit(X_train, y_train)
        pls_pred = pls.predict(X_test).flatten()

        final_pred = ((knn_weight * knn_pred) + (pls_weight * pls_pred)) / total_w

        y_trues.append(y_test[0])
        y_preds.append(final_pred[0])

    y_trues = np.array(y_trues)
    y_preds = np.array(y_preds)
    
    mse = mean_squared_error(y_trues, y_preds)
    ndcg = ndcg_score([y_trues], [y_preds])
    # Rank correlation is undefined for constant predictions; score it as 0.
    if np.std(y_preds) < 1e-9:
        spearman = 0.0
    else:
        spearman, _ = spearmanr(y_trues, y_preds)
        if np.isnan(spearman):
            spearman = 0.0

    return mse + (1.0 - spearman) + (1.0 - ndcg)


MAX_PROPAGATIONS = 3
norm_l2 = Normalizer(norm='l2')

# precomputed_embeddings[k] holds the L2-normalised embeddings after k
# multiplications by the adjacency matrix (k = 0 is the raw embedding).
propagated = embeddings.clone()
precomputed_embeddings = [norm_l2.transform(propagated.numpy())]
for _ in range(MAX_PROPAGATIONS):
    propagated = torch.sparse.mm(adj_matrix, propagated)
    precomputed_embeddings.append(norm_l2.transform(propagated.numpy()))
del propagated

# Rows of books_df that carry a personal rating, and those ratings.
my_ratings_mask = books_df['my_rating'].notna()
my_ratings = books_df.loc[my_ratings_mask, 'my_rating'].values


# Search space; the upper bounds scale with the number of rated books.
train_size = len(my_ratings)
parametrization = ng.p.Instrumentation(
    num_propagations=ng.p.Scalar(lower=0, upper=MAX_PROPAGATIONS).set_integer_casting(),
    pls_n_components=ng.p.Scalar(lower=1, upper=train_size).set_integer_casting(),
    knn_neighbors=ng.p.Scalar(lower=3, upper=train_size // 2).set_integer_casting(),
    knn_leaf_size=ng.p.Scalar(lower=3, upper=50).set_integer_casting(),
    knn_weight=ng.p.Scalar(lower=0, upper=1),
    pls_weight=ng.p.Scalar(lower=0, upper=1),
)

# Ask/tell loop; the progress-bar label is refreshed every 10th evaluation.
BUDGET = 300
optimizer = ng.optimizers.NGOpt(parametrization=parametrization, budget=BUDGET)
best_loss = float('inf')
with tqdm(total=BUDGET) as pbar:
    for i in range(BUDGET):
        x = optimizer.ask()
        loss = objective(*x.args, **x.kwargs)
        optimizer.tell(x, loss)
        best_loss = min(best_loss, loss)
        pbar.update(1)
        if i % 10 == 0:
            pbar.set_description(f"Best Loss: {best_loss:.4f}")

best_params = optimizer.provide_recommendation().kwargs

In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ARDRegression
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): X_reduced is not defined in any cell visible here; it has to
# exist in the kernel (one row per books_df row) before this cell runs.
train_mask = books_df['my_rating'].notna() & (books_df['my_rating'] > 0)
X_train = X_reduced[train_mask]
y_train = books_df.loc[train_mask, 'my_rating'].values

# Every model below stores three columns: <prefix>_pred_rating,
# <prefix>_uncertainty and <prefix>_score = prediction - uncertainty.

# Bayesian Ridge Regression
brr = BayesianRidge(compute_score=True)
brr.fit(X_train, y_train)
means, stds = brr.predict(X_reduced, return_std=True)
books_df['brr_pred_rating'] = means
books_df['brr_uncertainty'] = stds
books_df['brr_score'] = books_df['brr_pred_rating'] - (books_df['brr_uncertainty'])

nonzero_weights = np.sum(np.abs(brr.coef_) > 1e-5)
print(f"BRR used {nonzero_weights} of {X_reduced.shape[1]} dimensions.")

# Bayesian ARD
# Stored under its own ard_* columns so the Bayesian Ridge results are kept.
ard = ARDRegression(compute_score=True)
ard.fit(X_train, y_train)
means, stds = ard.predict(X_reduced, return_std=True)
books_df['ard_pred_rating'] = means
books_df['ard_uncertainty'] = stds
books_df['ard_score'] = books_df['ard_pred_rating'] - (books_df['ard_uncertainty'])

nonzero_weights = np.sum(np.abs(ard.coef_) > 1e-5)
print(f"ARD used {nonzero_weights} of {X_reduced.shape[1]} dimensions.")

# Random Forest
# Uncertainty = standard deviation of the individual trees' predictions.
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=2, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
means = rf.predict(X_reduced)
per_tree_preds = np.stack([tree.predict(X_reduced) for tree in rf.estimators_])
stds = np.std(per_tree_preds, axis=0)
books_df['rf_pred_rating'] = means
books_df['rf_uncertainty'] = stds
books_df['rf_score'] = books_df['rf_pred_rating'] - (books_df['rf_uncertainty'])

# Weighted KNN Regressor
# Uncertainty = standard deviation of the ratings of the 15 nearest rated books.
knn = KNeighborsRegressor(n_neighbors=15, weights='distance', metric='cosine')
knn.fit(X_train, y_train)
means = knn.predict(X_reduced)
neighbor_indices = knn.kneighbors(X_reduced, return_distance=False)
neighbor_ratings = y_train[neighbor_indices] 
stds = np.std(neighbor_ratings, axis=1)
books_df['knn_pred_rating'] = means
books_df['knn_uncertainty'] = stds
books_df['knn_score'] = books_df['knn_pred_rating'] - (books_df['knn_uncertainty'])

books_df

In [None]:
# Global score
# Prior mean: average rating over all books.
C = books_df['avg_rating'].mean()
# Prior strength: the 10th percentile of the rating counts.
m = books_df['rating_count'].quantile(0.10)


def weighted_rating(x, m=m, C=C):
    """Blend a row's average rating with the global mean, weighted by rating count.

    Args:
        x: Row with 'rating_count' and 'avg_rating'.
        m: Prior strength; defaults to the value computed in this cell.
        C: Prior mean; defaults to the value computed in this cell.

    Returns:
        C for rows without ratings, otherwise v/(v+m) * R + m/(v+m) * C.
    """
    v = float(x['rating_count'])
    R = float(x['avg_rating'])
    if v == 0:
        return C
    total = v + m
    return (v / total * R) + (m / total * C)


books_df['global_score'] = books_df.apply(weighted_rating, axis=1)

In [None]:
import nevergrad as ng
import numpy as np
import umap
import torch
from sklearn.model_selection import KFold
from sklearn.linear_model import ARDRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import ndcg_score, mean_squared_error
from scipy.stats import rankdata
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


def objective(
    # GCN feature propagation depth
    gcn_propagations,

    # UMAP hyperparameters
    umap_neighbors,
    umap_min_dist,
    umap_components,
    umap_learning_rate,
    umap_negative_sample_rate,

    # Model hyperparameters
    knn_k,
    knn_weights,

    ard_alpha_1,
    ard_alpha_2,
    ard_lambda_1,
    ard_lambda_2,
    ard_threshold_lambda,

    # Ensemble strategy
    ensemble_method,
    uncertainty_transform,

    # Ensemble weights
    w_ard, w_knn,

    # Uncertainty penalties
    u_ard, u_knn,
):
    """5-fold cross-validated loss of a UMAP -> (ARD + KNN) ensemble on the rated books.

    Reads the notebook globals `X_raw_emb`, `adj_matrix`, `valid_mask` and
    `y_train_base`.

    Args:
        gcn_propagations: Number of times the embeddings are multiplied by
            `adj_matrix` before use (0 keeps the raw embeddings).
        umap_*: Passed to `umap.UMAP`; neighbours and components are clamped
            to what each training fold supports.
        knn_k, knn_weights: `n_neighbors` / `weights` of the KNN regressor.
        ard_*: Passed to `ARDRegression`.
        ensemble_method: 'weighted', 'rank', or anything else for a plain average.
        uncertainty_transform: 'sqrt' or 'square'; any other value leaves the
            standard deviations untouched.
        w_ard, w_knn: Ensemble weights.
        u_ard, u_knn: Multipliers of the uncertainty subtracted from each
            model's prediction.

    Returns:
        Negative mean fold score (score = NDCG - 0.1 * RMSE), so lower is
        better. Returns 1e6 if any fold raises.
    """
    # Cast to appropriate types
    gcn_propagations = max(0, int(gcn_propagations))
    umap_neighbors = max(5, int(umap_neighbors))
    umap_components = max(2, int(umap_components))
    knn_k = max(3, int(knn_k))

    # 1. Propagate over the whole graph once per call, then keep the rated rows.
    #    Propagation needs every node; fold indices refer to the rated subset.
    propagated = torch.as_tensor(X_raw_emb, dtype=torch.float32)
    for _ in range(gcn_propagations):
        propagated = torch.sparse.mm(adj_matrix, propagated)

    # 2. L2-normalise. This is row-wise, so doing it before the split is
    #    equivalent to normalising each fold separately.
    rated_embeddings = Normalizer(norm='l2').transform(propagated.numpy()[valid_mask])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for train_idx, val_idx in kf.split(rated_embeddings):
        try:
            X_emb_tr, X_emb_val = rated_embeddings[train_idx], rated_embeddings[val_idx]
            y_tr, y_val = y_train_base[train_idx], y_train_base[val_idx]

            max_components = min(umap_components, len(y_tr) - 2)

            # 3. UMAP
            reducer = umap.UMAP(
                n_neighbors=min(umap_neighbors, len(y_tr) - 1),
                min_dist=umap_min_dist,
                n_components=max_components,
                metric='cosine',
                learning_rate=umap_learning_rate,
                negative_sample_rate=int(umap_negative_sample_rate),
                random_state=42,
                n_jobs=1,
                verbose=False
            )
            X_tr_final = reducer.fit_transform(X_emb_tr)
            X_val_final = reducer.transform(X_emb_val)

            # 4. Train models
            # ARD with hyperparameters
            ard = ARDRegression(
                alpha_1=ard_alpha_1,
                alpha_2=ard_alpha_2,
                lambda_1=ard_lambda_1,
                lambda_2=ard_lambda_2,
                threshold_lambda=ard_threshold_lambda,
                compute_score=True
            )
            ard.fit(X_tr_final, y_tr)
            mu_ard, std_ard = ard.predict(X_val_final, return_std=True)

            # KNN; uncertainty = std of the neighbours' ratings
            knn = KNeighborsRegressor(
                n_neighbors=min(knn_k, len(y_tr) - 1),
                weights=knn_weights,
                metric='cosine',
            )
            knn.fit(X_tr_final, y_tr)
            mu_knn = knn.predict(X_val_final)
            neigh_idx = knn.kneighbors(X_val_final, return_distance=False)
            std_knn = np.std(y_tr[neigh_idx], axis=1)

            # 5. Transform uncertainties
            if uncertainty_transform == 'sqrt':
                std_ard = np.sqrt(std_ard)
                std_knn = np.sqrt(std_knn)
            elif uncertainty_transform == 'square':
                std_ard = std_ard ** 2
                std_knn = std_knn ** 2

            # 6. Ensemble predictions
            if ensemble_method == 'weighted':
                final_pred = (
                    w_ard * (mu_ard - u_ard * std_ard)
                    + w_knn * (mu_knn - u_knn * std_knn)
                ) / (w_ard + w_knn + 1e-6)

            elif ensemble_method == 'rank':
                # Rank-based combination
                r_ard = rankdata(mu_ard - u_ard * std_ard)
                r_knn = rankdata(mu_knn - u_knn * std_knn)

                # Normalize ranks to 0-1 range to be comparable to y_val for MSE check
                r_ard = r_ard / len(r_ard)
                r_knn = r_knn / len(r_knn)

                final_pred = (w_ard * r_ard + w_knn * r_knn) / (w_ard + w_knn + 1e-6)

            else:  # 'average'
                final_pred = (mu_ard + mu_knn) / 2

            # 7. Multi-objective scoring: NDCG + MSE
            ndcg = ndcg_score([y_val], [final_pred], k=min(20, len(y_val)))
            mse = mean_squared_error(y_val, final_pred)

            # Combined score (NDCG primary, MSE secondary)
            score = ndcg - 0.1 * np.sqrt(mse)
            fold_scores.append(score)

        except Exception as exc:
            # Best effort: a failing configuration gets a large penalty instead
            # of stopping the search. The reason is written out so a systematic
            # failure is visible (warnings are silenced in this cell).
            tqdm.write(f"objective failed: {exc!r}")
            return 1e6

    if not fold_scores:
        return 1e6

    return -np.mean(fold_scores)


# DATA PREPARATION
if isinstance(embeddings, torch.Tensor):
    X_raw_emb = embeddings.detach().cpu().numpy()
else:
    X_raw_emb = np.array(embeddings)

# Extract training data: rated books only, targets scaled to [0, 1]
y_full = books_df['my_rating'].values.astype(float)
valid_mask = (~np.isnan(y_full)) & (y_full > 0)
y_train_base = y_full[valid_mask]
scaler = MinMaxScaler()
y_train_base = scaler.fit_transform(y_train_base.reshape(-1, 1)).ravel()
n_train_samples = len(y_train_base)

# Upper bound for UMAP components: two thirds of the rated books, capped at the
# embedding width. The width is read from X_raw_emb; the name used here before
# (sent_embedding_dimensions) is not defined in any visible cell.
max_umap_components = min(int((n_train_samples * 2 / 3)), X_raw_emb.shape[1])
MAX_GCN_PROPAGATIONS = 3
parametrization = ng.p.Instrumentation(
    # GCN propagation depth (0..3, i.e. 4 depths)
    gcn_propagations=ng.p.Scalar(lower=0, upper=MAX_GCN_PROPAGATIONS).set_integer_casting(),

    # UMAP with more hyperparameters
    umap_neighbors=ng.p.Scalar(lower=5, upper=50).set_integer_casting(),
    umap_min_dist=ng.p.Scalar(lower=0.0, upper=0.99),
    umap_components=ng.p.Scalar(lower=3, upper=max_umap_components).set_integer_casting(),
    umap_learning_rate=ng.p.Scalar(lower=0.1, upper=2.0),
    umap_negative_sample_rate=ng.p.Scalar(lower=3, upper=20).set_integer_casting(),

    # KNN hyperparameters
    knn_k=ng.p.Scalar(lower=3, upper=30).set_integer_casting(),
    knn_weights=ng.p.Choice(['uniform', 'distance']),

    # ARD hyperparameters
    ard_alpha_1=ng.p.Scalar(lower=1e-7, upper=1e-3),
    ard_alpha_2=ng.p.Scalar(lower=1e-7, upper=1e-3),
    ard_lambda_1=ng.p.Scalar(lower=1e-7, upper=1e-3),
    ard_lambda_2=ng.p.Scalar(lower=1e-7, upper=1e-3),
    ard_threshold_lambda=ng.p.Scalar(lower=1e3, upper=1e6),

    # Ensemble strategy
    ensemble_method=ng.p.Choice(['weighted', 'rank', 'average']),
    uncertainty_transform=ng.p.Choice(['none', 'sqrt']),

    # Ensemble weights (ARD and KNN)
    w_ard=ng.p.Scalar(lower=0, upper=2),
    w_knn=ng.p.Scalar(lower=0, upper=2),

    # Uncertainty penalties
    u_ard=ng.p.Scalar(lower=0, upper=5),
    u_knn=ng.p.Scalar(lower=0, upper=5),
)

BUDGET = 300
optimizer = ng.optimizers.NGOpt(parametrization=parametrization, budget=BUDGET)
with tqdm(total=BUDGET) as pbar:
    for i in range(BUDGET):
        x = optimizer.ask()
        loss = objective(*x.args, **x.kwargs)
        optimizer.tell(x, loss)
        pbar.update(1)
        if i % 20 == 0:
            pbar.set_description(f"Best Score: {-optimizer.current_bests['minimum'].mean:.4f}")

best_params = optimizer.provide_recommendation().kwargs
# Parameter names grouped by pipeline stage, for reporting.
param_groups = {
    'GCN Blending': ['gcn_propagations'],
    'UMAP': [k for k in best_params.keys() if k.startswith('umap_')],
    'KNN': [k for k in best_params.keys() if k.startswith('knn_')],
    'ARD': [k for k in best_params.keys() if k.startswith('ard_')],
    'Ensemble': ['ensemble_method', 'uncertainty_transform'] + [k for k in best_params.keys() if k.startswith('w_') or k.startswith('u_')]
}

In [None]:
import networkx as nx

# 1. One node per book
G = nx.Graph()
G.add_nodes_from(books_df['book_id'].tolist())

# 2. Build edges with correct type casting
# A set gives constant-time membership tests instead of scanning the column per entry.
known_ids = set(books_df['book_id'].tolist())
edge_list = []
for book_id, similar in zip(books_df['book_id'], books_df['similar_books']):
    if pd.isna(similar):
        continue

    # Entries are "|"-separated; the target id is the text before the first ":".
    for item in similar.split('|'):
        try:
            target_id = int(item.split(':')[0])
        except ValueError:
            # Malformed entry: skip it, as the adjacency-matrix cell does.
            continue

        # Only link to books that are nodes of this graph.
        if target_id in known_ids:
            edge_list.append((book_id, target_id))

G.add_edges_from(edge_list)

# old

In [None]:
goodreads_export = pd.read_csv('data/goodreads_library_export.csv')
# Earlier snapshot: 'data/20-01-2025_goodreads_library_export.csv'. It needed its
# columns selected and renamed first:
#   'Book Id' -> 'book_id', 'Author' -> 'author', 'My Rating' -> 'my_rating',
#   'Number of Pages' -> 'num_pages', 'Original Publication Year' -> 'year'

# Page counts more than one standard deviation below the mean are blanked out.
page_counts = goodreads_export['number_of_pages']
threshold = page_counts.mean() - page_counts.std()
goodreads_export.loc[page_counts < threshold, 'number_of_pages'] = np.nan

book_df = pd.read_csv('data/books.csv')
# Earlier snapshot: 'data/01-2025_goodreads_scraped.csv'
df = goodreads_export.merge(book_df, on='book_id')

# Drop competing columns: prefer the export's value (_x), fall back to the scraped one (_y)
competing = ['author', 'num_pages', 'year']
for col in competing:
    df[col] = df[f'{col}_x'].fillna(df[f'{col}_y'])
df = df.drop(columns=[f'{col}{suffix}' for col in competing for suffix in ('_x', '_y')])

# Genres are stored as a Python-literal string; missing values become an empty list.
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# Mean-impute year and page count, then store as integers.
for col in ('year', 'num_pages'):
    df[col] = df[col].fillna(df[col].mean()).round().astype(int)
df['num_reviews'] = df['num_reviews'].fillna(0).round().astype(int)
# A rating of 0 is treated as missing.
df['my_rating'] = df['my_rating'].replace(0, np.nan)

df['age'] = datetime.now().year - df['year']
# Mean star value, weighted by the per-star counts.
star_weights = {'5 stars': 5, '4 stars': 4, '3 stars': 3, '2 stars': 2, '1 star': 1}
weighted_star_sum = sum(df[col] * weight for col, weight in star_weights.items())
df['average_rating'] = weighted_star_sum / df['num_ratings']

kept_columns = [
    'book_id', 'title', 'author', 'year', 'age', 'series', 'num_pages', 'genres',
    'num_ratings', 'num_reviews', 'my_rating', 'average_rating',
    '5 stars', '4 stars', '3 stars', '2 stars', '1 star',
]
df = df[kept_columns]

In [None]:
def fit_quadratic(row):
    """Fit y = a*x^2 + b*x + c through five values placed at x = 1..5.

    Args:
        row: Five numbers, one per x position 1, 2, 3, 4, 5.

    Returns:
        pd.Series [a, b, c] with the default 0..2 index.
    """
    positions = np.arange(1, 6)
    quad, lin, const = np.polyfit(positions, row, deg=2)
    return pd.Series([quad, lin, const])

# Calculating quadratic modeling coefficients
# Each star count becomes a share of all ratings; a parabola is then fitted per book.
star_to_share = {
    '1 star': '1_star_percentage',
    '2 stars': '2_star_percentage',
    '3 stars': '3_star_percentage',
    '4 stars': '4_star_percentage',
    '5 stars': '5_star_percentage',
}
for star_col, share_col in star_to_share.items():
    df[share_col] = df[star_col] / df['num_ratings']
coefficients = df[list(star_to_share.values())].apply(fit_quadratic, axis=1)
for position, coeff_name in enumerate(['a', 'b', 'c']):
    df[coeff_name] = coefficients[position]

# Pre-processing columns for rankings
df['num_ratings_ln'] = np.log1p(df['num_ratings'])
df['num_pages_ln'] = np.log1p(df['num_pages'])
# Shift each coefficient so its minimum is 0, scale by 1/max, then add 1.
for coeff_name, shifted_name in [('a', '2a_shifted'), ('b', 'b_shifted'), ('c', 'c_shifted')]:
    offset = df[coeff_name] - df[coeff_name].min()
    df[shifted_name] = offset * (1 / offset.max()) + 1


def _pull_toward_mean(scores):
    """Move each score toward the column mean by (score - mean) / num_ratings_ln."""
    return scores - (scores - scores.mean()) / df['num_ratings_ln']


# Types of rankings
df['num_adjusted_rating'] = _pull_toward_mean(df['average_rating'])
adjusted = df['num_adjusted_rating']
df['coeff_2a_rating'] = (adjusted * df['2a_shifted'])
df['coeff_b_rating'] = (adjusted) / (df['b_shifted'])
df['coeff_c_rating'] = (adjusted * df['c_shifted'])
df['joined_rating'] = (adjusted * df['c_shifted'] * df['2a_shifted']) / df['b_shifted']
df['final_rating'] = _pull_toward_mean(df['joined_rating'])

# Per-page variants: every rating divided by the log page count.
for rating_col in ['num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating',
                   'coeff_c_rating', 'joined_rating']:
    page_col = rating_col.replace('_rating', '_page_rating')
    df[page_col] = df[rating_col] / df['num_pages_ln']
df['final_page_rating'] = _pull_toward_mean(df['joined_page_rating'])

In [None]:
# Columns included in the correlation matrix.
numeric_cols = [
    'age', 'num_pages', 'num_pages_ln', 'num_ratings', 'num_ratings_ln', 'num_reviews',
    'my_rating', 'average_rating',
    '1 star', '2 stars', '3 stars', '4 stars', '5 stars',
    '1_star_percentage', '2_star_percentage', '3_star_percentage',
    '4_star_percentage', '5_star_percentage',
    'a', 'b', 'c',
    'num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating', 'coeff_c_rating',
    'joined_rating', 'final_rating',
    'num_adjusted_page_rating', 'coeff_2a_page_rating', 'coeff_b_page_rating',
    'coeff_c_page_rating', 'joined_page_rating', 'final_page_rating',
]
corr_df = df[numeric_cols].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Books without a personal rating, best final_page_rating first.
ranked = df.sort_values(by='final_page_rating', ascending=False).reset_index(drop=True)
fresh = ranked[ranked['my_rating'].isna()]

# Show one genre at a time. Other candidates: Nonfiction, Memoir, Classics,
# History, Politics, Philosophy, Business
has_genre = ['Fiction' in genre_list for genre_list in fresh['genres']]
fresh[has_genre]