# KEDRO SETUP + IMPORT

In [56]:
%load_ext kedro.ipython

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [57]:
%reload_kedro

In [58]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go # For the waterfall plot -- cannot easily do this in matplotlib
from typing import List, Dict, Tuple, Any



In [59]:
context.catalog.list()


[1m[[0m
    [32m'books_raw'[0m,
    [32m'books_loaded'[0m,
    [32m'publisher_consolidation'[0m,
    [32m'price_by_isbn_input'[0m,
    [32m'price_by_isbn'[0m,
    [32m'title_embeddings_original'[0m,
    [32m'authors_embeddings'[0m,
    [32m'description_embeddings'[0m,
    [32m'open_library_book_api_info_input'[0m,
    [32m'open_library_book_api_info'[0m,
    [32m'filtered_books'[0m,
    [32m'exclusions_summary'[0m,
    [32m'scope_waterfall_plot'[0m,
    [32m'parameters'[0m,
    [32m'params:title_column'[0m,
    [32m'params:author_column'[0m,
    [32m'params:description_column'[0m,
    [32m'params:original_data_key_column'[0m,
    [32m'params:color_gr_brown'[0m,
    [32m'params:color_gr_purple'[0m,
    [32m'params:color_gr_green'[0m,
    [32m'params:color_gr_tan_background'[0m
[1m][0m

# DATA IMPORT

In [60]:
# Load the datasets used by this notebook from the Kedro catalog.
books_input = catalog.load('books_loaded')
price_by_isbn = catalog.load('price_by_isbn')
title_embeddings = catalog.load('title_embeddings_original')
# NOTE(review): book_api_data is not referenced again in the visible part of this notebook.
book_api_data = catalog.load('open_library_book_api_info')
authors_embeddings = catalog.load('authors_embeddings')

# Get the color values
color_gr_brown = catalog.load("params:color_gr_brown")
color_gr_purple = catalog.load("params:color_gr_purple")
color_gr_green = catalog.load("params:color_gr_green")
color_gr_tan_background = catalog.load("params:color_gr_tan_background")

# Add in the price data
# Left join on isbn13: every book row is kept, and books without a price match get NaN.
books_input = books_input.merge(price_by_isbn, on='isbn13', how='left')

In [61]:
# `data` is a second name for books_input; no copy is made.
data = books_input

In [62]:
# Attach the title embeddings to the book data (merge keys are inferred from shared column names).
# NOTE(review): the next cell rebinds `df` starting from `data` again, so this result is discarded.
df = data.merge(title_embeddings)


In [63]:
# Attach the author embeddings to the book data (merge keys are inferred from shared column names).
# NOTE(review): this starts from `data`, not from the previous `df`, so `df` has no title_embedding column.
df = data.merge(authors_embeddings)

# ANALYSIS & VISUALISATION

## *INTRO*

In [64]:
df.head(5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Price,PriceStrategy,authors_embedding
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,12.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,99.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,24.95,LIST_PRICE,"[-0.22978533804416656, 0.1457509994506836, -0...."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,7.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,40.95,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."


In [65]:
df['authors'].nunique()

[1;36m6639[0m

In [66]:
# Keep only the columns needed for the author hypothesis. An explicit copy makes this an
# independent frame, so the later `.loc` assignments on it do not trigger pandas'
# SettingWithCopyWarning and never depend on view-vs-copy semantics.
first_hypodf = df[['authors', 'average_rating', 'authors_embedding']].copy()

In [67]:
first_hypodf.describe()

Unnamed: 0,average_rating
count,11123.0
mean,3.934075
std,0.350485
min,0.0
25%,3.77
50%,3.96
75%,4.14
max,5.0


### FIRST THING NOTICED 

THE DATA IS BIASED TOWARDS WELL-RATED BOOKS: WITH A MEAN RATING OF 3.93 OUT OF 5 (AND 75% OF THE BOOKS RATED 3.77 OR HIGHER), WE DON'T HAVE ENOUGH EXAMPLES OF POORLY RATED BOOKS.

## *CATCHING SEMANTIC RELATIONS : DO EMBEDDINGS WORK ?*

In [68]:
from sklearn.metrics.pairwise import cosine_similarity
"""
Check whether the embeddings capture the similarity between two spellings of the same
author: row 0 ("J.K. Rowling/Mary GrandPré") and row 2 ("J.K. Rowling").
"""

# cosine_similarity works on 2-D inputs, so each embedding is turned into a (1, n) row vector.
one, two = (
    first_hypodf['authors_embedding'].iloc[row].reshape(1, -1)
    for row in (0, 2)
)



In [71]:
cosine_similarity(one,two)

[1;35marray[0m[1m([0m[1m[[0m[1m[[0m[1;36m0.75045805[0m[1m][0m[1m][0m[1m)[0m

A cosine similarity of 0.75 is high for two fairly different strings ("J.K. Rowling/Mary GrandPré" vs. "J.K. Rowling"). We can use this technique to merge author names: for example, set a threshold of 0.70 and treat two entries as the same author whenever the cosine similarity of their embeddings is greater than or equal to that threshold. The merged entries may still differ slightly, e.g. by a collaborator such as Mary GrandPré, the illustrator of the latest HP books.

## *MERGING AUTHORS BASED ON THEIR EMBEDDINGS*

In [72]:
# Basically converting all authors embedding to match a cosine computation
# Each embedding becomes a (1, n) row vector, i.e. the 2-D shape passed to cosine_similarity.

first_hypodf.loc[:, 'authors_embedding'] = first_hypodf['authors_embedding'].apply(lambda x: x.reshape(1, -1))

In [75]:
def standardize_author_names(df, threshold=0.72):
    """Collapse near-duplicate author labels in ``df`` using embedding similarity.

    Rows are visited in order. For each row, the first *other* row whose
    ``authors_embedding`` has a cosine similarity >= ``threshold`` donates its
    current ``authors`` label to the row being visited. Labels are updated as
    the scan progresses, so replacements can chain from row to row.

    Args:
        df: DataFrame with an ``authors`` column and an ``authors_embedding``
            column holding one numeric vector per row. Vectors may have any
            shape that flattens to the same 1-D length (e.g. ``(n,)`` or
            ``(1, n)``). The frame is modified in place.
        threshold: Minimum cosine similarity for two rows to share a label.

    Returns:
        None. ``df['authors']`` is overwritten in place.
    """
    authors = df['authors'].tolist()
    if not authors:
        # Nothing to compare; also avoids stacking an empty list of vectors.
        return

    # Build the (n_rows, dim) matrix once instead of re-converting the list of
    # vectors on every iteration.
    embedding_matrix = np.vstack(
        [np.asarray(embedding).ravel() for embedding in df['authors_embedding']]
    )

    for i in range(len(authors)):
        similarities = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix)[0]
        # Indices of all rows that are similar enough, in row order, excluding the row itself.
        candidates = np.flatnonzero(similarities >= threshold)
        candidates = candidates[candidates != i]
        if candidates.size:
            # Take the label of the first match (it may already have been replaced earlier).
            authors[i] = authors[candidates[0]]

    df.loc[:, 'authors'] = authors

In [76]:
# Rewrites the 'authors' column of first_hypodf in place (the function returns nothing).
# NOTE(review): re-running this cell performs another merge pass on the already-merged labels.
standardize_author_names(first_hypodf)

In [77]:
first_hypodf['authors'].nunique()

[1;36m1567[0m

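We went from 6639 author names to 1567, which makes the data much easier to analyse. Caveat: the merge is not fully robust. In the table displayed below, "Expelled from Eden: A William T. Vollmann Reader" ends up attributed to "John Milton/Merritt Y. Hughes/David Scott Kastan", so the threshold also merges some unrelated authors.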

# FHDF, STARTING TO WORK WITH THE DATA FOR MODELING

In [123]:
# `fhdf` is an alias of first_hypodf, not a copy: in-place writes through either name affect both.
fhdf=first_hypodf

In [124]:
fhdf

Unnamed: 0,authors,average_rating,authors_embedding,title,isbn13
0,J.K. Rowling/Mary GrandPré,4.57,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Half-Blood Prince (Harry ...,9780439785969
1,J.K. Rowling/Mary GrandPré,4.49,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Order of the Phoenix (Har...,9780439358071
2,J.K. Rowling/Mary GrandPré,4.42,"[[-0.22978533804416656, 0.1457509994506836, -0...",Harry Potter and the Chamber of Secrets (Harry...,9780439554893
3,J.K. Rowling/Mary GrandPré,4.56,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Prisoner of Azkaban (Harr...,9780439655484
4,J.K. Rowling/Mary GrandPré,4.78,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter Boxed Set Books 1-5 (Harry Potte...,9780439682589
...,...,...,...,...,...
11118,John Milton/Merritt Y. Hughes/David Scott Kastan,4.06,"[[-0.403737336397171, 0.2087896317243576, -0.1...",Expelled from Eden: A William T. Vollmann Reader,9781560254416
11119,Henry James/Gabriel Brownstein/Mary Cregan,4.08,"[[-0.5178908705711365, 0.5561336278915405, -0....",You Bright and Risen Angels,9780140110876
11120,Henry James/Gabriel Brownstein/Mary Cregan,3.96,"[[-0.5178909301757812, 0.5561334490776062, -0....",The Ice-Shirt (Seven Dreams #1),9780140131963
11121,Henry James/Gabriel Brownstein/Mary Cregan,3.72,"[[-0.5178908705711365, 0.5561336278915405, -0....",Poor People,9780060878825


In [125]:
# Bring the titles back in. The assignment aligns on the row index, so it presumably relies on
# fhdf still sharing df's index (first_hypodf was sliced from df) — verify if rows were reordered.
fhdf.loc[:,'title']=df['title']

In [126]:
title_embeddings

Unnamed: 0,isbn13,title_embedding
0,9780439785969,"[-0.15156979858875275, 0.4935136139392853, 0.0..."
1,9780439358071,"[-0.22418256103992462, 0.20987719297409058, 0...."
2,9780439554893,"[-0.054178591817617416, 0.22646689414978027, -..."
3,9780439655484,"[-0.2782197892665863, 0.261374831199646, -0.12..."
4,9780439682589,"[-0.16637001931667328, 0.2694065272808075, -0...."
...,...,...
11118,9781560254416,"[-0.13434666395187378, 0.08153267949819565, 0...."
11119,9780140110876,"[0.32915258407592773, 0.15614797174930573, 0.0..."
11120,9780140131963,"[-0.02630615048110485, 0.1311979591846466, -0...."
11121,9780060878825,"[-0.10792334377765656, 0.27181512117385864, -0..."


In [127]:
# Add the book-level columns first, while fhdf is still expected to share df's row index, and
# only then join the title embeddings. The order matters: merge returns a frame with a fresh
# 0..n-1 index, so index-aligned assignments from df made afterwards could pair the wrong rows.
# Dropping any existing 'title_embedding' first keeps the cell re-runnable (no _x/_y suffixes).
# `assign` builds a new frame, so first_hypodf (which fhdf aliases) is not modified here.
fhdf = (
    fhdf
    .drop(columns=['title_embedding'], errors='ignore')
    .assign(isbn13=df['isbn13'], publisher=df['publisher'], price=df['Price'])
    .merge(title_embeddings, on='isbn13', how='left')
)
# Keep the established column layout: 'publisher' and 'price' come last.
fhdf = fhdf[[col for col in fhdf.columns if col not in ('publisher', 'price')] + ['publisher', 'price']]

In [130]:
fhdf

Unnamed: 0,authors,average_rating,authors_embedding,title,isbn13,title_embedding,publisher,price
0,J.K. Rowling/Mary GrandPré,4.57,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Half-Blood Prince (Harry ...,9780439785969,"[-0.15156979858875275, 0.4935136139392853, 0.0...",Scholastic Inc.,12.990
1,J.K. Rowling/Mary GrandPré,4.49,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Order of the Phoenix (Har...,9780439358071,"[-0.22418256103992462, 0.20987719297409058, 0....",Scholastic Inc.,99.990
2,J.K. Rowling/Mary GrandPré,4.42,"[[-0.22978533804416656, 0.1457509994506836, -0...",Harry Potter and the Chamber of Secrets (Harry...,9780439554893,"[-0.054178591817617416, 0.22646689414978027, -...",Scholastic,24.950
3,J.K. Rowling/Mary GrandPré,4.56,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter and the Prisoner of Azkaban (Harr...,9780439655484,"[-0.2782197892665863, 0.261374831199646, -0.12...",Scholastic Inc.,7.990
4,J.K. Rowling/Mary GrandPré,4.78,"[[-0.30315670371055603, 0.11502666771411896, -...",Harry Potter Boxed Set Books 1-5 (Harry Potte...,9780439682589,"[-0.16637001931667328, 0.2694065272808075, -0....",Scholastic,40.950
...,...,...,...,...,...,...,...,...
11118,John Milton/Merritt Y. Hughes/David Scott Kastan,4.06,"[[-0.403737336397171, 0.2087896317243576, -0.1...",Expelled from Eden: A William T. Vollmann Reader,9781560254416,"[-0.13434666395187378, 0.08153267949819565, 0....",Da Capo Press,22.990
11119,Henry James/Gabriel Brownstein/Mary Cregan,4.08,"[[-0.5178908705711365, 0.5561336278915405, -0....",You Bright and Risen Angels,9780140110876,"[0.32915258407592773, 0.15614797174930573, 0.0...",Penguin Books,25.000
11120,Henry James/Gabriel Brownstein/Mary Cregan,3.96,"[[-0.5178909301757812, 0.5561334490776062, -0....",The Ice-Shirt (Seven Dreams #1),9780140131963,"[-0.02630615048110485, 0.1311979591846466, -0....",Penguin Books,24.000
11121,Henry James/Gabriel Brownstein/Mary Cregan,3.72,"[[-0.5178908705711365, 0.5561336278915405, -0....",Poor People,9780060878825,"[-0.10792334377765656, 0.27181512117385864, -0...",Ecco,29.950


# CORR COEF BETWEEN FEATURES

In [133]:
"""
Flattening the embedding to be able to work with them from now on!
"""
# Each stored embedding is converted to a 1-D NumPy array, undoing the earlier (1, n) reshape.
fhdf['authors_embedding'] = [np.array(embedding).flatten() for embedding in fhdf['authors_embedding']]

In [134]:
fhdf

Unnamed: 0,authors,average_rating,authors_embedding,title,isbn13,title_embedding,publisher,price
0,J.K. Rowling/Mary GrandPré,4.57,"[-0.30315670371055603, 0.11502666771411896, -0...",Harry Potter and the Half-Blood Prince (Harry ...,9780439785969,"[-0.15156979858875275, 0.4935136139392853, 0.0...",Scholastic Inc.,12.990
1,J.K. Rowling/Mary GrandPré,4.49,"[-0.30315670371055603, 0.11502666771411896, -0...",Harry Potter and the Order of the Phoenix (Har...,9780439358071,"[-0.22418256103992462, 0.20987719297409058, 0....",Scholastic Inc.,99.990
2,J.K. Rowling/Mary GrandPré,4.42,"[-0.22978533804416656, 0.1457509994506836, -0....",Harry Potter and the Chamber of Secrets (Harry...,9780439554893,"[-0.054178591817617416, 0.22646689414978027, -...",Scholastic,24.950
3,J.K. Rowling/Mary GrandPré,4.56,"[-0.30315670371055603, 0.11502666771411896, -0...",Harry Potter and the Prisoner of Azkaban (Harr...,9780439655484,"[-0.2782197892665863, 0.261374831199646, -0.12...",Scholastic Inc.,7.990
4,J.K. Rowling/Mary GrandPré,4.78,"[-0.30315670371055603, 0.11502666771411896, -0...",Harry Potter Boxed Set Books 1-5 (Harry Potte...,9780439682589,"[-0.16637001931667328, 0.2694065272808075, -0....",Scholastic,40.950
...,...,...,...,...,...,...,...,...
11118,John Milton/Merritt Y. Hughes/David Scott Kastan,4.06,"[-0.403737336397171, 0.2087896317243576, -0.12...",Expelled from Eden: A William T. Vollmann Reader,9781560254416,"[-0.13434666395187378, 0.08153267949819565, 0....",Da Capo Press,22.990
11119,Henry James/Gabriel Brownstein/Mary Cregan,4.08,"[-0.5178908705711365, 0.5561336278915405, -0.0...",You Bright and Risen Angels,9780140110876,"[0.32915258407592773, 0.15614797174930573, 0.0...",Penguin Books,25.000
11120,Henry James/Gabriel Brownstein/Mary Cregan,3.96,"[-0.5178909301757812, 0.5561334490776062, -0.0...",The Ice-Shirt (Seven Dreams #1),9780140131963,"[-0.02630615048110485, 0.1311979591846466, -0....",Penguin Books,24.000
11121,Henry James/Gabriel Brownstein/Mary Cregan,3.72,"[-0.5178908705711365, 0.5561336278915405, -0.0...",Poor People,9780060878825,"[-0.10792334377765656, 0.27181512117385864, -0...",Ecco,29.950


-> To correlate the embeddings with the other features, we first reduce each embedding to a scalar (here, the mean of its components).

In [135]:
# Reduce each embedding vector to one scalar: the mean over its components.
# NOTE(review): the author means sit very close to zero (about -0.001 in the table displayed
# further down), so this scalar probably keeps little of the embedding's information — confirm
# that it is the intended feature.
fhdf['mean_author_embedding'] = fhdf['authors_embedding'].apply(np.mean)
fhdf['mean_title_embedding'] = fhdf['title_embedding'].apply(np.mean)

In [136]:
fhdf['mean_author_embedding'].corr(fhdf['average_rating'])

[1;36m-0.05092749661907622[0m

# DATA PREP FOR MODELING

In [287]:
# Candidate model inputs: the scalar author-embedding summary, the raw author embedding, price and publisher.
encoded_df = fhdf[["mean_author_embedding","authors_embedding", "price", 'publisher']]

In [288]:
encoded_df

Unnamed: 0,mean_author_embedding,authors_embedding,price,publisher
0,-0.001366,"[-0.30315670371055603, 0.11502666771411896, -0...",12.990,Scholastic Inc.
1,-0.001366,"[-0.30315670371055603, 0.11502666771411896, -0...",99.990,Scholastic Inc.
2,-0.000989,"[-0.22978533804416656, 0.1457509994506836, -0....",24.950,Scholastic
3,-0.001366,"[-0.30315670371055603, 0.11502666771411896, -0...",7.990,Scholastic Inc.
4,-0.001366,"[-0.30315670371055603, 0.11502666771411896, -0...",40.950,Scholastic
...,...,...,...,...
11118,0.000349,"[-0.403737336397171, 0.2087896317243576, -0.12...",22.990,Da Capo Press
11119,-0.000150,"[-0.5178908705711365, 0.5561336278915405, -0.0...",25.000,Penguin Books
11120,-0.000150,"[-0.5178909301757812, 0.5561334490776062, -0.0...",24.000,Penguin Books
11121,-0.000150,"[-0.5178908705711365, 0.5561336278915405, -0.0...",29.950,Ecco


In [289]:

# One-hot encode the 'publisher' column: one indicator column per publisher, named
# 'publisher_<value>'. The raw author embedding is not expanded into features here.
publisher_df = pd.get_dummies(encoded_df['publisher'], prefix='publisher')

# Combine all features: the two numeric columns first, then the publisher indicators.
features_df = pd.concat(
    [encoded_df[['mean_author_embedding', 'price']], publisher_df],
    axis=1,
)

# Model inputs and target.
X = features_df
y = fhdf['average_rating']


In [290]:
# NOTE(review): same assignments as at the end of the previous cell; this cell is redundant.
X = features_df
y = fhdf['average_rating']

In [291]:
print("Shape of X:", X.shape)
print("Length of y:", len(y))


Shape of X: (11123, 2292)
Length of y: 11123


In [292]:
# Wrap X and y as DataFrames so rows can be dropped by index label.
X_df = pd.DataFrame(X)
y_df = pd.DataFrame(y, columns=['average_rating'])

# Find the rows of X that contain at least one NaN.
nan_indices = X_df[X_df.isna().any(axis=1)].index

# Drop those rows from both X and y so the two stay aligned.
X_df_cleaned = X_df.drop(nan_indices)
y_df_cleaned = y_df.drop(nan_indices)

# Convert to plain float arrays. Requesting dtype=float explicitly avoids an object-dtype
# array, which `.values` can produce when indicator (dummy) columns and float columns are
# mixed, depending on the pandas version.
X_cleaned = X_df_cleaned.to_numpy(dtype=float)
y_cleaned = y_df_cleaned.to_numpy(dtype=float).ravel()

# Check the shapes after cleaning.
print("Shape of X_cleaned:", X_cleaned.shape)
print("Shape of y_cleaned:", y_cleaned.shape)

Shape of X_cleaned: (11112, 2292)
Shape of y_cleaned: (11112,)


# MODELING

In [278]:
import sklearn
from sklearn.model_selection import train_test_split

In [293]:
# Hold out 25% of the rows for evaluation. The fixed seed makes the split, and therefore the
# reported metrics, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.25, random_state=42)

In [294]:
from sklearn.neighbors import KNeighborsRegressor

In [295]:
# k-nearest-neighbours regressor with scikit-learn's default settings.
# NOTE(review): KNN is distance-based and the features are not scaled in this notebook, so
# `price` may dominate the distance over the 0/1 publisher columns — confirm this is intended.
reg = KNeighborsRegressor()

In [296]:
# Fit on the training split; `fit` returns the estimator itself, so `model` and `reg` are the same object.
model = reg.fit(X_train, y_train)

In [297]:
model

In [298]:
# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# RESULTS (METRICS & COMMENTARIES)

In [299]:
from sklearn.metrics import mean_squared_error, r2_score

# Display (MSE, R^2) on the held-out split. A negative R^2 means the model does worse than
# always predicting the mean of y_test.
mean_squared_error(y_test, y_pred) , r2_score(y_test, y_pred)

[1m([0m[1;36m1.1786912915166128[0m, [1;36m-0.09640319235479633[0m[1m)[0m