In [2]:
%load_ext kedro.ipython

In [3]:
%reload_kedro

In [4]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go # For the waterfall plot -- cannot easily do this in matplotlib
from typing import List, Dict, Tuple, Any



In [5]:
# Show every dataset and parameter name registered in the Kedro catalog.
# NOTE(review): this cell goes through `context.catalog`, while the cells below use the
# bare `catalog` name — presumably the same object injected by kedro.ipython; confirm.
context.catalog.list()


[1m[[0m
    [32m'books_raw'[0m,
    [32m'books_loaded'[0m,
    [32m'publisher_consolidation'[0m,
    [32m'price_by_isbn_input'[0m,
    [32m'price_by_isbn'[0m,
    [32m'title_embeddings_original'[0m,
    [32m'authors_embeddings'[0m,
    [32m'description_embeddings'[0m,
    [32m'open_library_book_api_info_input'[0m,
    [32m'open_library_book_api_info'[0m,
    [32m'filtered_books'[0m,
    [32m'exclusions_summary'[0m,
    [32m'scope_waterfall_plot'[0m,
    [32m'parameters'[0m,
    [32m'params:color_gr_brown'[0m,
    [32m'params:color_gr_purple'[0m,
    [32m'params:color_gr_green'[0m,
    [32m'params:color_gr_tan_background'[0m,
    [32m'params:title_column'[0m,
    [32m'params:author_column'[0m,
    [32m'params:description_column'[0m,
    [32m'params:original_data_key_column'[0m
[1m][0m

In [26]:
# --- Datasets pulled from the Kedro catalog ---
books_input = catalog.load("books_loaded")
price_by_isbn = catalog.load("price_by_isbn")
title_embeddings = catalog.load("title_embeddings_original")
book_api_data = catalog.load("open_library_book_api_info")
authors_embeddings = catalog.load("authors_embeddings")

# --- Plot palette, stored as catalog parameters (loaded in the order listed) ---
color_gr_brown, color_gr_purple, color_gr_green, color_gr_tan_background = (
    catalog.load(param_key)
    for param_key in (
        "params:color_gr_brown",
        "params:color_gr_purple",
        "params:color_gr_green",
        "params:color_gr_tan_background",
    )
)

# --- Enrich the books with price data; the left join keeps books that have no price ---
books_input = books_input.merge(right=price_by_isbn, how="left", on="isbn13")

# Purpose of this notebook 

## This notebook is the exploratory data analysis (EDA) part of the project.
We'll propose some hypotheses and try to verify them through analysis.

In [7]:
# Alias, not a copy: `data` and `books_input` refer to the same object.
# NOTE(review): this cell's execution count (7) is older than the load cell's (26);
# re-run the notebook top to bottom so `data` points at the price-merged frame.
data = books_input

In [29]:
# Attach the title embeddings. merge() is called without `on`, so pandas performs an
# inner join on every column name the two frames share.
# NOTE(review): the next cell rebuilds `df` from `data`, so this result is discarded.
df = data.merge(title_embeddings)


In [30]:
# Attach the author-name embeddings (default inner join on the shared column names).
# NOTE(review): this merges from `data`, not from the `df` built in the previous cell,
# so the title embeddings merged there are dropped — use `df.merge(...)` here if both
# embedding columns are wanted.
df = data.merge(authors_embeddings)

In [31]:
# Sanity-check the merge: the preview should show an `authors_embedding` column.
df.head(5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Price,PriceStrategy,authors_embedding
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,12.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,99.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,24.95,LIST_PRICE,"[-0.22978533804416656, 0.1457509994506836, -0...."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,7.99,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,40.95,LIST_PRICE,"[-0.30315670371055603, 0.11502666771411896, -0..."


In [72]:
# Baseline: number of distinct raw `authors` strings before any name standardisation.
df['authors'].nunique()

[1;36m6639[0m

# First hypothesis 

An author is likely to be related to a book's rating. For example, Charles Dickens wrote classics that might all be rated about the same. So the author's name should, by default, be a significant factor in a book's average rating. Let's try to prove this assumption.

In [33]:
# Keep only the columns needed to test the author/rating hypothesis.
# .copy() detaches the subset from `df`, so the in-place `.loc` assignments made on
# `first_hypodf` later in the notebook do not raise SettingWithCopyWarning.
first_hypodf = df[['authors', 'average_rating', 'authors_embedding']].copy()

In [34]:
# describe() summarises numeric columns only by default, which is why the table
# below covers `average_rating` but not `authors` or `authors_embedding`.
first_hypodf.describe()

Unnamed: 0,average_rating
count,11123.0
mean,3.934075
std,0.350485
min,0.0
25%,3.77
50%,3.96
75%,4.14
max,5.0


## First thing we notice 
These books aren't bad at all! As the mean rating is about 4 out of 5 (3.93), we can assume the dataset contains few poorly rated books... That's a first bias we definitely need to handle.

## Hypothesis 1.a / Embeddings catch correct relations

By computing the cosine similarity, we can validate or discard the embedding technique.

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
"""
Trying to see if the embedding catches similarities between different instances of a same author
"""

# Per the df.head() preview above, row 0 is "J.K. Rowling/Mary GrandPré" and row 2 is
# "J.K. Rowling": the same author written two different ways.
one = first_hypodf['authors_embedding'].iloc[0]
two = first_hypodf['authors_embedding'].iloc[2]

In [52]:
# cosine_similarity works on 2-D inputs, so turn each embedding into a single-row matrix.
one, two = (vector.reshape(1, -1) for vector in (one, two))

In [53]:
# Both inputs are single-row matrices, so the result is a 1x1 array holding one score.
cosine_similarity(one,two)

[1;35marray[0m[1m([0m[1m[[0m[1m[[0m[1;36m0.75045805[0m[1m][0m[1m][0m[1m)[0m

A cosine similarity of 0.75 is high for two name strings that differ this much. We can use this technique to merge names together: for example, we can set a threshold of 0.70 and say that whenever the cosine similarity of two embeddings is greater than or equal to 0.70, the author is the same, possibly with a small difference (such as a collaborator like Mary GrandPré, the illustrator of the later Harry Potter books).

In [56]:
# Reshape every author embedding into a single-row matrix of shape (1, n), the 2-D
# layout cosine_similarity accepts.
# NOTE(review): standardize_author_names() flattens each embedding again before use,
# so this step looks redundant for that function — confirm before removing.

first_hypodf.loc[:, 'authors_embedding'] = first_hypodf['authors_embedding'].apply(lambda x: x.reshape(1, -1))

In [66]:
def standardize_author_names(df, threshold=0.72):
    """Merge near-duplicate author names in ``df`` using embedding similarity.

    Rows are visited in order. For each row, the cosine similarity between its
    author embedding and every row's embedding is computed, and the row takes
    the ``authors`` value of the first *other* row (lowest position) whose
    similarity is at least ``threshold``. Replacements are applied as the scan
    proceeds, so a row may inherit a name that an earlier row already adopted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise. Must contain an ``authors`` column and an
        ``authors_embedding`` column whose cells are array-like vectors of
        equal length (any shape; each one is flattened).
    threshold : float, default 0.72
        Minimum cosine similarity for two rows to be treated as the same author.

    Returns
    -------
    None
        ``df`` is modified in place: its ``authors`` column is overwritten.
    """
    # An empty frame has nothing to compare (and np.vstack rejects an empty list).
    if len(df) == 0:
        return

    authors = df['authors'].tolist()

    # Build the (n_rows, n_features) matrix once, instead of handing
    # cosine_similarity a Python list that it must re-convert on every iteration.
    embedding_matrix = np.vstack(
        [np.asarray(embedding).ravel() for embedding in df['authors_embedding']]
    )

    for i in range(len(authors)):
        # One row against all rows; [0] unwraps the (1, n_rows) result.
        similarities = cosine_similarity(embedding_matrix[i:i + 1], embedding_matrix)[0]

        is_match = similarities >= threshold
        is_match[i] = False  # a row never matches itself
        matches = np.flatnonzero(is_match)

        if matches.size:
            # Lowest-position match wins; `authors` may already hold a replaced name.
            authors[i] = authors[matches[0]]

    df.loc[:, 'authors'] = authors

In [67]:
# Rewrites the `authors` column of `first_hypodf` in place (the function returns nothing).
# One similarity pass is made per row, so expect this cell to be slow on the full dataset.
standardize_author_names(first_hypodf)

In [71]:
# Distinct author names after standardisation, to compare with the raw count taken earlier.
first_hypodf['authors'].nunique()

[1;36m1567[0m

We went from 6639 author names to 1567 in a robust manner; this is very good, as it makes the analysis much easier.