In [2]:
import pandas as pd

### Extract the books' complete metadata, including the description

In [51]:
# Load the complete book metadata export (includes the description column).
books_full_metadata = pd.read_csv('books_1.Best_Books_Ever.csv')
# Row count before any filtering. The former `head(1)` call in the middle of this
# cell was never rendered (only the last expression is displayed), so it was dropped.
len(books_full_metadata)

52478

Keep only books that are in English

In [52]:
# Keep only the rows whose `language` value is exactly "English", then report how many remain.
books_full_metadata = books_full_metadata.loc[
    books_full_metadata["language"].eq("English")
]
len(books_full_metadata)

42661

Get only the features we are interested in

In [53]:
# Keep only the columns used downstream.
# `.copy()` detaches the result from the filtered parent frame so that later column
# assignments (e.g. adding `mod_title`) operate on an independent frame and do not
# raise SettingWithCopyWarning.
books_full_metadata = books_full_metadata[
    [
        "bookId",
        "title",
        "series",
        "author",
        "description",
        "genres",
        "pages",
        "publisher",
        "firstPublishDate",
        "awards",
        "setting",
        "coverImg",
    ]
].copy()

Parse titles

In [54]:
# Build a normalized title (`mod_title`) used as the join key with the partial metadata.
# Raw strings are used for the regexes: "\s" / "\w" inside a normal string literal are
# invalid escape sequences and emit a warning on recent Python versions. The pattern
# values themselves are unchanged.
books_full_metadata['mod_title'] = books_full_metadata['title'].str.replace(r"\s+", " ", regex=True)  # Collapse runs of whitespace into a single space
books_full_metadata['mod_title'] = books_full_metadata['mod_title'].str.replace(r"[^\w\s]", "", regex=True).str.lower()  # Remove punctuation and change to lower case
# Sanity check on one known title
books_full_metadata[books_full_metadata["mod_title"] == "mobydick or the whale"]

Unnamed: 0,bookId,title,series,author,description,genres,pages,publisher,firstPublishDate,awards,setting,coverImg,mod_title
100,153747.Moby_Dick_or_the_Whale,"Moby-Dick or, the Whale",,"Herman Melville, Andrew Delbanco (Introduction...","""It is the horrible texture of a fabric that s...","['Classics', 'Fiction', 'Literature', 'Adventu...",654,Penguin Classics,10/18/51,['Audie Award for Solo Narration - Male (2006)...,"['Nantucket Island, Massachusetts (United Stat...",https://i.gr-assets.com/images/S/compressed.ph...,mobydick or the whale


Aggregate books by title: list the books that share the same normalized title (the goal is to keep only one entry per title)

In [77]:
# Collect every row whose normalized title appears more than once (all occurrences,
# not just the repeats), ordered by title so identical titles sit next to each other.
duplicate_book = books_full_metadata[
    books_full_metadata["mod_title"].duplicated(keep=False)
].sort_values("mod_title")
# Spot-check: duplicated titles containing "dune"
duplicate_book[duplicate_book["mod_title"].str.contains("dune")]

Unnamed: 0,bookId,title,series,author,description,genres,pages,publisher,firstPublishDate,awards,setting,coverImg,mod_title


In [69]:
# Number of distinct normalized titles in the full metadata (a missing title counts as one value)
books_full_metadata["mod_title"].nunique(dropna=False)

40351

### Extract the incomplete (partial) metadata of Goodreads books

In [27]:
# Load the partial metadata and store `book_id` as strings
# (the ids read from the mapping CSV further down are strings too).
books_partial_metadata = pd.read_json("books_titles.json").astype({"book_id": str})
books_partial_metadata.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls


In [28]:
# Sanity check: all partial-metadata rows that normalize to the same Moby-Dick title
books_partial_metadata.loc[
    books_partial_metadata["mod_title"].eq("mobydick or the whale")
]

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
44444,18306730,"Moby-Dick; or, The Whale",127,https://www.goodreads.com/book/show/18306730-m...,https://images.gr-assets.com/books/1376941676m...,mobydick or the whale
405862,7848,"Moby-Dick; or, The Whale",511,https://www.goodreads.com/book/show/7848.Moby_...,https://s.gr-assets.com/assets/nophoto/book/11...,mobydick or the whale
407952,6795393,"Moby-Dick: or, The Whale",30,https://www.goodreads.com/book/show/6795393-mo...,https://s.gr-assets.com/assets/nophoto/book/11...,mobydick or the whale
516352,3685701,"Moby-Dick; or, The Whale",66,https://www.goodreads.com/book/show/3685701-mo...,https://images.gr-assets.com/books/1297636408m...,mobydick or the whale
709163,9209129,"Moby-Dick; or, The Whale",6,https://www.goodreads.com/book/show/9209129-mo...,https://s.gr-assets.com/assets/nophoto/book/11...,mobydick or the whale
722571,583661,"Moby-Dick: or, The Whale",7,https://www.goodreads.com/book/show/583661.Mob...,https://s.gr-assets.com/assets/nophoto/book/11...,mobydick or the whale
851550,437183,"Moby-Dick; or, The Whale",93,https://www.goodreads.com/book/show/437183.Mob...,https://s.gr-assets.com/assets/nophoto/book/11...,mobydick or the whale
1046140,1702019,"Moby-Dick; or, The Whale",107,https://www.goodreads.com/book/show/1702019.Mo...,https://images.gr-assets.com/books/1382758877m...,mobydick or the whale
1202461,524311,"Moby-Dick; or, The Whale",247,https://www.goodreads.com/book/show/524311.Mob...,https://images.gr-assets.com/books/1309285999m...,mobydick or the whale
1258421,6453877,"Moby-Dick; or, the Whale",533,https://www.goodreads.com/book/show/6453877-mo...,https://images.gr-assets.com/books/1325275362m...,mobydick or the whale


In [29]:
# Number of distinct normalized titles in the partial metadata (a missing title counts as one value)
books_partial_metadata["mod_title"].nunique(dropna=False)

1227673

### Import the mapping between the book ids used in the CSV and those in the books_titles.json file

In [30]:
# Map the book id used in the interactions CSV (first column of book_id_map.csv)
# to the book id used in books_titles.json (second column). Both are kept as strings.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as file:  # Stream the file line by line (large file)
    next(file, None)  # Skip header; the default avoids StopIteration on an empty file
    # Iterate until EOF. The previous `while (line := file.readline().rstrip())` loop
    # stopped silently at the first blank line, dropping every mapping after it.
    for raw_line in file:
        line = raw_line.strip()
        if not line:
            continue  # Ignore blank lines instead of treating them as end of file
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

### Find the intersection between the two book metadata dataframes

In [31]:
# Inner join on the normalized title: keeps only titles present in both frames.
books_intersection_full_partial = books_full_metadata.merge(
    books_partial_metadata,
    how="inner",
    on="mod_title",
)

In [32]:
# Peek at the first merged row to check the combined column set
# (columns present in both inputs get the `_x` / `_y` suffixes).
books_intersection_full_partial.head(1)
# Disabled inspection snippet, kept for reference.
# NOTE(review): it refers to `books_metadata`, which is not defined in the visible cells.
# pd.options.display.max_colwidth = 200
# books_metadata[["title", "url"]][books_metadata["title"].str.contains("Harry Potter")]

Unnamed: 0,bookId,title_x,series,author,description,genres,pages,publisher,firstPublishDate,awards,setting,coverImg,mod_title,book_id,title_y,ratings,url,cover_image
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,Scholastic Press,,['Locus Award Nominee for Best Young Adult Boo...,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,the hunger games,14796360,The Hunger Games,11,https://www.goodreads.com/book/show/14796360-t...,https://images.gr-assets.com/books/1355036953m...


In [34]:
# Set of book ids found in both metadata sources, for fast membership tests.
intersection_book_id = set(books_intersection_full_partial["book_id"].tolist())

### Extract book ratings for the books whose description we know

In [35]:
# Stream the interactions file and keep only the ratings of books that are in
# `intersection_book_id`. Each kept entry is [user_id, book_id, rating], all strings.
APPROX_TOTAL_INTERACTIONS = 229000000  # Denominator for the progress display (presumably the approximate line count - TODO confirm)
PROGRESS_EVERY = 5000000  # Print a progress line every this many rows

known_book_ratings = []
i = 0  # Number of lines read so far; stays 0 if the file only has a header
with open("goodreads_interactions.csv", 'r') as file:
    next(file, None)  # Skip header; the default avoids StopIteration on an empty file
    # Iterate until EOF. The previous `while (line := file.readline().rstrip())` loop
    # stopped silently at the first blank line, truncating the scan.
    for i, raw_line in enumerate(file, start=1):
        line = raw_line.rstrip()
        if line:
            # Retrieve user, book id and associated rating (the 3rd and 5th fields are ignored)
            user_id, csv_book_id, _, rating, _ = line.split(",")
            # Translate the CSV book id; unmapped ids give None, which is never in the set
            book_id = csv_book_mapping.get(csv_book_id)
            if book_id in intersection_book_id:
                known_book_ratings.append([user_id, book_id, rating])
        if i % PROGRESS_EVERY == 0:
            print(f"{round(i/APPROX_TOTAL_INTERACTIONS*100,1)}% completed")

2.2% completed
4.4% completed
6.6% completed
8.7% completed
10.9% completed
13.1% completed
15.3% completed
17.5% completed
19.7% completed
21.8% completed
24.0% completed
26.2% completed
28.4% completed
30.6% completed
32.8% completed
34.9% completed
37.1% completed
39.3% completed
41.5% completed
43.7% completed
45.9% completed
48.0% completed
50.2% completed
52.4% completed
54.6% completed
56.8% completed
59.0% completed
61.1% completed
63.3% completed
65.5% completed
67.7% completed
69.9% completed
72.1% completed
74.2% completed
76.4% completed
78.6% completed
80.8% completed
83.0% completed
85.2% completed
87.3% completed
89.5% completed
91.7% completed
93.9% completed
96.1% completed
98.3% completed


In [36]:
# Number of ratings kept (ratings whose book is in the metadata intersection)
len(known_book_ratings) 
# Value observed on the recorded run: 63825044

63825044

In [37]:
# Turn the collected [user_id, book_id, rating] entries into a frame and
# convert the rating column from strings to a numeric dtype.
rating_columns = ["user_id", "book_id", "rating"]
users_ratings = pd.DataFrame(known_book_ratings, columns=rating_columns)
users_ratings["rating"] = users_ratings["rating"].pipe(pd.to_numeric)
users_ratings.head()

Unnamed: 0,user_id,book_id,rating
0,0,21,5
1,0,30,5
2,0,1022863,5
3,0,830,4
4,0,835,4


### Export the dataframes to csv

Export the user ratings of books we know the description of

In [None]:
# Export the ratings to a CSV file inside ./data/.
# The previous target "./data/" was a directory, not a file, so the export could not succeed.
# `index=False` omits the default integer row index so the file holds only the
# user_id, book_id and rating columns.
users_ratings.to_csv("./data/users_ratings.csv", index=False)

In [19]:
# One row per distinct rated book, indexed by `book_id` (order of first appearance).
rated_book_list = pd.DataFrame(
    {"book_id": users_ratings["book_id"].unique()}
).set_index("book_id")

In [20]:
# Display the index of rated book ids.
rated_book_list
# Counts noted on the recorded run:
# 224722: number of books in the intersection
# 165245: number of rated books

21
30
1022863
830
835
...
2839417
16067588
17978792
20574290
1610407


In [92]:
# Attach the partial metadata (title, ratings, url, cover image, normalized title)
# to every rated book. `books_metadata` was never defined in this notebook and raised
# a NameError on a fresh kernel; `books_partial_metadata` is the frame carrying these columns.
# `book_id` is the index of `rated_book_list` and a column of the metadata frame;
# `on` accepts index level names as well as column names.
rated_known_books = pd.merge(rated_book_list, books_partial_metadata, how='inner', on=['book_id'])

In [95]:
# Display the merged frame (the rendered output is truncated to the first and last rows).
rated_known_books

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,21,A Short History of Nearly Everything,196205,https://www.goodreads.com/book/show/21.A_Short_History_of_Nearly_Everything,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,a short history of nearly everything
1,30,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings,92172,https://www.goodreads.com/book/show/30.J_R_R_Tolkien_4_Book_Boxed_Set,https://images.gr-assets.com/books/1346072396m/30.jpg,jrr tolkien 4book boxed set the hobbit and the lord of the rings
2,1022863,Dune,206,https://www.goodreads.com/book/show/1022863.Dune,https://images.gr-assets.com/books/1464616838m/1022863.jpg,dune
3,830,Snow Crash,179029,https://www.goodreads.com/book/show/830.Snow_Crash,https://images.gr-assets.com/books/1477624625m/830.jpg,snow crash
4,835,"Harrington on Hold 'em: Expert Strategy for No-Limit Tournaments, Volume I: Strategic Play",1654,https://www.goodreads.com/book/show/835.Harrington_on_Hold_em,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,harrington on hold em expert strategy for nolimit tournaments volume i strategic play
...,...,...,...,...,...,...
226946,17610556,الاعمال الشعرية 1,8,https://www.goodreads.com/book/show/17610556-1,https://images.gr-assets.com/books/1364829484m/17610556.jpg,1
226947,17978792,Schooled,14,https://www.goodreads.com/book/show/17978792-schooled,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,schooled
226948,20574290,The Structure of Scientific Revolutions,6,https://www.goodreads.com/book/show/20574290-the-structure-of-scientific-revolutions,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,the structure of scientific revolutions
226949,1610407,Collected Poems,6,https://www.goodreads.com/book/show/1610407.Collected_Poems,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,collected poems
