This notebook cleans the Kaggle data set into a format that can be used as input for the TF-IDF step.

In [39]:
import pandas as pd

# Load the raw Kaggle data set from the project-relative path into a
# DataFrame, then preview its first five rows.
data_file = 'data/books_1.Best_Books_Ever_2.csv'
df = pd.read_csv(filepath_or_buffer=data_file)
df.head(n=5)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",...,07/11/60,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,2269402,23328,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"['Classics', 'Fiction', 'Romance', 'Historical...","['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",...,01/28/13,[],2998241,"['1617567', '816659', '373311', '113934', '767...",94.0,"['United Kingdom', 'Derbyshire, England (Unite...",https://i.gr-assets.com/images/S/compressed.ph...,1983116,20452,
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,10/05/05,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1


In [44]:
# Build the cleaned books table: numeric Goodreads id, title, lower-cased
# blurb, and genres.

# 1) reload the Kaggle books data so this cell can be re-run on its own
books = pd.read_csv('data/books_1.Best_Books_Ever_2.csv')

# 2) keep only the columns needed for now and rename description -> blurb
books = (
    books[["bookId", "title", "description", "genres"]]
    .rename(columns={"description": "blurb"})
    .copy()
)

# 3) split the book id into its leading digits and the trailing text, so that
#    '2767052-the-hunger-games' yields '2767052' and
#    '2.Harry_Potter_and_the_Order_of_the_Phoenix' yields '2'.
#    Only the numeric part is kept in the final table.
id_slug = books["bookId"].str.extract(
    r'^(?P<goodreads_id>\d+)[\.-]?(?P<slug>.*)$'
)

# Rows whose bookId is missing or has no leading digits come back as NaN from
# the extract; report them explicitly instead of letting the int64 cast fail
# with an opaque "cannot convert NaN" error.
bad_ids = id_slug["goodreads_id"].isna()
if bad_ids.any():
    raise ValueError(
        f"{int(bad_ids.sum())} bookId value(s) do not start with a numeric id, "
        f"e.g. {books.loc[bad_ids, 'bookId'].head(3).tolist()}"
    )
books["goodreads_id"] = id_slug["goodreads_id"].astype("int64")

# 4) lower-case the blurbs. Missing descriptions are filled with "" first,
#    because astype(str) on NaN would otherwise produce the literal text "nan".
books["blurb"] = books["blurb"].fillna("").astype(str).str.lower()

# 5) construct the clean dataframe -> numeric id, title, blurb, and genres
books = books[["goodreads_id", "title", "blurb", "genres"]].copy()

In [45]:
# Preview the first five rows of the cleaned data frame.
books.head()

Unnamed: 0,goodreads_id,title,blurb,genres
0,2767052,The Hunger Games,winning means fame and fortune.losing means ce...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas..."
1,2,Harry Potter and the Order of the Phoenix,there is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',..."
2,2657,To Kill a Mockingbird,the unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ..."
3,1885,Pride and Prejudice,alternate cover edition of isbn 9780679783268s...,"['Classics', 'Fiction', 'Romance', 'Historical..."
4,41865,Twilight,about three things i was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire..."


In [46]:
# Write the cleaned table to a CSV file, leaving out the row index.
books.to_csv(path_or_buf="clean_best_books.csv", index=False)

In [48]:
# Export the genres column on its own to a CSV file, leaving out the row index.
genres = books.loc[:, "genres"].copy()
genres.to_csv(path_or_buf="genres.csv", index=False)