In [653]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
books = pd.read_csv("//content/drive/MyDrive/Exam/BX-Books.csv", encoding='latin-1', delimiter=';', error_bad_lines=False)
ratings = pd.read_csv ("/content/drive/MyDrive/Exam/BX-Book-Ratings.csv",encoding='latin-1', delimiter=';', error_bad_lines=False)
users = pd.read_csv ("/content/drive/MyDrive/Exam/BX-Users.csv",encoding='latin-1', delimiter=';', error_bad_lines=False)              


In [655]:
#Making column names first letter be a capital letter and replace - with _

books.columns = books.columns.str.strip().str.title().str.replace('-', '_')
users.columns = users.columns.str.strip().str.title().str.replace('-', '_')
ratings.columns = ratings.columns.str.strip().str.title().str.replace('-', '_')

In [656]:
age_null = users['Age'].isnull().sum() # Sums up the 1's returned by the isnull() mask
all_users = users['User_Id'].count() # Counts the number of cells in the series (wihout the NaNs)

print(f'There are {age_null} empty age values in our set of {all_users} users (or {(age_null/all_users)*100:.2f}%).')

There are 110762 empty age values in our set of 278858 users (or 39.72%).


In [657]:
#Merging the dataframes into a new dataframe to search for the best book/country

books_by_country = pd.merge(books, ratings, on = 'Isbn', how='inner')
books_by_country = pd.merge( books_by_country, users, on ='User_Id', how = 'inner')

In [None]:
!pip install beautifulsoup4

from bs4 import BeautifulSoup

#I'm creating a function to convert HTML entities to Unicode characters

def convert_html_entities(text):
  # BeautifulSoup object with the input text and the 'html.parser' parser

    soup = BeautifulSoup(text, 'html.parser')

  #I'm using the get_text() method to extract the text without HTML tags or entities

    return soup.get_text()

books_by_country['Location'] = books_by_country['Location'].apply(convert_html_entities)

In [659]:
#Removing the * symbol from our Location column

books_by_country['Location'] = books_by_country['Location'].str.replace('*', '')

  books_by_country['Location'] = books_by_country['Location'].str.replace('*', '')


In [660]:
#Splitting the Location column into 3 columns so the data is easier to filter and read

user_location_expanded = books_by_country['Location'].str.split(',', 2, expand=True)
user_location_expanded.columns = ['City', 'State', 'Country']
books_by_country = books_by_country.join(user_location_expanded)

  user_location_expanded = books_by_country['Location'].str.split(',', 2, expand=True)


In [661]:
# Dropping the location column since we no longer need it

books_by_country = books_by_country.drop(column ='Location', axis = 1)

In [662]:
#We have too many Na values in our Country column

books_by_country = books_by_country['Country'].fillna('Unknown')

In [None]:
#Most liked books by Country
#top_ratings = books_by_country.sort_values('Book_Rating', ascending=False)[['Book_Title', 'Country']].head(50)
top_ratings = books_by_country.sort_values('Book_Rating', ascending=False).drop_duplicates('Country')[['Book_Title', 'Country']].head(50)
top_ratings

In [664]:
#Checking the ratings and grouping the data in the ratings df

#This is a very important step in our data
user_groupby = ratings.groupby("User_Id") 
book_groupby = ratings.groupby("Isbn")


average_user_rating = user_groupby["Book_Rating"].mean() # This is basicly the avg number of ratings by users
number_of_ratings_by_user = user_groupby["Book_Rating"].count() # This is the number of ratings by users

average_book_rating = book_groupby["Book_Rating"].mean() # Avg book rating
number_of_book_ratings = book_groupby["Book_Rating"].count() # The number of book ratings

#Changing the name of the columns
average_user_rating.name = "Avg_rating"
number_of_ratings_by_user.name = "N_ratings"
average_book_rating.name = "Avg_rating"
number_of_book_ratings.name = "N_ratings"

In [665]:
#Merging the data into our main dataframes

users = users.join(number_of_ratings_by_user, on="User_Id")
users = users.join(average_user_rating, on="User_Id")
books = books.join(number_of_book_ratings, on="Isbn")
books = books.join(average_book_rating, on="Isbn")


In [666]:
#Cleaning the data in the N_ratings columns for our books and users dataframes
users["N_ratings"] = users["N_ratings"].fillna(0)
books["N_ratings"] = books["N_ratings"].fillna(0)

#users['N_ratings'].sort_values() -> Float, lets make it astype(int64)

#Chaging the datatype from float to int so we can work easier

users["N_ratings"] = users["N_ratings"].astype("int64")
books["N_ratings"] = books["N_ratings"].astype("int64")

In [None]:
#The location column from Users its not that important because we are running a website, not a local shop
#Clients can purchase whatever book they like. 
#We will just filter the most and best rated books with the most and best rated authors

#books.info()


In [None]:
# Since we already have J.K Rowling as an author on our website 
# Lets see all the Harry Potter books and editions written by Rowling

books[books["Book_Title"].str.contains("Harry Potter") & books["Book_Author"].str.contains("Rowling")]

In [None]:
#Now that we have the Avg_Rating of each book, lets find the most liked ones

books[['Book_Title', 'Book_Author','N_ratings','Avg_rating']].nlargest(1000, 'Avg_rating')

In [None]:

#Considering the fact that these books have a single rating and it's a 10/10, we must find the most ratings with the best rating

#These are the best rated books but they don't have that many N_Ratings

top_rated_books = books[books['Avg_rating'] > 5]
top_rated_books = top_rated_books.nlargest(100, 'N_ratings')
top_rated_books

In [None]:
# Lets see 20 best rated books in our dataset

best_20_books = books.loc[books["N_ratings"] > 20].sort_values(by="Avg_rating", ascending=False).head(20)
best_20_books

In [None]:
#Lets find the most rated authors 

most_rated_authors = books.nlargest(30,'N_ratings')
most_rated_authors

# We found the most rated authors, but the most rated one has an Avg_Rating of 1.01 so we should not put his books on our website

In [None]:
#Lets find the best rated authors with the most N_Ratings

filtered_books = books.nlargest(60,'N_ratings')

best_rated_authors = filtered_books.groupby('Book_Author')['Avg_rating'].mean().nlargest(60).reset_index(name='Avg_rating')
best_rated_authors

#Knowing how many N_Ratings the books have, we can assume that these books are the most bought too!

In [None]:
#Using API to find a description for our best_rated_authors

!pip install Wikipedia-API
import wikipediaapi
import requests


wiki_api = wikipediaapi.Wikipedia('en')


# Function to fetch author description from Wikipedia API
def fetch_author_description(author):
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{author}"
    try:
        response = requests.get(url)
        response_json = response.json()
        description = response_json.get('extract')
        return description
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching description for {author}: {e}")
        return None

# Adding a new column 'Author-Description' to the dataframe
best_rated_authors['Author_Description'] = best_rated_authors['Book_Author'].apply(fetch_author_description)

best_rated_authors

In [None]:
#Using a function to get the book description

def fetch_book_description(title):
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    try:
        response = requests.get(url)
        response_json = response.json()
        description = response_json.get('extract')
        return description
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching description for {title}: {e}")
        return None

# Adding a new column 'Book-Description' to the dataframe
top_rated_books['Book_Description'] = top_rated_books['Book_Title'].apply(fetch_book_description)

print(top_rated_books)

In [676]:
#Trying another function to see if we get more descriptions for our books

def get_book_description(title):
    page = wiki_api.page(title)
    if page.exists():
        return page.summary
    else:
        return "Description not found"

# Add a new column for book descriptions
top_rated_books['Book-Description'] = top_rated_books['Book_Title'].apply(get_book_description)

# Display the updated dataframe
print(top_rated_books[['Book_Title', 'Book_Description']])

                                              Book_Title  \
5506   Harry Potter and the Order of the Phoenix (Boo...   
3354   The Hobbit : The Enchanting Prelude to The Lor...   
6330   Harry Potter and the Prisoner of Azkaban (Book 3)   
3839   Harry Potter and the Prisoner of Azkaban (Book 3)   
2232       Ender's Game (Ender Wiggins Saga (Paperback))   
...                                                  ...   
7582   Narcissus in Chains (Anita Blake Vampire Hunte...   
15511                      Complete Chronicles of Narnia   
19879  The Kingdom by the Sea: A Journey Around the C...   
65641                               The Narrows: A Novel   
3003                                          Truth, The   

              Book_Description  
5506                      None  
3354                      None  
6330                      None  
3839                      None  
2232                      None  
...                        ...  
7582                      None  
15511          

In [677]:
# get_book_description function got me some more descriptions for our Book-Description column.

top_books_desc = top_rated_books[top_rated_books['Book_Description'] != 'Description not found']
top_books_desc

Unnamed: 0,Isbn,Book_Title,Book_Author,Year_Of_Publication,Publisher,Image_Url_S,Image_Url_M,Image_Url_L,N_ratings,Avg_rating,Book_Description,Book-Description
5506,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,334,5.571856,,Description not found
3354,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339681.0...,http://images.amazon.com/images/P/0345339681.0...,http://images.amazon.com/images/P/0345339681.0...,281,5.007117,,Description not found
6330,0439136369,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,http://images.amazon.com/images/P/0439136369.0...,http://images.amazon.com/images/P/0439136369.0...,226,5.345133,,Description not found
3839,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,197,6.467005,,Description not found
2232,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,http://images.amazon.com/images/P/0812550706.0...,http://images.amazon.com/images/P/0812550706.0...,195,5.302564,,Description not found
...,...,...,...,...,...,...,...,...,...,...,...,...
7582,0425181685,Narcissus in Chains (Anita Blake Vampire Hunte...,Laurell K. Hamilton,2001,Berkley Publishing Group,http://images.amazon.com/images/P/0425181685.0...,http://images.amazon.com/images/P/0425181685.0...,http://images.amazon.com/images/P/0425181685.0...,29,5.448276,,Description not found
15511,0066238501,Complete Chronicles of Narnia,C. S. Lewis,2001,Harpercollins Juvenile Books,http://images.amazon.com/images/P/0066238501.0...,http://images.amazon.com/images/P/0066238501.0...,http://images.amazon.com/images/P/0066238501.0...,29,5.862069,,Description not found
19879,0140071814,The Kingdom by the Sea: A Journey Around the C...,Paul Theroux,1995,Penguin Books,http://images.amazon.com/images/P/0140071814.0...,http://images.amazon.com/images/P/0140071814.0...,http://images.amazon.com/images/P/0140071814.0...,29,5.344828,,Description not found
65641,0316155306,The Narrows: A Novel,Michael Connelly,2004,"Little, Brown",http://images.amazon.com/images/P/0316155306.0...,http://images.amazon.com/images/P/0316155306.0...,http://images.amazon.com/images/P/0316155306.0...,29,5.517241,,Description not found
