In [1]:
#importing necessary modules that will be used to open, read, and modify the database.
import json
import gzip
import os
import pandas as pd
import numpy as np
import warnings
import re
import urllib
#removing unneccessary warnings to clear up console
def removeWarning():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    removeWarning()

In [2]:
link = "http://pythonpedia.in/goodreads_books_children.json.gz"
f = urllib.urlopen(link)
myfile = f.read()
with gzip.open(myfile, 'r') as f: #opening the gzip file and reading it
    line = f.readline()

In [3]:
#defining what columns are necessary (this only reads the first line of the json file)
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"], #the ID of the book
        "title": data["title_without_series"], #the title of the book without mentioning the name of the series
        "ratings": data["ratings_count"], #the number of ratings
        "url": data["url"], #the goodreads url of the book
        "cover_image": data["image_url"], #the url of the image
        "description": data["description"] #the description of the book
    }

In [4]:
book_titles = []
with gzip.open(myfile, 'r') as f: #reopening the gzip file for saving it in an array.
    while True: #looping till break is called
        line = f.readline()
        if not line: #checking if there are any more lines left in the file
            break #stopping the loop if there are none
        fields = parse_fields(line) #converting from json to pandas dataframe
        try:
            ratings = int(fields["ratings"]) #converting the ratings into integer form if some are not in integer form
        except ValueError: #removing the data if cannot be converted into integer form nd the compiler ran into an error
            continue
        if ratings > 15: #removing the books which have lesser than 15 ratings since we do not have enough data on them
            book_titles.append(fields)

In [5]:
#applying necessary changes to the DataFrame for the data to be just as we need for display and modification
titles = pd.DataFrame.from_dict(book_titles)
new_titles = titles
titles["ratings"] = pd.to_numeric(titles["ratings"])
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
"""removing uneccessary characters from the title and keeping only A-Z and 0-9 characters and saving it in mod_title"""
titles["mod_title"] = titles["mod_title"].str.lower() #lowering the case of the modified title so that each title can be compared while performing a search for better results
titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex=True)
titles = titles[titles["mod_title"].str.len() > 0] #removing the titles which are blank
titles.sort_values(by=["ratings"]) #sorting by the number of ratings recieved

#removing the duplicate titles
titles.drop_duplicates(subset=["title"], keep = False, inplace=True) 
titles.drop_duplicates(subset=["mod_title"], keep = False, inplace=True)
print(titles)
titles["book_id"]

       book_id                                           title  ratings  \
0       287141                   The Aeneid for Boys and Girls       46   
2        89378                                      Dog Heaven     1331   
4      2592648          It's Funny Where Ben's Train Takes Him       21   
5      3631900             Amadi's Snowman: A Story of Reading       44   
6      8030991                            Katso eteesi, Lotta!       34   
...        ...                                             ...      ...   
82135  2437551                      Little Whistle's Christmas       52   
82136  4462101                       Gus Was A Christmas Ghost       19   
82138   823094                  To Root, to Toot, to Parachute      240   
82141   331839  Jacqueline Kennedy Onassis: Friend of the Arts       18   
82142  2342551        The Children's Classic Poetry Collection       36   

                                                     url  \
0      https://www.goodreads.com/book/s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles.drop_duplicates(subset=["title"], keep = False, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles.drop_duplicates(subset=["mod_title"], keep = False, inplace=True)


0         287141
2          89378
4        2592648
5        3631900
6        8030991
          ...   
82135    2437551
82136    4462101
82138     823094
82141     331839
82142    2342551
Name: book_id, Length: 58703, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer #importing the module that will allow us to do machine learning

In [7]:
#creating the first feature: similar books, based on the description

vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["description"])#creating a vector to initialize machine learning based on the description of the book

pd.options.display.max_colwidth = 5000 # making panda display 5000 characters of the description in the database

vectorizer2 = TfidfVectorizer()
tfidf2 = vectorizer2.fit_transform(titles["title"])


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import re
def get_description_recos(book_id, vectorizer, description): #the function that uses a similarity function
    y = description
    vectorized_q = vectorizer.transform([y])
    similarity = cosine_similarity(vectorized_q, tfidf).flatten() #finding the similarity between the book's description and other book's description
    #this finds how similar one book is to the other
    titles["similarity"] = similarity
    indices = np.argpartition(similarity, -10)[-10:] #displaying the top 10 similar books
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending = False)
    results.drop(results[results["similarity"] >=0.9].index, inplace=True) #removing the books which have 90%+ similarity since they tend to be the same book
    return results.head(10)
theStartingTitles = pd.read_csv('starter_books.csv') #reading the books that have been selected to be displayed originally

def searchClosestTitle(title, vectorizer):
    y = title
    vectorized_q = vectorizer2.transform([y])
    Sim = cosine_similarity(vectorized_q, tfidf2).flatten()
    indices = np.argpartition(Sim, -10)[-10:]
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    return results.head(10)

    



In [9]:
try:
    x = pd.read_csv('likedBooks.csv') #checking if a likedBooks database already exists
    y = x.set_axis(['count', 'book_id'], axis=1, copy = False) #swapping the column headings since they were labled wrongly
    liked_books = set(y['book_id'])
except:
    liked_books = set() #if a liked books database does not exist, create a set which will be stored later into likedBooks.csv

In [10]:
def runFurther(): #running the third feature
    %run furtherRecos.ipynb 
    return recos


In [11]:

pip install Pillow==9.0.0

Collecting Pillow==9.0.0
  Downloading Pillow-9.0.0-cp310-cp310-macosx_11_0_arm64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: Pillow
  Attempting uninstall: Pillow
    Found existing installation: Pillow 9.3.0
    Uninstalling Pillow-9.3.0:
      Successfully uninstalled Pillow-9.3.0
Successfully installed Pillow-9.0.0
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [12]:
pip install tkscrolledframe

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [13]:
from tkinter import *
from tkinter import messagebox
import tkinter as tk
import urllib.request
from PIL import Image, ImageTk
import io
from tkscrolledframe import ScrolledFrame
import requests
from tkinter import ttk
import threading
class App(tk.Frame):
    def __init__(self, master=None):
        super().__init__(master)
        self.master = master
        self.master.geometry("2560x1600")
        self.grid(pady = 10)
        self.master.grid_columnconfigure(0, weight=1)
        self.create_widgets()
        self.the_books = new_titles
        self.master.title("Book Recommendation System - Computer Science Internal Assessment")


    def create_widgets(self):
        #Create the back button
        self.back_button = tk.Button(self, text="Back to Home", command=self.back)
        self.back_button.grid(column=2, row=0, pady=10)
        self.showLikedBtn = tk.Button(root, text = "View Liked Books", command = self.showLiked)
        self.showLikedBtn.grid(column = 1, row = 0, pady = 10, sticky = "ne")
        #opening the images for the home page
        Img = Image.open("like.png")
        Img = Img.resize((300,300), Image.LANCZOS)
        self.likeImg = ImageTk.PhotoImage(Img, master = self)   
        Img1 = Image.open("singleBk.png")
        Img1 = Img1.resize((300,300), Image.LANCZOS)
        self.likeImg1 = ImageTk.PhotoImage(Img1, master = self)
        Img2 = Image.open("bkreco.png")
        Img2 = Img2.resize((300,300), Image.LANCZOS)
        self.likeImg2 = ImageTk.PhotoImage(Img2, master = self)
        # Create the three centered buttons
        self.button1 = tk.Button(self, text="Like more books", command = self.load_like_grid, image = self.likeImg, compound = TOP)
        self.button1.grid(column=1, row=1, padx=10, pady=10)
        self.button1.grid_columnconfigure(0, weight = 1)
        self.button2 = tk.Button(self, text="View similar books", command=self.load_desc_grid, image = self.likeImg1, compound = TOP)
        self.button2.grid(column=2, row=1, padx=10, pady=10)
        self.button2.grid_columnconfigure(0, weight = 1)
        self.button3 = tk.Button(self, text="View recommendations based on liked books", command = self.load_furtherRecos, image = self.likeImg2, compound = TOP)
        self.button3.grid(column=3, row=1, padx=10, pady=10)
        self.button3.grid_columnconfigure(0, weight = 1)

        # Create the "Search" button below the three buttons
        self.search_button = tk.Button(self, text="Search", command=self.load_search)
        self.search_button.grid(column=2, row=2, pady=10)
    def load_image(self, url):   #to load the images of the books
        with urllib.request.urlopen(url) as u:
            raw_data = u.read()
        im = Image.open(io.BytesIO(raw_data))
        self.theImage = ImageTk.PhotoImage(im)
        return self.theImage
    def load_desc_grid(self):
        # Create a new page with a grid of 50 buttons
        self.clear_widgets()
        self.scroll_frame = ScrolledFrame(self, height=self.winfo_screenheight()-200, width = self.winfo_screenheight()-15)
        self.scroll_frame.grid(column = 2, row = 1)
        self.scroll_frame.bind_arrow_keys(root)
        self.scroll_frame.bind_scroll_wheel(root)
        self.inner_frame = self.scroll_frame.display_widget(Frame)

        for i in range(50):
            button = tk.Button(self.inner_frame, text=theStartingTitles["title"][i], compound = "top", bg = 'systemTransparent')
            url = theStartingTitles["cover_image"][i]
            name = theStartingTitles["title"][i]
            image = self.load_image(url)
            button.grid(column=i%5, row=(i+5)//5, padx=5, pady=5)
            button.config(command=lambda button=[name, url, i]: self.load_desc_subpage(button[0], button[1], button[2]), image = image)
            button.image = image
            button.config(wraplength = image.width(), justify="left")
    def load_like_grid(self):
        self.clear_widgets()
        self.scroll_frame = ScrolledFrame(self, height=self.winfo_screenheight()-200, width = self.winfo_screenheight()-15)
        self.scroll_frame.grid(column = 2, row = 1)
        self.scroll_frame.bind_arrow_keys(root)
        self.scroll_frame.bind_scroll_wheel(root)
        self.inner_frame = self.scroll_frame.display_widget(Frame)

        for i in range(50):
            button = tk.Button(self.inner_frame, text=theStartingTitles["title"][i], compound = "top", highlightthickness=0, bg= 'blue')
            url = theStartingTitles["cover_image"][i]
            image = self.load_image(url)
            button.grid(column=i%5, row=(i+5)//5, padx=5, pady=5)
            button.config(command=lambda button=[button, i]: self.addToLiked(button[1]), image = image)
            button.image = image
            button.config(wraplength = image.width(), justify="left")
        tk.Button(self.inner_frame, text = "Done!", bg = "green", command = self.onDone).grid(row=11, column=0)
    def onDone(self):
        pd.DataFrame(liked_books).to_csv('likedBooks.csv') #storing the liked books list into a database
        self.clear_widgets()
        self.create_widgets()
    
    def addToLiked(self, i):
        liked_books.add(theStartingTitles["book_id"][i])
    def removeFromLiked(self, i):
        liked_books.remove(theStartingTitles["book_id"][i])
    def addItToLiked(self, id):
        liked_books.add(str(id))
    def removeItFromLiked(self, id):
        try:
            liked_books.remove(int(id))
        except:
            print("id: " + str(id))
            print(liked_books)
            
    def load_new_subpage(self, name, url, id):
        self.clear_widgets()
        title = name
        image = self.load_image(url)
        description = self.the_books.loc[self.the_books["book_id"] == id]["description"].to_string()
        x = self.the_books.loc[self.the_books["book_id"] == id]["book_id"].to_string().split(" ")[-1]
        recommendations = get_description_recos(x, vectorizer, description)
        rec_title = recommendations["title"]
        # Add heading and image
        tk.Label(self, text=title, font=("Helvetica", 18, "bold")).grid(row=0, column=0, padx=20, pady=20, sticky='w')
        tk.Label(self, image=image).grid(row=1, column=0, padx=20, pady=20, sticky='w')
        Img = Image.open("like.png")
        Img = Img.resize((50,50), Image.LANCZOS)
        self.likeImg = ImageTk.PhotoImage(Img, master = self)
        self.addToLikedBtn = tk.Button(self, text = "Add to liked books", image = self.likeImg, command = TOP)

        self.addToLikedBtn.grid(row = 2, column = 1, padx = 20, pady = 20)
        self.addToLikedBtn.config(command = lambda:self.addItToLiked(id))
        DisImg = Img.rotate(180)
        self.dislikeImg = ImageTk.PhotoImage(DisImg, master = self)
        self.removeFromLikedBtn = tk.Button(self, text = "Remove from liked books", image = self.dislikeImg, command = TOP)
        self.removeFromLikedBtn.grid(row = 2, column = 2, padx = 20, pady = 20)
        self.removeFromLikedBtn.config(command = lambda:self.removeItFromLiked(id))
        # Add dummy text
        tk.Label(self, text=description, wraplength = 500, justify = 'left').grid(row=1, column=1, padx=20, pady=20, sticky='w')
        
        # Add recommendation list
        tk.Label(self, text="Recommendation Titles", font=("Helvetica", 16, "bold")).grid(row=2, column=0, padx=20, pady=20, sticky='w')
        x = 0
        for i in rec_title:
            new_Btn = tk.Button(self, text = str(i))
            new_Btn.grid(row=x+3, column=0, padx=20, pady=0, sticky='w')
            new_Btn.config(command = lambda new = [str(i), recommendations["cover_image"].iloc[x], recommendations["book_id"].iloc[x]]:self.load_new_subpage(new[0], new[1], new[2]))
            x = x+1
    
    def load_desc_subpage(self, name, url, num):
        self.clear_widgets()
        title = name
        image = self.load_image(url)
        description = theStartingTitles["description"][num]
        currTitle = theStartingTitles["title"][num]
        x = theStartingTitles.loc[theStartingTitles["title"] == currTitle]["book_id"].to_string().split(" ")[-1]
        recommendations = get_description_recos(x, vectorizer, theStartingTitles["description"][num])
        rec_title = recommendations["title"]
        # Add heading and image
        tk.Label(self, text=title, font=("Helvetica", 18, "bold")).grid(row=0, column=0, padx=20, pady=20, sticky='w')
        tk.Label(self, image=image).grid(row=1, column=0, padx=20, pady=20, sticky='w')
        Img = Image.open("like.png")
        Img = Img.resize((50,50), Image.LANCZOS)
        self.likeImg = ImageTk.PhotoImage(Img, master = self)
        self.addToLikedBtn = tk.Button(self, text = "Add to liked books", image = self.likeImg, command = TOP)
        self.addToLikedBtn.grid(row = 2, column = 1, padx = 20, pady = 20)
        self.addToLikedBtn.config(command = lambda:self.addToLiked(num))
        disImg = Img.rotate(180)
        self.dislikeImg = ImageTk.PhotoImage(disImg, master = self)
        self.removeFromLikedBtn = tk.Button(self, text = "Remove from liked books", image = self.dislikeImg, command = TOP)
        self.removeFromLikedBtn.grid(row = 2, column = 2, padx = 20, pady = 20)
        self.removeFromLikedBtn.config(command = lambda:self.removeFromLiked(num))
        # Add dummy text
        tk.Label(self, text=description, wraplength = 500, justify = 'left').grid(row=1, column=1, padx=20, pady=20, sticky='w')
        
        # Add recommendation list
        tk.Label(self, text="Recommendation Titles", font=("Helvetica", 16, "bold")).grid(row=2, column=0, padx=20, pady=20, sticky='w')
        x = 0
        for i in rec_title:
            new_Btn = tk.Button(self, text = str(i))
            new_Btn.grid(row=x+3, column=0, padx=20, pady=0, sticky='w')
            new_Btn.config(command = lambda new = [str(i), recommendations["cover_image"].iloc[x], recommendations["book_id"].iloc[x]]:self.load_new_subpage(new[0], new[1], new[2]))
            x = x+1
    # Load the further recommendations in a separate thread
    def load_recommendations(self, y):
        x = runFurther()
        print(x)
        for i in range(len(x)):
            title = x["title"].iloc[i]
            self.button = tk.Button(self, text=title, command=lambda i=i: self.load_new_subpage(x["title"].iloc[i], x["cover_image"].iloc[i], x["book_id"].iloc[i]))
            self.button.grid(column=2, row=i+1, padx=10, pady=5)
        y.destroy()
    def load_furtherRecos(self):
        self.clear_widgets()
        self.loading_label = tk.Label(self, text="Loading further recommendations, Please Wait...")
        self.loading_label.grid(column=2, row=1, columnspan=3, padx=10, pady=10)
        self.after(200, lambda: self.load_recommendations(self.loading_label))

   
    def load_search(self):
        # Create a new page with a search bar and a submit button
        self.clear_widgets()
        self.search_bar = tk.Entry(self)
        self.search_bar.grid(column=1, row=1, padx=10, pady=10)
        self.submit_button = tk.Button(self, text="Submit", command=lambda: self.load_results(self.search_bar.get()))
        self.submit_button.grid(column=2, row=1, padx=10, pady=10)

    def load_results(self, title):
        # Create a list of 10 further buttons
        self.clear_widgets()
        x = searchClosestTitle(title, vectorizer2)
        for i in range(len(x)):
            button = tk.Button(self, text=x["title"].iloc[i])
            button.grid(row = 1+i, column = 2, pady=5)
            button.config(command = lambda new = [x["title"].iloc[i], x["cover_image"].iloc[i], x["book_id"].iloc[i]]:self.load_new_subpage(new[0], new[1], new[2]))

    def clear_widgets(self):
        # Helper function to clear all widgets from the frame
        for widget in self.grid_slaves():
            if(widget != self.back_button):
                widget.destroy()
                self.back_button.grid(column=2, row=0, pady=10)
    def back(self):
        self.clear_widgets()
        self.create_widgets()

    def showLiked(self):
        self.clear_widgets()
        books = []

        with gzip.open(myfile, 'r') as f: #reopening the gzip file for saving it in an array.
            while True: #looping till break is called
                line = f.readline()
                if not line: #checking if there are any more lines left in the file
                    break #stopping the loop if there are none
                fields = parse_fields(line) #converting from json to pandas dataframe
                books.append(fields)
        all_books = pd.DataFrame.from_dict(books)
        all_books["book_id"] = all_books["book_id"].astype(str)
        we = list(liked_books)
        theLiked = [str(i) for i in we]
        all_liked = all_books[all_books["book_id"].isin(theLiked)]
        for i in range(len(all_liked)):
            title = all_liked["title"].iloc[i]
            print(all_liked["book_id"].iloc[i])
            self.button = tk.Button(self, text=title, command=lambda i=i: self.load_new_subpage(all_liked["title"].iloc[i], all_liked["cover_image"].iloc[i], all_liked["book_id"].iloc[i]))
            self.button.grid(column=2, row=i+1, padx=10, pady=5)
        if(len(all_liked) == 0):
            tk.Label(self, text = "You have to like some books first!").grid(column = 2, row = 1)
            tk.Button(self, text = "Click to like books", command = self.load_like_grid).grid(column = 2, row = 2)

    


# Create the main window and start the event loop

root = tk.Tk()
app = App(master=root)
app.mainloop()
