<a href="https://colab.research.google.com/github/DS-Jerry-in-Taiwan/project-set/blob/main/Goodreads_Analysis_and_Recommending_Books.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goodreads: Analysis and Recommending Books
---
The analysis and code refer to the [kaggle note](https://www.kaggle.com/hoshi7/goodreads-analysis-and-recommending-books).
We are going to cover the questions below:
1. Is there any relationship between a book's rating and the total number of ratings it received?
2. Where do the majority of the books lie in terms of ratings - does reading a book really bring forth a bias for the ratings?
3. Do authors tend to perform the same over time with all their newer books, or do they just fizzle out?
4. Does the number of pages make an impact on reading styles, ratings and popularity?
5. Can books be recommended based on ratings? Is that a factor which can work?


# Agenda: What's the analysis about
---

In [None]:
!pip install "isbnlib"
!pip install newspaper3k
!pip install goodreads_api_client

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import isbnlib 
from newspaper import Article
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from progressbar import ProgressBar
import re
from scipy.cluster.vq import kmeans, vq
from pylab import plot, show
from matplotlib.lines import Line2D
import matplotlib.colors as mcolors
import goodreads_api_client as gr
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings
from google.colab import drive
warnings.filterwarnings('ignore')

drive.mount("/content/drive")

In [None]:
# Work inside the Drive folder that holds the zipped dataset.
main_path="/content/drive/MyDrive/adventure_time/book data/"
os.chdir(main_path)

# Show what is in the folder (a bare expression in the middle of a cell displays nothing).
print(sorted(os.listdir()))

# Pick the archive to extract. Sorting makes the choice repeatable when several
# archives are present, and the explicit check replaces a bare IndexError.
zip_candidates = sorted(x for x in os.listdir() if x.endswith("zip"))
if not zip_candidates:
    raise FileNotFoundError("No .zip archive found in {}".format(main_path))
zip_file = zip_candidates[0]

In [None]:
# Extract the dataset archive into the current working directory.
import zipfile

path_to_zip_file = os.path.join(main_path, zip_file)

with zipfile.ZipFile(path_to_zip_file, 'r') as archive:
    # Remember the first member of the archive; it is read as the dataset later on.
    members = archive.namelist()
    name = members[0]
    print(name)
    archive.extractall(os.getcwd())


## Get basic Ideas

In [None]:
# Read the extracted CSV from the data folder.
filepath = os.path.join(main_path, name)
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. Its replacement,
# `on_bad_lines="warn"`, also skips malformed rows while reporting them (needs pandas >= 1.3).
df = pd.read_csv(filepath, on_bad_lines="warn")

In [None]:
# Report the dimensions of the dataset.
n_rows, n_cols = df.shape[0], df.shape[1]
print(f"the row number: {n_rows}\n the column number: {n_cols}")

# Preview the first rows.
df.head()

In [None]:
# Credit the co-listed entry to a single author name so those rows group with "J.K. Rowling".
df.replace("J.K. Rowling/Mary GrandPré", "J.K. Rowling", inplace=True)
df.head()

## Columns Description:

- bookID Contains the unique ID for each book/series
- title contains the titles of the books
- authors contains the author of the particular book
- average_rating the average rating of the books, as decided by the users
- ISBN ISBN(10) number, tells the information about a book - such as edition and publisher
- ISBN 13 The new format for ISBN, implemented in 2007. 13 digits
- language_code Tells the language for the books
- Num_pages Contains the number of pages for the book
- Ratings_count Contains the number of ratings given for the book
- text_reviews_count Has the count of reviews left by users

# Data Exploration Analysis
---

## Which books occur most often in the list

In [None]:
# Bar chart of the 20 titles that appear most often in the dataset.
sns.set_context('poster')
plt.figure(figsize=(20,20))
title_counts = df['title'].value_counts()
books = title_counts.head(20)

sns.barplot(x=books, y=books.index, palette='deep')
plt.title("Most Occurring Books")
plt.xlabel("Number of occurances")
plt.ylabel("Books")


## What's the distribution of languages

In [None]:
# Bar chart of how many titles exist per language code, with the count written above each bar.
sns.set_context('paper')
plt.figure(figsize=(50,20))
titles_per_language = df.groupby('language_code')['title'].count()
ax = titles_per_language.plot.bar()
plt.title('Language Code')
plt.xticks(fontsize = 30)
for bar in ax.patches:
  bar_height = bar.get_height()
  # Shift the label slightly left of the bar edge and lift it above the bar top.
  label_position = (bar.get_x()-0.3, bar_height+100)
  ax.annotate(str(bar_height), label_position, fontsize = 20, color = 'k')


## what are the top 10 rated books
---

In [None]:
# The ten books with the highest number of ratings, indexed by title.
most_rated = df.sort_values("ratings_count", ascending=False).head(10).set_index("title")
plt.figure(figsize=(30,30))
plt.yticks(fontsize=20)
# seaborn >= 0.12 only accepts `data` positionally, so x and y are passed by keyword.
ax = sns.barplot(data=most_rated.reset_index(), x="ratings_count", y="title", palette='rocket')
ax.set(title="Most Rated Books", xlabel="Number of ratings", ylabel="Books");

## Who are the 10 authors with the most books?
---

In [None]:
plt.figure(figsize=(30,30))
# Count titles per author and keep the ten authors with the most entries.
most_book = (
    df.groupby('authors')['title']
    .count()
    .reset_index()
    .sort_values("title", ascending=False)
    .head(10)
    .set_index("authors")
)
# seaborn >= 0.12 only accepts `data` positionally, so x and y are passed by keyword.
ax = sns.barplot(data=most_book.reset_index(), x='title', y='authors', palette='icefire_r')
# The 'title' column holds a count here, so label the axes explicitly.
ax.set(xlabel="Number of books", ylabel="Authors")
plt.yticks(fontsize=15)

# Write each author's book count just past the end of its bar.
for i in ax.patches:
    ax.text(i.get_width()+.3, i.get_y()+0.5, str(round(i.get_width())), fontsize = 20, color = 'k')

## Getting overall performance for an author over time
---
- First, create functions to get the information of books from isbn

In [None]:
# Helpers for getting book details from the ISBN 13 value.

# Alternate scraping solution, used when both API lookups fail.
def html(isbn):
    """Download the isbndb.com page for `isbn` and return a fixed slice of its HTML.

    The page is fetched and parsed with newspaper's `Article`; the returned string
    is characters 9300-9899 of the page markup.
    """
    page = Article('https://isbndb.com/book/' + str(isbn))
    page.download()
    page.parse()
    # Fixed character window into the markup; presumably the region holding the
    # publication details — TODO confirm against the current page layout.
    return page.html[9300:9900]

def reg(l):
    """Return the first standalone four-digit number found in `l`, as a string.

    Args:
        l: text to search (for example a slice of scraped HTML).

    Returns:
        The complete four-digit match, e.g. ``"2005"``.

    Raises:
        ValueError: if `l` contains no standalone four-digit number.
    """
    match = re.search(r'(\b\d{4})\b', l)
    if match is None:
        raise ValueError("no four-digit year found in text")
    # Return the whole match (all four digits), not just its first character.
    return match.group()

# Gathering the data for the year column for the books from their ISBN 13 values
def bookdata(df):
    """Look up a publication year for every value in ``df.isbn13``.

    Each ISBN is tried against three sources in turn: ``isbnlib.meta``, the
    Goodreads client, and finally a scrape of isbndb.com via ``html``/``reg``.
    The first source that succeeds supplies the year; if all fail, the
    placeholder ``'0'`` is recorded.

    Args:
        df: object exposing an ``isbn13`` attribute that can be iterated.

    Returns:
        A list with one entry per ISBN, in iteration order.
    """
    year = []
    pbar = ProgressBar()
    for isbn in pbar(df.isbn13):
        try:
            details = isbnlib.meta(isbn)
            year.append(details['Year'])
        except Exception:
            # Trying out with goodreads api now
            try:
                # NOTE(review): `client` is not defined in the visible notebook, so this
                # lookup raises NameError and falls through to scraping — confirm where
                # the Goodreads client is supposed to be created.
                book_detail = client.Book.show_by_isbn(isbn)
                year.append(book_detail['publication_year'])
            except Exception:
                # Going with webscraping
                try:
                    # Store the year extracted from the page, not the raw HTML slice.
                    year.append(reg(html(isbn)))
                except Exception:
                    year.append('0')

    return year

def plot_author_chart(author_df):
    """Plot an author's average book ratings by publication year.

    A publication year is looked up for every row with `bookdata` and attached
    with `final_df`. Rows whose year is missing, non-numeric or the '0'
    placeholder are dropped, and the rest are drawn as a bar chart of
    `average_rating` against `Year`, ordered by year.

    Args:
        author_df: rows of the books frame for one author. The code reads the
            `isbn13`, `authors`, `average_rating` and `title` columns.

    Returns:
        The matplotlib Axes holding the bar chart.
    """
    year = bookdata(author_df)
    author_df = final_df(author_df, year)
    # `axis` must be passed by keyword: pandas 2.0 made dropna's arguments keyword-only.
    author_df = author_df.dropna(axis=0)
    author_df = author_df[author_df['Year'].str.isnumeric()]
    author_df = author_df[author_df.Year != '0']
    # Order by year so the bars read left to right over time.
    author_df = author_df.set_index('title').sort_values('Year')
    plt.figure(figsize=(15,15))
    sns.set_context('talk')
    # seaborn >= 0.12 only accepts `data` positionally, so x and y are passed by keyword.
    ax = sns.barplot(data=author_df, x='Year', y='average_rating', palette='deep')
    # Take the name positionally; label-style `[1]` fails when only one row is left.
    author_name = author_df['authors'].iloc[0] if len(author_df) else ''
    ax.set_title("Average rating of books over time, " + author_name)
    plt.xticks(rotation=30)
    return ax



# The function for getting the final dataframe for the charts
def final_df(df1, l):
    """Attach a `Year` column built from `l` to the chart columns of `df1`.

    `df1` is re-indexed from 0 so that its rows line up positionally with the
    entries of `l`; only `authors`, `average_rating` and `title` are kept.
    """
    chart_columns = ['authors', 'average_rating', 'title']
    renumbered = df1.reset_index(drop=True)
    return renumbered[chart_columns].join(pd.DataFrame(l, columns=['Year']))

- Second, get the subset of authors

In [None]:
# Ten authors with the most rows in the dataset, as columns `author` / `count`.
# rename_axis + reset_index(name=...) gives the same column names on every pandas
# version (value_counts() changed its default names in pandas 2.0).
author_count = (
    df['authors']
    .value_counts()
    .head(10)
    .rename_axis('author')
    .reset_index(name='count')
)
# Let's test the top 4 authors and "J.K. Rowling" performance
top_4 = list(author_count['author'][:4])
if "J.K. Rowling" not in top_4:
    top_4.append("J.K. Rowling")



In [None]:
# Scratch check: echo every ISBN-13 in the dataset and query isbnlib for it.
# NOTE(review): the lookup result is discarded and any lookup error stops the loop.
pbar = ProgressBar()
for isbn13 in pbar(df.isbn13):
  print(isbn13)
  isbnlib.meta(isbn13)

Ranging from authors spanning a few decades with their titles to some of the current favourites, let's see how they fared over time.



In [None]:
# Scratch cell: for each ISBN-13 of one author, fetch the isbndb.com page and print the
# same character slice of its HTML that html() returns.
# NOTE(review): `author1` is only assigned in commented-out code in the visible notebook,
# so this loop raises NameError on a fresh kernel — confirm where it should come from.
# NOTE(review): `string` is imported here but not used in this cell.
import string
pbar = ProgressBar()
for isbn in pbar(author1.isbn13):
  print(isbn)
  # html(isbn)
  url = "".join(['https://isbndb.com/book/',str(isbn)])
  article = Article(url)
  article.download()
  article.parse()
  ar = article.html
  # Keep only characters 9300-9899 of the page markup.
  ar = ar[9300:9900]
  # print(reg(ar))
  print(ar)


In [None]:
# Preview the English-language rows of the first author in `top_4`.
english_rows = df['language_code'] == 'eng'
first_author_rows = df['authors'] == top_4[0]
df[english_rows & first_author_rows]

In [None]:
# For the sake of brevity, only english language has been used.
author_df = df[(df['authors']==top_4[0]) & (df['language_code']=='eng')]
# author_df = author_df[author_df['language_code']=='eng']
plot_author_chart(author_df)

# author1 = df[(df['authors'] == top_4[0]) & (df['language_code']=='eng')]
# plot_author_chart(author1)

In [None]:
# English-language rows of the first author in `top_4`, selected with a single mask.
author_df = df.loc[(df['authors'] == top_4[0]) & (df['language_code'] == 'eng')]

In [None]:
# Step-by-step version of the data preparation done inside plot_author_chart:
# look up the years, attach them, and drop rows without a usable year.
year = bookdata(author_df)
author_df = final_df(author_df, year)
# `axis` must be passed by keyword: pandas 2.0 made dropna's arguments keyword-only.
author_df = author_df.dropna(axis=0)
author_df = author_df[author_df['Year'].str.isnumeric()]
author_df = author_df.set_index('title')
author_df = author_df[author_df.Year !='0']
plt.figure(figsize=(15,15))
sns.set_context('talk')
plt.xticks(rotation=30)



# Topic Modelling
---
KMeans Clustering without outliers
The goal is to find groups in the data.
With this, I attempt to find a relationship or groups between the rating count and the average rating.

In [None]:
# Feature matrix for clustering: one row per book,
# column 0 = average_rating, column 1 = ratings_count.
trial = df[['average_rating', 'ratings_count']]
rating_values = trial['average_rating'].to_numpy()
count_values = trial['ratings_count'].to_numpy()
data = np.vstack((rating_values, count_values)).T


In [None]:
# Elbow analysis: fit KMeans for k = 2..29 and record the inertia of each fit.
X = data
distortions = []
for k in range(2,30):
  # A fixed random_state makes the curve reproducible across runs; n_init is pinned
  # because its default differs between scikit-learn versions.
  k_means = KMeans(n_clusters=k, n_init=10, random_state=42)
  k_means.fit(X)
  distortions.append(k_means.inertia_)

In [None]:
# Plot inertia against the number of clusters to look for the elbow.
fig = plt.figure(figsize=(15,10))
cluster_counts = range(2,30)
plt.plot(cluster_counts, distortions, 'bx-')
plt.title('Elbow Curve')

In [None]:
# Run scipy's k-means for 5 clusters, then assign every observation to its nearest centroid.
# NOTE(review): no seed is passed, so cluster numbering may differ between runs.
centroids, _ = kmeans(data, 5)
idx, _ = vq(data, centroids)
centroids

In [None]:
# Quick check of the container type of the clustering input.
type(data)

In [None]:
# Scatter plot of the observations, one colour per cluster (boolean-mask indexing on `idx`).
sns.set_context('paper')
plt.figure(figsize=(20,20))

# Format string per cluster id: red, blue, yellow, magenta, black circles.
marker_formats = ('or', 'ob', 'oy', 'om', 'ok')
scatter_args = []
for cluster_id, fmt in enumerate(marker_formats):
    members = idx == cluster_id
    scatter_args.extend((data[members, 0], data[members, 1], fmt))
plt.plot(*scatter_args)

In [None]:
# Scatter plot of the five clusters with their centroids (green squares) and a colour legend.
sns.set_context('paper')
plt.figure(figsize=(15,10))

# Colour name and matching matplotlib format string for cluster ids 0..4.
cluster_colors = ('red', 'blue', 'yellow', 'magenta', 'black')
cluster_formats = ('or', 'ob', 'oy', 'om', 'ok')

series = []
for cluster_id, fmt in enumerate(cluster_formats):
    in_cluster = idx == cluster_id
    series += [data[in_cluster, 0], data[in_cluster, 1], fmt]
plt.plot(*series)
plt.plot(centroids[:,0], centroids[:,1], 'sg', markersize=8)

# Proxy markers so that the legend shows one coloured dot per cluster.
legend_handles = tuple(
    Line2D(range(1), range(1), color=shade, linewidth=0, marker='o', markerfacecolor=shade)
    for shade in cluster_colors
)
legend_labels = tuple('Cluster {}'.format(number) for number in range(1, 6))
plt.legend(legend_handles, legend_labels, numpoints=1, loc=0)

plt.show()