# Importing the dependencies and loading the dataset into a DataFrame:
## Then selecting the entries whose title begins with "Harry Potter" and storing them in a separate variable.

In [82]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Load the book catalogue, then discard exact duplicate rows and incomplete rows.
# NOTE(review): dropna() with default arguments removes a row when ANY column
# is missing, not only the columns used in this analysis.
df = (
    pd.read_csv('books.csv')
    .drop_duplicates()
    .dropna()
)

# Boolean mask: True where the original title begins with "Harry Potter".
title_starts_with_hp = df['original_title'].str.startswith('Harry Potter')
HarryPotter_Books = df[title_starts_with_hp]

print("All Harry Potter books:\n")
print(HarryPotter_Books)

All Harry Potter books:

     book_id  goodreads_book_id  best_book_id   work_id  books_count  \
1          2                  3             3   4640799          491   
6         18                  5             5   2402163          376   
8         21                  2             2   2809203          307   
9         23              15881         15881   6231171          398   
10        24                  6             6   3046572          332   
11        25             136251        136251   2963218          263   
12        27                  1             1  41335427          275   
613     3753                 10            10  21457570            6   

           isbn        isbn13                                  authors  \
1     439554934  9.780440e+12              J.K. Rowling, Mary GrandPré   
6    043965548X  9.780440e+12  J.K. Rowling, Mary GrandPré, Rufus Beck   
8     439358078  9.780439e+12              J.K. Rowling, Mary GrandPré   
9     439064864  9.780439e+12 

# Finding the best-selling Harry Potter book based on the number of work ratings:

In [83]:
# Find the row whose work_ratings_count is the largest among the Harry Potter
# entries; that count is what the summary text reports as "sales".
ratings_per_book = HarryPotter_Books['work_ratings_count']
mostSelling_index = ratings_per_book.idxmax()
mostSelling = HarryPotter_Books.loc[mostSelling_index]

# Two-line summary; it is printed here and kept in a variable for later cells.
most_selling_text = "\n".join([
    f"Most selling Harry Potter book: {mostSelling['original_title']}",
    f"Number of sales: {mostSelling['work_ratings_count']}",
])
print(most_selling_text)

Most selling Harry Potter book: Harry Potter and the Philosopher's Stone
Number of sales: 4800065


# Calculating the average rating of the Harry Potter books in the dataset:

In [84]:
# Mean of the per-book average_rating column over the Harry Potter entries.
average_rating_of_HarryPotter_books = HarryPotter_Books.average_rating.mean()

# One-line summary; it is printed here and kept in a variable for later cells.
average_rating_text = "Average rating of the Harry Potter Books: {}".format(
    average_rating_of_HarryPotter_books
)
print(average_rating_text)

Average rating of the Harry Potter Books: 4.52625


# Creating the plots and saving them to a PDF

In [88]:
# Write both charts into one multi-page PDF: each pdf.savefig(fig) call appends
# a page, and the file is finalised when the `with` block exits.
with PdfPages('HarryPotter_Books_Data_Analysis.pdf') as pdf:
    # Page 1: histogram of the average ratings of the Harry Potter entries.
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.hist(HarryPotter_Books['average_rating'], bins=25, color='skyblue', edgecolor='black')
    ax.set_xlabel('Average Rating')
    ax.set_ylabel('Frequency')
    ax.set_title('Histogram of Ratings for Harry Potter Books')
    # transAxes means axes-fraction coordinates, so y=1.08 sits just above the plot area.
    ax.text(0.05, 1.08, average_rating_text, size=16, transform=ax.transAxes, verticalalignment='top')
    ax.grid(True)
    # Save and close this specific figure rather than relying on pyplot's
    # implicit "current figure".
    pdf.savefig(fig)
    plt.close(fig)

    # Page 2: one bar per title, bar height = work_ratings_count.
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.bar(HarryPotter_Books['original_title'], HarryPotter_Books['work_ratings_count'])
    # Slight rotation of the title tick labels, as in the original chart.
    ax.tick_params(axis='x', labelrotation=10)
    ax.set_title('Sales for Harry Potter Books')
    # The x-axis carries book titles, so it is labelled as such (was 'Index').
    ax.set_xlabel('Book title')
    ax.set_ylabel('Number of work ratings')
    ax.text(0.05, 1.08, most_selling_text, size=16, transform=ax.transAxes, verticalalignment='top')
    ax.grid(True)
    pdf.savefig(fig)
    plt.close(fig)

print("PDF saved successfully")

PDF saved successfully
