In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sklearn

In [2]:
# Load the three raw tables. A comma separator and a first-row header are the
# read_csv defaults, so only the non-default option is spelled out.
# NOTE(review): the saved output of this cell is an EmptyDataError ("No columns
# to parse from file") — confirm that all three files exist and are non-empty.
books_big = pd.read_csv('data/books2.csv', low_memory=False)
ratings = pd.read_csv('data/ratings.csv')
users = pd.read_csv('data/users.csv')

EmptyDataError: No columns to parse from file

In [None]:
# Preview the first rows of the books table as loaded.
books_big.head()

In [None]:
# Normalise the books column names: lower-case, hyphens replaced by underscores.
books_big.columns = [name.lower().replace('-', '_') for name in books_big.columns]
books_big.head()

In [None]:
# Normalise the users column names: lower-case, hyphens replaced by underscores.
users.columns = [name.lower().replace('-', '_') for name in users.columns]
users.head()

In [None]:
# Normalise the ratings column names: lower-case, hyphens replaced by underscores.
ratings.columns = [name.lower().replace('-', '_') for name in ratings.columns]
ratings.head()

In [None]:
# Count how often each rating value occurs.
ratings['book_rating'].value_counts()

In [None]:
# Print the (rows, columns) shape of each table: books, users, ratings.
for table in (books_big, users, ratings):
    print(table.shape)

In [None]:
# Number of missing values per column in the books table.
books_big.isnull().sum()

In [None]:
# Number of fully duplicated rows in the books table.
books_big.duplicated().sum()

In [None]:
# Number of missing values per column in the users table.
users.isnull().sum()

In [None]:
# How much data would be lost if every user without an age were dropped?
missing_age_rows = users['age'].isna().sum()  # users whose 'age' is missing
total_rows = len(users)                       # all users

# Share of users with a missing age, as a percentage.
percentage_missing = missing_age_rows / total_rows * 100

print(f"Percentage of data that would be excluded: {percentage_missing:.2f}%")

# Too large a share of users has no age, so those rows are NOT dropped.

In [None]:
# Drop the 'age' column instead of the rows that lack an age.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once 'age' is gone no longer raises a KeyError.
users = users.drop(columns='age', errors='ignore')
users.head()

In [None]:
# Shape of the users table at this point in the notebook.
users.shape

In [None]:
# Number of fully duplicated rows in the users table.
users.duplicated().sum()

In [None]:
# Number of missing values per column in the ratings table.
ratings.isnull().sum()

In [None]:
# Number of fully duplicated rows in the ratings table.
ratings.duplicated().sum()

#### No author can be found for these two books: https://www.amazon.co.uk/Quiz-Masters-Earth-None/dp/0751352497, https://www.amazon.co.uk/Credit-Suisse-Managing-Personal-Wealth/dp/9627982032/ref=sr_1_1?crid=1ZD3GAL212RZG&dib=eyJ2IjoiMSJ9.UgZXUfNuNIEGkJXoAHTOAw.rcIwc5TfpSJLdjSRThEQIELLHu0EXsupGqO51xk30Hg&dib_tag=se&keywords=9627982032&qid=1724760906&s=books&sprefix=9627982032%2Cstripbooks%2C92&sr=1-1

In [None]:
# Rows without an author, shown as the cell's value so the notebook renders a
# rich table instead of the plain-text print() dump.
books_big[books_big['book_author'].isnull()]

#### Replacing missing publishers: both "Tyrant Moon" and "Finders Keepers" were published by NovelBooks

In [None]:
# Rows without a publisher, shown as the cell's value so the notebook renders a
# rich table instead of the plain-text print() dump.
books_big[books_big['publisher'].isnull()]

In [None]:
# ISBNs whose publisher is overwritten, and the publisher name to write.
# (Presumably the two books named in the heading above this cell — verify.)
isbn_to_replace = ['193169656X', '1931696993']
replacement_publisher = 'NovelBooks'

# Build the row mask once; it is used for both the assignment and the check.
has_target_isbn = books_big['isbn'].isin(isbn_to_replace)

# Overwrite the publisher for every row carrying one of those ISBNs.
books_big.loc[has_target_isbn, 'publisher'] = replacement_publisher

# Show the patched rows (rich table rather than print()) to confirm the change.
books_big[has_target_isbn]

In [None]:
# Number of authors to show. One value drives both the selection and the
# title, so the two cannot drift apart (the old comment said 25, the code 50).
top_n_authors = 50

# value_counts() sorts by count in descending order, so head() keeps the
# authors with the most books.
author_counts = books_big['book_author'].value_counts().head(top_n_authors)

# Horizontal bar chart of books per author.
author_counts.plot(kind='barh', figsize=(12, 10))
plt.xlabel('Number of Books')
plt.ylabel('Book Author')
plt.title(f'Top {top_n_authors} Authors with the Highest Number of Books')
plt.show()

In [None]:
# Number of publishers to show. One value drives both the selection and the
# title (the old comment wrongly said "top 25 authors").
top_n_publishers = 50

# value_counts() sorts by count in descending order, so head() keeps the
# publishers with the most books.
publication_counts = books_big['publisher'].value_counts().head(top_n_publishers)

# Horizontal bar chart of books per publisher.
publication_counts.plot(kind='barh', figsize=(12, 10))
plt.xlabel('Number of Books')
plt.ylabel('Publisher')
plt.title(f'Top {top_n_publishers} Publishers with the Highest Number of Books')
plt.show()

In [None]:
# Publications per year: summary statistics for the books whose year is '0'.
# NOTE(review): the comparison is against the string '0'; if the column were
# parsed as integers it would select nothing — confirm the dtype.
year_is_zero = books_big['year_of_publication'] == '0'
books_big.loc[year_is_zero].describe()

In [None]:
# List every distinct value found in the publication-year column.
books_big['year_of_publication'].unique()

In [None]:
# Publication-year values singled out for inspection: '0', two non-numeric
# strings, and years that look implausibly early or lie in the future.
# NOTE(review): the name has a typo ("ckeck") but the next cell references it,
# so it is kept unchanged here.
years_to_ckeck = ['0','DK Publishing Inc', 'Gallimard', '1378', '1919', '1922', '1897', '2024', '1376', '2037']   

In [None]:
# Show the rows whose publication year is one of the values listed for checking.
books_big.loc[books_big['year_of_publication'].isin(years_to_ckeck)]

In [None]:
# Bar chart of the number of books per publication-year value.
# NOTE(review): value_counts() orders the bars by frequency, not by year;
# sorting by index would give a chronological axis once the non-year values
# in this column are cleaned up.
books_big['year_of_publication'].value_counts().plot(kind='bar', figsize=(12, 16))

# Title and axis labels so the figure can be read on its own, like the other
# plots in this notebook.
plt.title('Number of Books per Year of Publication')
plt.xlabel('Year of Publication')
plt.ylabel('Number of Books')
plt.show()

In [None]:
# TODO: decide how to handle the books whose year_of_publication is '0'.

In [None]:
# Average book ratings

# Attach the book metadata to every rating (inner join on ISBN) and drop the
# image-URL columns, which are not needed here. Chaining .drop() avoids
# mutating the merged frame in place.
bookRating = pd.merge(ratings, books_big, on="isbn").drop(
    columns=['image_url_s', 'image_url_m', 'image_url_l']
)

# Mean rating per ISBN, rounded to one decimal place.
# NOTE(review): every 'book_rating' value is averaged as-is; if 0 encodes
# "no explicit rating" in this dataset, those rows pull the averages down —
# confirm before relying on these numbers.
averageRating = (
    bookRating.groupby('isbn')['book_rating']
    .mean()
    .round(1)
    .reset_index()
    .rename(columns={'book_rating': 'average_rating'})
)

# Every rating row together with the average rating of its book.
averageRatingdf = pd.merge(bookRating, averageRating, on='isbn')

# One (isbn, average_rating) row per book.
averageRatingUnique = averageRatingdf[['isbn', 'average_rating']].drop_duplicates(subset=['isbn'])

# Books that have at least one rating, with their average rating attached.
ratingBooks = pd.merge(books_big, averageRatingUnique, on='isbn', how='inner')

# Same content as ratingBooks (merge defaults to an inner join), so reuse the
# result instead of merging twice. The copy keeps the two frames independent.
books_with_rating = ratingBooks.copy()

In [None]:
# Preview the books table with the average rating attached.
books_with_rating.head()

In [None]:
# Summary statistics for the books whose average rating is exactly 10.0.
books_with_rating[books_with_rating['average_rating']==10.0].describe()

In [None]:
# Bar chart of how many books share each average-rating value
# (bars appear in value_counts() order, i.e. by frequency).
rating_frequency = books_with_rating['average_rating'].value_counts()
rating_frequency.plot.bar(figsize=(12, 16))

In [None]:
# Distribution of average ratings, ordered by rating value rather than by
# frequency. `plt` comes from the notebook's import cell, so no per-cell
# import is needed here.
rating_counts = books_with_rating['average_rating'].value_counts().sort_index()

# Wide figure with near-touching bars so the many rating values stay readable.
rating_counts.plot(
    kind='bar',
    figsize=(16, 8),
    width=0.99,
    color='skyblue',
)

# Title and axis labels.
plt.title('Distribution of Average Book Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')

# Rotate the x tick labels to vertical; the tick order comes from sort_index().
plt.xticks(rotation=90)

plt.show()

In [None]:
# Horizontal bar chart of the 50 rows with the highest average rating.
# NOTE(review): the ranking ignores how many ratings a book received, so a
# title with a single top rating ranks alongside widely rated ones — consider
# requiring a minimum number of ratings.
top_rated = books_with_rating.sort_values('average_rating', ascending=False).head(50)
top_rated.plot(kind='barh', x='book_title', y='average_rating', figsize=(12, 10))

In [None]:
# Load the second books file; separator and header row are the read_csv defaults.
small = pd.read_csv('data/books.csv', low_memory=False)

In [None]:
# Preview the first rows of the table loaded from data/books.csv.
small.head()

In [None]:
# (rows, columns) of the table loaded from data/books.csv.
small.shape