In [None]:
# Dataset: "dylanjcastillo/7k-books-with-metadata", fetched through kagglehub.
import kagglehub

# Download the latest version of the dataset; `path` is used below to locate books.csv.
path = kagglehub.dataset_download("dylanjcastillo/7k-books-with-metadata")

# Print the download location so it can be checked by hand.
print("Path to dataset files:", path)

In [None]:
#use pandas since we are working with data
import pandas as pd

In [None]:
# Load books.csv from the downloaded dataset location into a DataFrame for the analysis below.
books = pd.read_csv(f"{path}/books.csv")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Visualise where values are missing to see whether the gaps follow a pattern.
# The boolean frame is transposed before plotting, so each heatmap row is one
# dataset column and each heatmap column is one book (one row of `books`).
fig, ax = plt.subplots()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)

# Axis labels follow the transposed layout: x = books, y = dataset columns.
ax.set_xlabel("Books (row index)")
ax.set_ylabel("Columns")
ax.set_title("Missing values")

# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
import numpy as np

# Indicator column: 1 when the description is missing, 0 when it is present.
books["missing_description"] = np.where(books["description"].notna(), 0, 1)
# Age of the book relative to 2024, i.e. 2024 - published_year.
books["age_of_book"] = books["published_year"].rsub(2024)

In [None]:
# Columns compared against each other, including the missing-description flag.
columns_of_interest = [
    "num_pages",
    "age_of_book",
    "missing_description",
    "average_rating",
]

# Pairwise Spearman correlation between the selected columns.
correlation_matrix = books.loc[:, columns_of_interest].corr(method="spearman")

# Draw the matrix as an annotated heatmap.
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Spearman correlation"},
)
heatmap.set_title("Correlation heatmap")
plt.show()

In [None]:
# Keep only the books that have a description, a page count, an average rating
# and a publication year. Despite its name, `book_missing` holds the rows in
# which NONE of these four values is missing.
# The explicit .copy() makes the result an independent frame, so adding columns
# to it afterwards does not raise pandas' SettingWithCopyWarning.
book_missing = books.dropna(
    subset=["description", "num_pages", "average_rating", "published_year"]
).copy()

In [None]:
# Display the filtered frame (books with description, num_pages, average_rating and published_year all present).
book_missing

In [None]:
# Most common categories, most frequent first.
# The index and the count column are named explicitly so the result has the
# columns "categories" and "count" on pandas 1.x as well as 2.x. Before pandas
# 2.0, value_counts().reset_index() produced "index"/"categories" instead, and
# sorting by "count" raised a KeyError.
(
    book_missing["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [None]:
# Preparation for removing descriptions that are too short to be useful.

# Word count per description: split on whitespace, then take the number of pieces.
book_missing["words_in_description"] = (
    book_missing["description"]
    .str.split()
    .str.len()
)

In [None]:
# Inspect the descriptions whose word count lies between 25 and 45 (both ends included).
book_missing["description"].loc[book_missing["words_in_description"].between(25, 45)]

In [None]:
# Keep only the books whose description has at least 25 words.
# NOTE: the variable name says "20_words", but the threshold applied here is 25.
# .copy() makes this an independent frame, so the columns added to it later do
# not trigger pandas' SettingWithCopyWarning.
book_missing_20_words = book_missing[book_missing["words_in_description"] >= 25].copy()

In [None]:
# Combined display title: "title: subtitle" when a subtitle exists, otherwise the title alone.
book_missing_20_words["title_and_subtitle"] = np.where(
    book_missing_20_words["subtitle"].notna(),
    # Both parts are cast to str before being joined with ": ".
    book_missing_20_words["title"].astype(str).str.cat(
        book_missing_20_words["subtitle"].astype(str), sep=": "
    ),
    book_missing_20_words["title"],
)

In [None]:
# Prefix every description with its isbn13, separated by one space.
book_missing_20_words["tagged_description"] = (
    book_missing_20_words["isbn13"]
    .astype(str)
    .str.cat(book_missing_20_words["description"].astype(str), sep=" ")
)

In [None]:
# Display the frame, now including the title_and_subtitle and tagged_description columns.
book_missing_20_words

In [None]:
# Write the cleaned books to books_cleaned.csv, leaving out the helper columns
# that were only needed during cleaning. The row index is not written.
book_missing_20_words.drop(
    columns=["subtitle", "missing_description", "age_of_book", "words_in_description"]
).to_csv("books_cleaned.csv", index=False)