In [3]:
import pandas as pd

# Load the raw table from books.csv (path is relative to the working directory).
df = pd.read_csv("books.csv")

def clean_and_preprocess(
    df: pd.DataFrame,
    *,
    language_codes=("eng", "en-US", "en-GB"),
) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:

    1. Drop fully duplicated rows, then drop every row that has a missing
       value in any column.
    2. Cast ``original_publication_year`` and ``isbn13`` to the nullable
       ``Int64`` dtype.
    3. Keep rows whose ``average_rating`` lies in the inclusive range 0..5.
    4. Keep rows whose ``language_code`` is one of *language_codes*.
    5. Renumber the index from 0.

    Args:
        df: Frame containing at least the columns
            ``original_publication_year``, ``isbn13``, ``average_rating``
            and ``language_code``.
        language_codes: Iterable of accepted ``language_code`` values.
            Defaults to the three English codes the original code used.

    Returns:
        A new DataFrame with a fresh 0-based index.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    deduped = df.drop_duplicates().dropna()

    # Build the converted columns with ``assign`` rather than item assignment:
    # writing into a frame that came out of a filter can trigger pandas'
    # chained-assignment (SettingWithCopy) warning on some versions.
    typed = deduped.assign(
        original_publication_year=deduped["original_publication_year"].astype("Int64"),
        isbn13=deduped["isbn13"].astype("Int64"),
    )

    # ``between`` is inclusive on both ends by default, i.e. 0 <= rating <= 5.
    rated = typed[typed["average_rating"].between(0, 5)]

    accepted = rated[rated["language_code"].isin(list(language_codes))]

    return accepted.reset_index(drop=True)

# Run the cleaning pipeline on the raw table loaded above.
cleaned_df = clean_and_preprocess(df)

# Rows whose title mentions "Harry Potter", matched case-insensitively.
harry_potter_books = cleaned_df.loc[
    cleaned_df['title'].str.contains('Harry Potter', case=False)
]

# Order by number of ratings, highest first. Note that ``ratings_count`` is
# what drives the "most selling" ranking printed below.
most_selling_hp_books = harry_potter_books.sort_values(
    'ratings_count',
    ascending=False,
)

print("Most Selling Harry Potter Books:")
for i, title in enumerate(most_selling_hp_books['title'], 1):
    print(f"{i} {title}")

# Unweighted mean of the per-book average ratings.
average_rating_hp = harry_potter_books['average_rating'].mean()
print(f"\nAverage rating of Harry Potter books: {average_rating_hp}")

# Persist the cleaned table; the index is left out of the file.
cleaned_df.to_csv("cleaned.csv", index=False)


Most Selling Harry Potter Books:
1 Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
2 Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
3 Harry Potter and the Chamber of Secrets (Harry Potter, #2)
4 Harry Potter and the Goblet of Fire (Harry Potter, #4)
5 Harry Potter and the Deathly Hallows (Harry Potter, #7)
6 Harry Potter and the Order of the Phoenix (Harry Potter, #5)
7 Harry Potter and the Half-Blood Prince (Harry Potter, #6)
8 Harry Potter Boxset (Harry Potter, #1-7)
9 Harry Potter Collection (Harry Potter, #1-6)
10 The Magical Worlds of Harry Potter: A Treasury of Myths, Legends, and Fascinating Facts

Average rating of Harry Potter books: 4.4910000000000005
