In [1]:
import pandas as pd
import csv

# Books dataset cleaning pipeline.
#
# Steps: sniff the CSV delimiter, load the file, convert column names to
# snake_case, keep rows with a plausible publishing year, standardize the
# "author_rating" and "genre" labels, derive "operating_cost", compute a few
# summary statistics, and print a preview of the results.

# Load the dataset
file_path = "Books_Data.csv"

# Open the file to inspect the delimiter; only the first 500 characters are
# read, which is all the sniffer is given to work with.
with open(file_path, 'r', encoding='utf-8') as f:
    sample = f.read(500)

# Detect the delimiter (csv.Sniffer raises csv.Error if it cannot decide)
sniffer = csv.Sniffer()
delimiter = sniffer.sniff(sample).delimiter

# Read the CSV using the detected delimiter
df = pd.read_csv(file_path, delimiter=delimiter)

# Standardize column names (convert to snake_case).
# regex=False makes the literal replacement explicit instead of relying on
# the pandas-version-dependent default.
df.columns = (
    df.columns.str.strip()                    # Remove leading/trailing spaces
    .str.lower()                              # Convert to lowercase
    .str.replace(" ", "_", regex=False)       # Replace spaces with underscores
    .str.replace("-", "_", regex=False)       # Replace dashes with underscores
)

# Fix the "publishing_year" column: unparseable values become 0 and are then
# removed by the range filter below.
df['publishing_year'] = pd.to_numeric(df['publishing_year'], errors='coerce').astype('Int64')
df['publishing_year'] = df['publishing_year'].fillna(0).astype(int)
# Keep a reasonable year range (both bounds inclusive). The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[df['publishing_year'].between(1000, 2025)].copy()

# Standardize "author_rating" column.
# Labels missing from rating_map are kept unchanged rather than being turned
# into NaN, which a bare Series.map() would do.
rating_map = {'Novice': 'Beginner', 'Intermediate': 'Intermediate', 'Expert': 'Expert'}
df['author_rating'] = df['author_rating'].map(rating_map).fillna(df['author_rating'])

# Standardize "genre" column
genre_map = {
    'genre fiction': 'Fiction',
    'fiction': 'Fiction',
    'nonfiction': 'Non-Fiction',
    'children books': 'Children'
}


def _standardize_genre(raw):
    """Return the canonical genre for *raw*, defaulting to 'Fiction'.

    The lookup ignores case and surrounding whitespace so that values such
    as "Nonfiction" or " fiction " match the lowercase keys of genre_map.
    Missing or non-string values fall back to 'Fiction'.
    """
    if not isinstance(raw, str):
        return 'Fiction'
    return genre_map.get(raw.strip().lower(), 'Fiction')  # Default to Fiction if unclear


df['genre'] = df['genre'].map(_standardize_genre)

# Compute "operating_cost" (gross sales - publisher revenue)
df['operating_cost'] = df['gross_sales'] - df['publisher_revenue']

# Summary statistics for gross sales (computed on the filtered rows)
gross_sales_mean = df['gross_sales'].mean()
gross_sales_median = df['gross_sales'].median()
gross_sales_std = df['gross_sales'].std()

# Compute average book rating for each publisher
publisher_avg_rating = df.groupby('publisher')['book_average_rating'].mean().reset_index()

# Remove the "sales_rank" column (rebinding instead of mutating in place)
df = df.drop(columns=['sales_rank'])

# Extract top 10 books with ratings >= 4
top_rated_books = df[df['book_average_rating'] >= 4].nlargest(10, 'book_average_rating')

# Display cleaned dataset preview and top-rated books
print(df.head())
print(top_rated_books)
print(publisher_avg_rating)


   index  publishing_year                        book_name  \
0      0             1975                          Beowulf   
1      1             1987                 Batman: Year One   
2      2             2015                Go Set a Watchman   
3      3             2008  When You Are Engulfed in Flames   
4      4             2011         Daughter of Smoke & Bone   

                                              author language_code  \
0                             Unknown, Seamus Heaney         en-US   
1  Frank Miller, David Mazzucchelli, Richmond Lew...           eng   
2                                         Harper Lee           eng   
3                                      David Sedaris         en-US   
4                                       Laini Taylor           eng   

  author_rating  book_average_rating  book_ratings_count    genre  \
0      Beginner                 3.42            155903.0  Fiction   
1  Intermediate                 4.23            145267.0  Fiction   