In [None]:
import numpy as np
import pandas as pd
# Load the three CSV files from the working directory.
# ISBN is read as text so leading zeros and a trailing 'X' check digit survive
# and the column can never fall back to a mixed int/str dtype, which would
# make the ISBN join between ratings and books miss rows.
books = pd.read_csv('Books.csv', dtype={'ISBN': str})
users = pd.read_csv('Users.csv')
ratings = pd.read_csv('Ratings.csv', dtype={'ISBN': str})

In [None]:
# Keep a single-column frame holding only the titles, then preview the
# first rows of the full books frame.
book_list = books.loc[:, ['Book-Title']]
books.head(n=5)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [None]:
# Preview the first five rows of the users frame.
users.head(n=5)


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [None]:
# Preview the first five rows of the ratings frame.
ratings.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [None]:
# Report the (rows, columns) shape of each loaded frame, one per line,
# in the order books, users, ratings.
for frame in (books, users, ratings):
    print(frame.shape)

(43354, 8)
(278858, 3)
(596611, 3)


In [None]:
# Count the missing values in each column of the books frame.
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64

In [None]:
# Count the missing values in each column of the users frame.
users.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [None]:
# Count the missing values in each column of the ratings frame.
ratings.isna().sum()

User-ID        0
ISBN           0
Book-Rating    1
dtype: int64

In [None]:
# Number of fully duplicated rows in the books frame; the first occurrence
# of each repeated row is not counted as a duplicate.
books.duplicated(keep='first').sum()

0

In [None]:
# Number of fully duplicated rows in the ratings frame.
ratings.duplicated(keep='first').sum()

0

In [None]:
# Number of fully duplicated rows in the users frame.
users.duplicated(keep='first').sum()

0

In [None]:
# Attach the book columns to every rating with an inner join on the shared
# ISBN key; ratings whose ISBN is absent from the books frame are dropped.
ratings_with_name = pd.merge(ratings, books, how='inner', on='ISBN')

In [None]:
# Number of ratings per book title. Only 'Book-Rating' is counted (count
# skips missing ratings) instead of counting every column and then
# discarding all but one; the result column is named directly, so no
# in-place rename is needed.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num_ratings')
)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Beyond IBM: Leadership Marketing and Finance ...,1
2,Final Fantasy Anthology: Official Strategy Gu...,4
3,It Takes Two,2
4,"Q-Space (Star Trek The Next Generation, Book 47)",7
...,...,...
40105,why I'm like this : True Stories,9
40106,Â¿QuiÃ©n se ha llevado mi queso?,1
40107,"Â¿QuÃ© me quieres, amor?",5
40108,Ã?Â?ber die Freiheit.,1


In [None]:
# Clean the 'Book-Rating' column of the merged frame and derive the mean
# rating per book title.
print("Data types before conversion:\n", ratings_with_name.dtypes)

# Parse the rating column once; any value that cannot be read as a number
# becomes NaN.
rating_numeric = pd.to_numeric(ratings_with_name['Book-Rating'], errors='coerce')

# Rows whose rating is NaN after coercion (unparseable, or already missing).
non_numeric_ratings = ratings_with_name[rating_numeric.isna()]
print("Non-numeric values in 'Book-Rating':\n", non_numeric_ratings)

# Store the ratings as float and drop the rows without a usable rating.
# The name is rebound to a new frame rather than mutated with inplace=True,
# so re-running this cell gives the same result.
ratings_with_name = (
    ratings_with_name
    .assign(**{'Book-Rating': rating_numeric.astype(float)})
    .dropna(subset=['Book-Rating'])
)

print("Data types after conversion:\n", ratings_with_name.dtypes)

# Mean rating per title.
# NOTE(review): ratings of 0 are included in the mean. If 0 marks "no
# explicit rating" rather than a real score (confirm against the dataset
# documentation), these averages are biased low.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_ratings')
)

# Display the resulting DataFrame
print(avg_rating_df)

Data types before conversion:
 User-ID                  int64
ISBN                    object
Book-Rating            float64
Book-Title              object
Book-Author             object
Year-Of-Publication      int64
Publisher               object
Image-URL-S             object
Image-URL-M             object
Image-URL-L             object
dtype: object
Non-numeric values in 'Book-Rating':
 Empty DataFrame
Columns: [User-ID, ISBN, Book-Rating, Book-Title, Book-Author, Year-Of-Publication, Publisher, Image-URL-S, Image-URL-M, Image-URL-L]
Index: []
Data types after conversion:
 User-ID                  int64
ISBN                    object
Book-Rating            float64
Book-Title              object
Book-Author             object
Year-Of-Publication      int64
Publisher               object
Image-URL-S             object
Image-URL-M             object
Image-URL-L             object
dtype: object
                                              Book-Title  avg_ratings
0       A Light in the 

In [None]:
# Combine the per-title rating count and the per-title mean rating into a
# single frame, matching rows on the book title.
popularity_df = pd.merge(num_rating_df, avg_rating_df, how='inner', on='Book-Title')
popularity_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
2,Final Fantasy Anthology: Official Strategy Gu...,4,5.000000
3,It Takes Two,2,0.000000
4,"Q-Space (Star Trek The Next Generation, Book 47)",7,0.000000
...,...,...,...
40105,why I'm like this : True Stories,9,4.555556
40106,Â¿QuiÃ©n se ha llevado mi queso?,1,7.000000
40107,"Â¿QuÃ© me quieres, amor?",5,1.200000
40108,Ã?Â?ber die Freiheit.,1,7.000000


In [None]:
# Show only the books with at least MIN_NUM_RATINGS ratings (the comparison
# is inclusive), ordered from highest to lowest average rating.
# The result is displayed only; popularity_df itself is left unfiltered.
MIN_NUM_RATINGS = 250
popularity_df[popularity_df['num_ratings'] >= MIN_NUM_RATINGS].sort_values('avg_ratings', ascending=False)

Unnamed: 0,Book-Title,num_ratings,avg_ratings
13119,Harry Potter and the Chamber of Secrets (Book 2),301,5.039867
13130,Harry Potter and the Sorcerer's Stone (Harry P...,295,4.986441
36772,To Kill a Mockingbird,267,4.734082
34548,The Secret Life of Bees,414,4.601449
30239,The Da Vinci Code,473,4.511628
32685,The Lovely Bones: A Novel,668,4.407186
38943,Where the Heart Is (Oprah's Book Club (Paperba...,316,4.39557
12482,Good in Bed,264,4.333333
34136,The Red Tent (Bestselling Backlist),374,4.312834
12239,Girl with a Pearl Earring,275,4.203636
