# Popular Books DF extraction

In [1]:
import numpy as np
import pandas as pd
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
books_df = pd.read_csv(r'../artifacts/data_ingestion/books.csv')
ratings_df = pd.read_csv(r'../artifacts/data_ingestion/ratings.csv')
users_df = pd.read_csv(r'../artifacts/data_ingestion/users.csv')

In [4]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [5]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [6]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


### Which book is getting how many ratings?

In [7]:
total_ratings = ratings_df.groupby('ISBN').count()['Book-Rating'].reset_index()

In [8]:
total_ratings.head()

Unnamed: 0,ISBN,Book-Rating
0,904492401X,1
1,#069580216X,1
2,#6612432,1
3,'9607092856',1
4,'9607092910',1


### Merging rating counts of each book in books_df dataframe

In [9]:
books_with_rating = books_df.merge(total_ratings,on='ISBN', how='inner')

In [10]:
books_with_rating.shape

(270152, 7)

In [11]:
books_with_rating.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,14
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,3
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,11
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,1


### Calulating avg rating count of each book by its Title

In [12]:
avg_rating_count = books_with_rating.groupby('Book-Title')['Book-Rating'].mean().reset_index()
avg_rating_count.rename(columns={'Book-Rating':'avg_rating'},inplace=True)

In [13]:
avg_rating_count.head()

Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4.0
1,Always Have Popsicles,1.0
2,Apple Magic (The Collector's series),1.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1.0
4,Beyond IBM: Leadership Marketing and Finance ...,1.0


### Merging avg_rating_count with individual rating count dataframe

In [14]:
df = books_with_rating.merge(avg_rating_count, on='Book-Title', how='inner')

In [15]:
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Book-Rating,avg_rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,1.0
1,801319536,Classical Mythology,Mark P. O. Morford,1998,John Wiley &amp; Sons,http://images.amazon.com/images/P/0801319536.0...,1,1.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,14,14.0
3,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,3,3.0
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,11,11.0


In [16]:
df.shape

(270152, 8)

### Extracting popular books which gets high ratings

In [28]:
popular_books = df[df['Book-Rating']>=10].sort_values('avg_rating',ascending=False).head(50)

In [29]:
popular_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Book-Rating,avg_rating
41,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,2504,2504.0
850,316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,1295,1295.0
1050,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,http://images.amazon.com/images/P/0312195516.0...,724,724.0
1355,446672211,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,1998,Warner Books,http://images.amazon.com/images/P/0446672211.0...,585,585.0
8206,316601950,The Pilot's Wife : A Novel,Anita Shreve,1999,Back Bay Books,http://images.amazon.com/images/P/0316601950.0...,568,568.0


In [31]:
popular_books.shape

(50, 8)