# Data Preparation

In [88]:
import flask
import pandas as pd
import numpy as np
import gunicorn
import requests

In [89]:
# Load the books dataset; the path is resolved relative to the working directory.
book = pd.read_csv(filepath_or_buffer='data/books_c.csv')
# Preview the first five rows to sanity-check the parsed columns.
book.head(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.55,043965548X,9780439655484,eng,435,2149872,33964


In [90]:
# (row count, column count) of the loaded dataset.
book.shape

(13714, 10)

In [91]:
# Number of missing values in each column.
book.isna().sum()

bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
# num_pages           0
ratings_count         0
text_reviews_count    0
dtype: int64

In [92]:
# Column dtypes, non-null counts and memory footprint of the dataset.
book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13714 entries, 0 to 13713
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              13714 non-null  int64  
 1   title               13714 non-null  object 
 2   authors             13714 non-null  object 
 3   average_rating      13714 non-null  float64
 4   isbn                13714 non-null  object 
 5   isbn13              13714 non-null  int64  
 6   language_code       13714 non-null  object 
 7   # num_pages         13714 non-null  int64  
 8   ratings_count       13714 non-null  int64  
 9   text_reviews_count  13714 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 1.0+ MB


## Method List

### List Author
Menampilkan daftar author unik yang terdapat di `books_c.csv`.

In [93]:
# Distinct values of the `authors` column, in order of first appearance.
author_u = book['authors'].unique()
# One-column frame of those values for tabular display.
author_df = pd.DataFrame({'author': author_u})
author_df

Unnamed: 0,author
0,J.K. Rowling
1,W. Frederick Zimmerman
2,Douglas Adams
3,Douglas Adams-Stephen Fry
4,Bill Bryson-William Roberts
...,...
7594,Alisa Kwitney-Neil Gaiman
7595,Neil Gaiman-Teddy Kristiansen
7596,Neil Gaiman-Michael Reaves
7597,Brian Froud-Jessica Macbeth


In [94]:
# Number of non-null entries in each column of `author_df`.
author_df.count()

author    7599
dtype: int64

### Most popular author
10 author terpopuler, diurutkan berdasarkan total jumlah rating (`ratings_count`) dari user.

In [100]:
# Total `ratings_count` per `authors` value, ordered from most to least rated.
popular_author = (
    book.groupby('authors')['ratings_count']
    .sum()
    .reset_index()
    .sort_values('ratings_count', ascending=False)
)
# Names only (the summed counts are left out) for the top-10 listing.
pop_author = popular_author[['authors']]
pop_author.head(10)

Unnamed: 0,authors
3018,J.K. Rowling
3039,J.R.R. Tolkien
1276,Dan Brown
6695,Stephenie Meyer
6628,Stephen King
5289,Nicholas Sparks
3009,J.D. Salinger
3708,John Steinbeck
5996,Rick Riordan
6014,Roald Dahl-Quentin Blake


### Average Author Score

Menghitung rata-rata `average_rating` dari seluruh buku untuk tiap author.

In [97]:
# Mean of `average_rating` over all rows that share the same `authors` value.
avg_author = book.groupby('authors', as_index=False)['average_rating'].mean()
avg_author

Unnamed: 0,authors,average_rating
0,A.B. Yehoshua-Hillel Halkin,3.585
1,A.D.P. Briggs-Leo Tolstoy-Fyodor Dostoyevsky,3.760
2,A.E. Cunningham-Harlan Ellison-Charles F. Mill...,4.150
3,A.J. Jacobs,3.770
4,A.M. Homes,3.455
...,...,...
7594,Émile Zola-Henri Mitterand,4.050
7595,Émile Zola-Robert Lethbridge-Elinor Dorday,3.990
7596,Émile Zola-Robin Buss-Brian Nelson,3.990
7597,Émile Zola-Roger Pearson,4.040


### Most Popular Author Mean Score
Skor rata-rata 10 author teratas berdasarkan jumlah rating terbanyak.

In [98]:
# Attach each author's mean rating to the popularity table. A left join keeps
# every row of `popular_author` in its existing (most-rated first) order.
top10_score = pd.merge(popular_author, avg_author, how='left', on='authors')
top10_score.head(10)

Unnamed: 0,authors,ratings_count,average_rating
0,J.K. Rowling,13954740,4.517826
1,J.R.R. Tolkien,5044183,4.400645
2,Dan Brown,4416542,3.796
3,Stephenie Meyer,4369733,3.59
4,Stephen King,4342510,4.006364
5,Nicholas Sparks,3149166,3.994737
6,J.D. Salinger,2653050,4.013333
7,John Steinbeck,2353640,4.024375
8,Rick Riordan,2255429,4.262
9,Roald Dahl-Quentin Blake,2210694,4.075


### Most popular book - RANKED
Judul buku yang paling populer dengan melihat jumlah rating dan skornya

In [146]:
# Rank every (title, authors, ratings_count) combination by popularity:
# most ratings first, ties broken by the higher average rating.
# Rows that share all three keys are averaged (not summed), so the
# `average_rating` column remains an average.
popular_book = (
    book.groupby(['title', 'authors', 'ratings_count'])['average_rating']
    .mean()
    .reset_index()
    .sort_values(by=['ratings_count', 'average_rating'], ascending=False)
)

# Renumber the rows 0..n-1 in sorted order, then expose the 1-based position
# as the leading 'rank' column. Ranks are stored as strings because the
# following cells filter with `book_ranked['rank'] == '1'`.
book_ranked = popular_book.reset_index(drop=True)
book_ranked.insert(0, 'rank', [str(pos) for pos in range(1, len(book_ranked) + 1)])

In [151]:
# Boolean selector for the rows whose 'rank' equals the string '1'.
mask = book_ranked['rank'].eq('1')

In [154]:
# Select the top-ranked book. The condition is computed inline instead of
# reusing the notebook-level `mask`, which a later cell reassigns for the
# author ranking; this keeps the cell correct when it is re-run out of order.
book_rank = book_ranked[book_ranked['rank'] == '1']

In [155]:
# Display the selected top-ranked book.
book_rank

Unnamed: 0,rank,title,authors,ratings_count,average_rating
0,1,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,5629932,4.47


In [153]:
# Column dtypes and non-null counts of the selection ('rank' is stored as object/str).
book_rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            1 non-null      object 
 1   title           1 non-null      object 
 2   authors         1 non-null      object 
 3   ratings_count   1 non-null      int64  
 4   average_rating  1 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 48.0+ bytes


### Most Popular Author - RANKED

In [165]:
    # Total ratings per author (most-rated first) and each author's mean score,
    # combined with a left join so the popularity order is kept.
    popular_author = (
        book.groupby('authors')['ratings_count']
        .sum()
        .reset_index()
        .sort_values('ratings_count', ascending=False)
    )
    avg_author = book.groupby('authors')['average_rating'].mean().reset_index()
    top10_score = pd.merge(popular_author, avg_author, how='left', on='authors')

    # Number the rows 0..n-1 in popularity order; the 1-based position is the
    # rank, stored as a string in the first column.
    author_ranked = top10_score.reset_index(drop=True)
    author_ranked.insert(0, 'rank', [str(pos) for pos in range(1, len(author_ranked) + 1)])


In [166]:
# Boolean selector for the rows whose 'rank' equals the string '1'.
mask = author_ranked['rank'].eq('1')

In [168]:
# Keep only the top-ranked author. The condition is derived from the frame's
# current contents rather than the separately stored `mask`, so re-running this
# cell never applies a selector that was built for a differently sized frame.
# NOTE: this still replaces the full ranking held in `author_ranked`; re-run the
# ranking cell to restore it.
author_ranked = author_ranked[author_ranked['rank'] == '1']

In [169]:
# Display the selected top-ranked author.
author_ranked

Unnamed: 0,rank,authors,ratings_count,average_rating
0,1,J.K. Rowling,13954740,4.517826


In [170]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
