In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Input files, given relative to the directory the notebook is run from.
BOOK_DATA = Path('.', 'RankingData', 'GoodBooks10k', 'books.csv')
BOOK_REVIEWS = Path('.', '.datasets', 'amazon_reviews_us_Books_v1_01.tsv')


# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Read the book metadata CSV, then display it ordered from the lowest to the
# highest `average_rating` (the sorted frame is only displayed, not stored).
books = pd.read_csv(BOOK_DATA)
books.sort_values(by='average_rating', ascending=True)

In [None]:
# List the columns available in the book metadata.
books.columns

In [None]:
# Load the tab-separated Amazon book reviews.
# `on_bad_lines='warn'` skips malformed rows while reporting them; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where passing it raises a TypeError).
amzn = pd.read_csv(BOOK_REVIEWS, sep='\t', on_bad_lines='warn')
amzn

In [None]:
# Coerce `star_rating` to a numeric dtype; values that cannot be parsed become
# NaN and the corresponding reviews are then removed from `amzn` in place.
amzn['star_rating'] = pd.to_numeric(amzn['star_rating'], errors='coerce')
amzn.dropna(axis=0, subset=['star_rating'], inplace=True)

In [None]:
# Keep only the books whose title matches an Amazon product title exactly.
amzn_product_titles = amzn['product_title'].unique()
title_is_on_amazon = books['title'].isin(amzn_product_titles)
matching_titles = books[title_is_on_amazon]

In [None]:
# Distribution of `average_rating` over the books matched to Amazon titles.
# Axis labels and a title are set so the figure can be read on its own.
plt.hist(matching_titles['average_rating'])
plt.xlabel('average_rating')
plt.ylabel('Number of books')
plt.title('Average rating of books matched to Amazon product titles')
plt.show()

In [None]:
# Summary statistics of `ratings_count` for the matched books.
matching_titles.ratings_count.describe()

In [None]:
# Add each book's percentile rank (0-1] by `average_rating`.
# `matching_titles` is a boolean-filtered selection of `books`; assigning a
# column onto it directly triggers pandas' SettingWithCopyWarning. `.assign`
# builds a new frame instead, and re-running this cell gives the same result.
matching_titles = matching_titles.assign(
    percentile_rank=matching_titles['average_rating'].rank(pct=True)
)
matching_titles

In [None]:
# List the columns of the matched-books frame.
matching_titles.columns

In [None]:
# Per-title mean and count of every numeric review column, restricted to the
# reviews whose product title matches one of the selected books.
# Only numeric columns are aggregated: on pandas 2.x, asking for 'mean' on text
# columns raises a TypeError (older versions silently dropped those columns).
matched_reviews = amzn[amzn['product_title'].isin(matching_titles['title'])]
numeric_review_columns = [
    column
    for column in matched_reviews.select_dtypes(include='number').columns
    if column != 'product_title'
]
amzn_df = (
    matched_reviews
    .groupby('product_title')[numeric_review_columns]
    .agg(['mean', 'count'])
)

In [None]:
def remove_column_hierarchy(tier_df):
    """Return a copy of ``tier_df`` with its index reset and flat column names.

    The index is moved into regular columns. When the resulting columns form a
    ``MultiIndex`` (e.g. after ``groupby(...).agg([...])``), the levels of each
    column are joined with ``'_'`` and leading/trailing underscores are
    stripped, so ``('star_rating', 'mean')`` becomes ``'star_rating_mean'`` and
    ``('product_title', '')`` becomes ``'product_title'``.

    Frames whose columns are already flat keep their column names unchanged
    (joining a plain string would otherwise split it into characters).

    Parameters
    ----------
    tier_df : pandas.DataFrame
        Frame to flatten; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a default integer index and single-level columns.
    """
    # reset_index() returns a new frame, so the caller's frame is untouched.
    flat_df = tier_df.reset_index()
    if isinstance(flat_df.columns, pd.MultiIndex):
        flat_df.columns = [
            # str() lets non-string levels (e.g. integers) be joined as well.
            "_".join(str(level) for level in column).strip('_')
            for column in flat_df.columns
        ]
    return flat_df

def add_target(tier_df, target_value, tier):
    """Return a copy of ``tier_df`` with ``target`` and ``tier`` columns set.

    ``target`` is filled with ``target_value`` and ``tier`` with ``tier``; the
    frame passed in is left unmodified.
    """
    labelled = tier_df.copy()
    for column_name, column_value in (('target', target_value), ('tier', tier)):
        labelled[column_name] = column_value
    return labelled

# Join the flattened per-title review aggregates with the matched book
# metadata on the title, and display the result ordered by `percentile_rank`.
(
    remove_column_hierarchy(amzn_df)
    .merge(matching_titles, left_on=['product_title'], right_on=['title'])
    .sort_values('percentile_rank')
)