In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split 

# Source: Amazon customer-reviews dataset; the archive index is at
# https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
# NOTE(review): the file read below is the Digital_Software_v1_00 TSV, whereas the archive
# referenced alongside the index is the Books one
# (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
# -- confirm which category is intended.
df = pd.read_csv(
    'amazon_reviews_us_Digital_Software_v1_00.tsv',
    sep='\t',                      # tab-separated file
    dtype={'star_rating': float},  # force the rating column to float64
)

# Label the 15 columns explicitly; the names are assigned positionally.
df.columns = [
    'marketplace',
    'customer_id',
    'review_id',
    'product_id',
    'product_parent',
    'product_title',
    'product_category',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
    'review_headline',
    'review_body',
    'review_date',
]

# Quick look at the rating column (pandas truncates the printed Series).
print(df['star_rating'])

0         4.0
1         3.0
2         1.0
3         5.0
4         4.0
         ... 
101831    2.0
101832    4.0
101833    2.0
101834    3.0
101835    1.0
Name: star_rating, Length: 101836, dtype: float64


In [65]:
def apply_pivot(df, fillby=None):
    """Pivot a long reviews frame into a customer-by-product rating matrix.

    Rows are indexed by ``customer_id``, columns by ``product_id`` and the cells hold
    ``star_rating``. ``pivot_table`` is called with its default aggregation (the mean),
    so a customer/product pair that occurs several times is averaged.

    Parameters
    ----------
    df : DataFrame
        Must provide ``customer_id``, ``product_id`` and ``star_rating`` columns.
    fillby : scalar, optional
        Replacement for the missing cells of the matrix. When ``None`` (the default)
        the missing cells are left as NaN.

    Returns
    -------
    DataFrame
        The pivoted matrix, with missing cells filled when ``fillby`` is given.
    """
    ratings_matrix = df.pivot_table(
        values='star_rating',
        index='customer_id',
        columns='product_id',
    )
    if fillby is None:
        return ratings_matrix
    return ratings_matrix.fillna(fillby)

# Train-test split (possibly the oldest reviews should go into the training set so that
# the model predicts the future instead of a random hold-out).
train, test = train_test_split(df, test_size=0.30, random_state=42)

# Keep only the test rows whose customer also occurs in the training split.
test = test.loc[test['customer_id'].isin(train['customer_id'])]

# Check for duplicate customer_id - product_id pairs
#print(train.duplicated(['customer_id', 'product_id']).sum())

# Eliminate duplicates: one row per (customer, product) pair is kept in the training split.
train = train.drop_duplicates(subset=['customer_id', 'product_id'])

# Rating matrices; cells without a rating are filled with 0.
df_train_pivot = apply_pivot(train, fillby=0)
df_test_pivot = apply_pivot(test, fillby=0)

# To check whether there are non-NaN values
#print(df_train_pivot.count())
#print(df_train_pivot.isna().sum())
#print(df_train_pivot)

# Train mask
# Non-numeric 'star_rating' values are coerced to NaN first (possibly they could be replaced
# by the mean or similar instead of ending up as "unrated").
# The mask is used to exclude products already rated by the user: a rating >= 1 becomes 0,
# anything else (including NaN) becomes 1, and pairs absent from the split are filled with 1.
dummy_train = train.assign(
    star_rating=(~pd.to_numeric(train['star_rating'], errors='coerce').ge(1)).astype('int64')
).pipe(apply_pivot, fillby=1)

# Test mask
# Opposite encoding: a rating >= 1 becomes 1, anything else becomes 0, and pairs absent from
# the split are filled with 0.
dummy_test = test.assign(
    star_rating=test['star_rating'].ge(1).astype('int64')
).pipe(apply_pivot, fillby=0)

print(dummy_train.describe())


product_id    B000JLNHO6    B000YMNI2Q    B000YMNI76    B000YMR5X4  \
count       67154.000000  67154.000000  67154.000000  67154.000000   
mean            0.999985      0.999955      0.999985      0.999926   
std             0.003859      0.006684      0.003859      0.008629   
min             0.000000      0.000000      0.000000      0.000000   
25%             1.000000      1.000000      1.000000      1.000000   
50%             1.000000      1.000000      1.000000      1.000000   
75%             1.000000      1.000000      1.000000      1.000000   
max             1.000000      1.000000      1.000000      1.000000   

product_id    B000YMR61A    B000YMR6AG    B000YMRM8W    B00194DS1Y  \
count       67154.000000  67154.000000  67154.000000  67154.000000   
mean            0.999687      0.999970      0.999970      0.999985   
std             0.017681      0.005457      0.005457      0.003859   
min             0.000000      0.000000      0.000000      0.000000   
25%             1.0