### About

This file is attempting to find the best way to fairly reduce the dataset so that it can be represented as a matrix in memory

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import cv2
%matplotlib inline

In [79]:
# takes a dataframe with beer_id as index and user_id as a label,
# returns a new dataframe grouped by beer_id with num_ratings column.
def get_beer_ratings_count(df):
    beer_group = df.groupby('beer_id', sort=True)
    # get the number of reviews per beer, make it a dataframe
    beer_num_reviews_df = beer_group.count().drop(columns=['user_score']).rename(columns={'user_id': 'num_reviews'})
    # sort beers by number of reviews
    beer_num_reviews_df = beer_num_reviews_df.sort_values(by=['num_reviews'])
    
    return beer_num_reviews_df.reset_index()   

In [21]:
# has beer names
df_orig = pd.read_csv('./Beer_Data/reduced_numeric_data.csv')

In [22]:
df_orig

Unnamed: 0,beer_id,user_score,user_id
0,18580,3.75,1
1,18570,4.25,1
2,18581,4.25,1
3,4200,4.25,1
4,1,4.50,1
...,...,...,...
4965676,19337,3.50,102604
4965677,19332,3.50,102604
4965678,19347,3.82,102605
4965679,20332,4.17,102605


In [23]:
# number of beers
len(set(df_orig['beer_id'].tolist()))

9999

In [35]:
# group data by users
user_group = df_orig.groupby('user_id')

In [36]:
# build a dataframe that shows the number of reviews each user has
user_num_reviews_df = user_group.count().drop(columns=['user_score']).rename(columns={'beer_id': 'num_reviews'})

In [37]:
# sort data by number of reviews
user_num_reviews_df = user_num_reviews_df.reset_index().sort_values(by=['num_reviews'])

In [49]:
# reset the index count for better looking data
user_num_reviews_df = user_num_reviews_df.reset_index(drop=True)

In [68]:
# 52707 users out of 101,574 users have more than 6 reviews (we will drop the rest)
users_2_keep = user_num_reviews_df[user_num_reviews_df['num_reviews'] > 5]['user_id'].tolist()
len(users_2_keep)

52707

In [69]:
# drop users with less than 6 reviews
df_reduced = df_orig[df_orig['user_id'].isin(users_2_keep)]

In [75]:
# there are still 4.8 million reviews, but they share a lot more users/beers
df_reduced

Unnamed: 0,beer_id,user_score,user_id
0,18580,3.75,1
1,18570,4.25,1
2,18581,4.25,1
3,4200,4.25,1
4,1,4.50,1
...,...,...,...
4964440,3583,4.25,101906
4964441,14654,4.00,101906
4964442,1106,3.40,101906
4964443,11819,4.00,101906


In [125]:
# write newly reduced dataframe to file
df_reduced.to_csv('./Beer_Data/reduced_data_X2.csv', index=False)

In [118]:
# get updated review count per beer, and import review count per beer before users were removed
reviews_per_beer_reducedX2 = get_beer_ratings_count(df_reduced)
reviews_per_beer_reduced = pd.read_csv('./Beer_Data/reduced_reviews_per_beer.csv')

In [121]:
# add old count to most-reduced df
reviews_per_beer_reducedX2['old_num_reviews'] = reviews_per_beer_reduced['num_reviews']
reviews_per_beer_reducedX2

Unnamed: 0,beer_id,num_reviews,old_num_reviews
0,5659,63,72
1,3078,64,72
2,18574,66,72
3,5367,66,72
4,17661,66,72
...,...,...,...
9994,1975,13200,14502
9995,18504,13293,14633
9996,2577,13889,14826
9997,6507,14838,16101


In [127]:
reviews_per_beer_reducedX2.to_csv('./Beer_Data/reduced_reviews_per_beer_X2.csv', index=False)