In [1]:
import gzip
import json
import dateutil.parser
import random
import numpy as np
from collections import defaultdict

In [2]:
import homework2

In [3]:
root = "./datasets/"

In [4]:
def parseData(fname):
    for l in open(fname):
        yield eval(l)

In [5]:
data = list(parseData(root + "/beer_50000.json"))

In [6]:
random.seed(0)
random.shuffle(data)

In [7]:
dataTrain = data[:25000]
dataValid = data[25000:37500]
dataTest = data[37500:]

In [8]:
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] += 1

In [9]:
categories = [c for c in categoryCounts if categoryCounts[c] > 1000]

In [10]:
catID = dict(zip(list(categories),range(len(categories))))

In [11]:
catID

{'American Porter': 0,
 'Fruit / Vegetable Beer': 1,
 'English Pale Ale': 2,
 'Rauchbier': 3,
 'American Pale Ale (APA)': 4,
 'Scotch Ale / Wee Heavy': 5,
 'American IPA': 6,
 'Old Ale': 7,
 'American Double / Imperial IPA': 8,
 'American Double / Imperial Stout': 9,
 'Czech Pilsener': 10,
 'Rye Beer': 11,
 'Russian Imperial Stout': 12}

In [12]:
def testQ1():
    mod, validBER, testBER = homework2.Q1(catID, dataTrain, dataValid, dataTest)
    return validBER, testBER

In [13]:
testQ1()

(np.float64(0.16130237168160533), np.float64(0.16078380246088317))

In [14]:
def testQ2():
    mod, validBER, testBER = homework2.Q2(catID, dataTrain, dataValid, dataTest)
    return validBER, testBER

In [15]:
testQ2()

(np.float64(0.14955542095051438), np.float64(0.1504285600694365))

In [16]:
def testQ3():
    mod, validBER, testBER = homework2.Q3(catID, dataTrain, dataValid, dataTest)
    return validBER, testBER

In [32]:
testQ3()

(np.float64(0.14814935617142105), np.float64(0.14981452284202257))

In [18]:
def testQ4():
    testBER_noCat, testBER_noReview, testBER_noLength = homework2.Q4(1, catID, dataTrain, dataValid, dataTest)
    return testBER_noCat, testBER_noReview, testBER_noLength

In [33]:
testQ4()

(np.float64(0.333408575796558),
 np.float64(0.16096052587282522),
 np.float64(0.14898302643878575))

In [20]:
path = root + "/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

header = f.readline()
header = header.strip().split('\t')

In [21]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [22]:
review_dataset = []

pairsSeen = set()

for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    ui = (d['customer_id'], d['product_id'])
    if ui in pairsSeen:
        print("Skipping duplicate user/item:", ui)
        continue
    pairsSeen.add(ui)
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    review_dataset.append(d)

Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [23]:
reviewDataTrain = review_dataset[:int(len(review_dataset)*0.9)]
reviewDataTest = review_dataset[int(len(review_dataset)*0.9):]

In [24]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list)

for d in reviewDataTrain:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)

for d in review_dataset:
    user,item = d['customer_id'], d['product_id']
    ratingDict[(user,item)] = d['star_rating']
    itemNames[item] = d['product_title']

In [25]:
def testQ5():
    # Note: your solution is autograded based on the relevance scores, not the items themselves
    return homework2.mostSimilar("B00KCHRKD6", 10, usersPerItem)

In [26]:
testQ5()

In [27]:
ratingMean = homework2.getMeanRating(reviewDataTrain)

userAverages = homework2.getUserAverages(itemsPerUser, ratingDict)

itemAverages = homework2.getItemAverages(usersPerItem, ratingDict)

In [28]:
def testQ6():
    alwaysPredictMean = [ratingMean for d in reviewDataTest]
    
    simPredictions = [homework2.predictRating(d['customer_id'],
                                              d['product_id'],
                                              ratingMean,
                                              reviewsPerUser,
                                              usersPerItem,
                                              itemsPerUser,
                                              userAverages,
                                              itemAverages) for d in reviewDataTest]

    labels = [d['star_rating'] for d in reviewDataTest]
    
    # Autograder checks the MSE of your predictions and (some of) the simPrediction values

In [29]:
testQ6()

In [30]:
def testQ7():
    alwaysPredictMean = [ratingMean for d in reviewDataTest]
    
    simPredictions = [homework2.predictRating(d['customer_id'],
                                              d['product_id'],
                                              ratingMean,
                                              reviewsPerUser,
                                              usersPerItem,
                                              itemsPerUser,
                                              userAverages,
                                              itemAverages) for d in reviewDataTest]
    

    q7Predictions = [homework2.predictRatingQ7(d['customer_id'],
                                               d['product_id'],
                                               ratingMean,
                                               reviewsPerUser,
                                               usersPerItem,
                                               itemsPerUser,
                                               userAverages,
                                               itemAverages) for d in reviewDataTest]
    
    labels = [d['star_rating'] for d in reviewDataTest]
    
    m1 = homework2.MSE(simPredictions, labels)
    m2 = homework2.MSE(q7Predictions, labels)
    m3 = homework2.MSE(alwaysPredictMean, labels)
    
    # Autograder checks whether your solution is better than either the Q6 or a naive solution
    return 1.0 * ((m2 < m1) and (m2 < m3))

In [31]:
testQ7()

1.0