In [1]:
from collections import defaultdict
from sklearn import linear_model
import numpy
import math

In [2]:
def feat(d, catID, maxLength, includeCat = True, includeReview = True, includeLength = True):
    """Build the feature vector for a single review record.

    Up to three feature groups are concatenated, in this order, followed by a
    constant 1 (the offset term):

    * one-hot encoding of ``d['beer/style']`` over the styles listed in ``catID``
    * the value of ``d['review/aroma']``
    * ``len(d['review/text'])`` divided by ``maxLength``

    Args:
        d: review record (dict); only the keys needed by the enabled groups are read.
        catID: mapping from beer style to its index in the one-hot vector.
        maxLength: divisor used to normalise the review length; when it is not
            positive the length feature is 0.
        includeCat: include the one-hot style features.
        includeReview: include the rating feature.
        includeLength: include the normalised length feature.

    Returns:
        list: the selected features followed by a trailing 1.
    """
    vector = []

    if includeCat:
        # One slot per known style; a style missing from catID leaves every slot at zero
        one_hot = [0] * len(catID)
        beer_style = d['beer/style']
        if beer_style in catID:
            one_hot[catID[beer_style]] = 1
        vector.extend(one_hot)

    if includeReview:
        # NOTE(review): only the aroma rating is used as the review feature --
        # confirm this is the intended field.
        vector.append(d['review/aroma'])

    if includeLength:
        # Review length scaled by maxLength (guarding against a zero divisor)
        text_length = len(d['review/text'])
        if maxLength > 0:
            vector.append(text_length / maxLength)
        else:
            vector.append(0)

    # Constant offset feature always goes last
    vector.append(1)
    return vector

In [3]:
def BER(yTrue, yPred):
    """Balanced Error Rate: the mean of the false positive and false negative rates.

    The counts are computed directly, so the function also works when only one
    class is present (where unpacking a confusion matrix into four cells fails)
    and when a class is absent from ``yTrue`` (where a rate would be 0/0).

    Args:
        yTrue: sized sequence of true binary labels (e.g. bools or 0/1).
        yPred: sized sequence of predicted labels, same length as ``yTrue``.

    Returns:
        float: ``0.5 * (FPR + FNR)``. A rate whose denominator is zero (no
        negatives, or no positives, in ``yTrue``) contributes 0.0.

    Raises:
        ValueError: if the inputs are empty, differ in length, or contain more
            than two distinct labels.
    """
    if len(yTrue) != len(yPred):
        raise ValueError("yTrue and yPred must have the same length")
    if len(yTrue) == 0:
        raise ValueError("BER is undefined for empty input")

    pairs = list(zip(yTrue, yPred))
    labels = sorted({label for pair in pairs for label in pair})
    if len(labels) > 2:
        raise ValueError("BER is only defined for binary labels")

    if len(labels) == 2:
        # The larger label is the positive class, matching the sorted-label
        # layout (tn, fp, fn, tp) of a binary confusion matrix
        positive = labels[1]
        flags = [(truth == positive, pred == positive) for truth, pred in pairs]
    else:
        # Only one label present: fall back to its truthiness to pick the class
        flags = [(bool(truth), bool(pred)) for truth, pred in pairs]

    fp = sum(1 for truth, pred in flags if not truth and pred)
    fn = sum(1 for truth, pred in flags if truth and not pred)
    negatives = sum(1 for truth, _ in flags if not truth)
    positives = len(flags) - negatives

    fpr = fp / negatives if negatives else 0.0
    fnr = fn / positives if positives else 0.0
    return 0.5 * (fpr + fnr)

In [4]:
def pipeline(reg, catID, dataTrain, dataValid, dataTest, includeCat=True, includeReview=True, includeLength=True):
    """Train a balanced logistic regression and report its validation and test BER.

    The label for each record is whether ``d['beer/ABV'] > 7``; features come
    from ``feat`` with the given ``include*`` switches.

    Args:
        reg: value passed as ``C`` to ``LogisticRegression``.
        catID: beer-style-to-index mapping forwarded to ``feat``.
        dataTrain, dataValid, dataTest: lists of review records for each split.
        includeCat, includeReview, includeLength: feature-group switches
            forwarded to ``feat``.

    Returns:
        tuple: ``(fitted model, validation BER, test BER)``.
    """
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced')

    # Review lengths are normalised by the longest review in the training split only
    maxLength = max(len(record['review/text']) for record in dataTrain)

    def featurize(split):
        return [feat(record, catID, maxLength, includeCat, includeReview, includeLength) for record in split]

    def labelize(split):
        return [record['beer/ABV'] > 7 for record in split]

    Xtrain, Xvalid, Xtest = featurize(dataTrain), featurize(dataValid), featurize(dataTest)
    yTrain, yValid, yTest = labelize(dataTrain), labelize(dataValid), labelize(dataTest)

    # (1) Fit the model on the training set
    mod.fit(Xtrain, yTrain)

    # (2) Predict on the held-out splits
    validPredictions = mod.predict(Xvalid)
    testPredictions = mod.predict(Xtest)

    # (3) Score both sets of predictions with the balanced error rate
    vBER = BER(yValid, validPredictions)
    tBER = BER(yTest, testPredictions)

    return mod, vBER, tBER

In [5]:
### Question 1

In [6]:
def Q1(catID, dataTrain, dataValid, dataTest):
    """Question 1: run ``pipeline`` with C=10 using only the beer-style one-hot features.

    Returns:
        tuple: ``(model, validation BER, test BER)`` as produced by ``pipeline``.
    """
    return pipeline(
        10, catID, dataTrain, dataValid, dataTest,
        includeCat=True, includeReview=False, includeLength=False,
    )

In [7]:
### Question 2

In [8]:
def Q2(catID, dataTrain, dataValid, dataTest):
    """Question 2: run ``pipeline`` with C=10 and all three feature groups enabled.

    Returns:
        tuple: ``(model, validation BER, test BER)`` as produced by ``pipeline``.
    """
    return pipeline(
        10, catID, dataTrain, dataValid, dataTest,
        includeCat=True, includeReview=True, includeLength=True,
    )

In [9]:
### Question 3

In [10]:
def Q3(catID, dataTrain, dataValid, dataTest):
    """Question 3: pick the regularisation strength C with the lowest validation BER.

    A balanced logistic regression (``max_iter=1000``) is fitted for each C in
    ``[0.001, 0.01, 0.1, 1, 10]`` on all three feature groups. On ties the
    first (smallest) C wins because the comparison is strict.

    Args:
        catID: beer-style-to-index mapping forwarded to ``feat``.
        dataTrain, dataValid, dataTest: lists of review records for each split.

    Returns:
        tuple: ``(model, validation BER, test BER)`` for the model that performs
        best on the validation set.
    """
    maxLength = max(len(d['review/text']) for d in dataTrain)

    # Features and labels do not depend on C, so they are built once up front
    # rather than on every iteration of the search loop
    Xtrain = [feat(d, catID, maxLength) for d in dataTrain]
    Xvalid = [feat(d, catID, maxLength) for d in dataValid]
    Xtest = [feat(d, catID, maxLength) for d in dataTest]

    yTrain = [d['beer/ABV'] > 7 for d in dataTrain]
    yValid = [d['beer/ABV'] > 7 for d in dataValid]
    yTest = [d['beer/ABV'] > 7 for d in dataTest]

    bestBER = float('inf')
    bestModel = None
    bestValidBER = None
    bestTestBER = None

    for c in [0.001, 0.01, 0.1, 1, 10]:
        # NOTE(review): max_iter=1000 here differs from pipeline(), which uses
        # the LogisticRegression default -- confirm this is intentional.
        mod = linear_model.LogisticRegression(C=c, class_weight='balanced', max_iter=1000)
        mod.fit(Xtrain, yTrain)

        vBER = BER(yValid, mod.predict(Xvalid))
        tBER = BER(yTest, mod.predict(Xtest))

        if vBER < bestBER:
            bestBER = vBER
            bestModel = mod
            bestValidBER = vBER
            bestTestBER = tBER

    # Report the validation and test BER of the model selected on validation
    return bestModel, bestValidBER, bestTestBER

In [11]:
### Question 4

In [12]:
def Q4(C, catID, dataTrain, dataValid, dataTest):
    """Question 4: ablation study -- test BER with each feature group removed in turn.

    Args:
        C: regularisation value forwarded to ``pipeline``.
        catID: beer-style-to-index mapping forwarded to ``pipeline``.
        dataTrain, dataValid, dataTest: lists of review records for each split.

    Returns:
        tuple: ``(testBER_noCat, testBER_noReview, testBER_noLength)``.
    """
    # Each row switches off exactly one of (includeCat, includeReview, includeLength)
    ablations = [
        (False, True, True),
        (True, False, True),
        (True, True, False),
    ]
    # pipeline returns (model, validBER, testBER); only the test BER is kept
    testBERs = [
        pipeline(C, catID, dataTrain, dataValid, dataTest, *flags)[2]
        for flags in ablations
    ]
    return tuple(testBERs)

In [13]:
### Question 5

In [14]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: ``|s1 & s2| / |s1 | s2|``.

    Returns 0 when both sets are empty (the union has no elements).
    """
    shared = s1 & s2
    combined_size = len(s1 | s2)
    if combined_size == 0:
        return 0
    return len(shared) / combined_size

In [15]:
def mostSimilar(i, N, usersPerItem):
    """Return the ``N`` items most similar to item ``i`` by Jaccard similarity.

    Similarity is computed between the user sets stored in ``usersPerItem``;
    item ``i`` itself is excluded.

    Returns:
        list: ``(similarity, itemID)`` pairs sorted in descending order
        (ties are ordered by item ID, also descending).
    """
    target_users = usersPerItem[i]

    scored = [
        (Jaccard(target_users, usersPerItem[candidate]), candidate)
        for candidate in usersPerItem
        if not candidate == i
    ]

    # Highest similarity first, then keep the top N
    ranked = sorted(scored, reverse=True)
    return ranked[:N]

In [16]:
### Question 6

In [17]:
def MSE(y, ypred):
    """Mean squared error between true values ``y`` and predictions ``ypred``.

    Args:
        y: sized sequence of true values.
        ypred: iterable of predictions, one per element of ``y``.

    Returns:
        The mean of ``(y[k] - ypred[k]) ** 2`` over all elements.

    Raises:
        ValueError: if ``y`` and ``ypred`` differ in length. Without this check
            ``zip`` would silently drop the unmatched tail while the sum was
            still divided by ``len(y)``, giving a wrong result.
        ZeroDivisionError: if ``y`` is empty.
    """
    # Materialise so the length can be checked even when ypred is a generator
    ypred = list(ypred)
    if len(y) != len(ypred):
        raise ValueError(
            "y and ypred must have the same length (got %d and %d)" % (len(y), len(ypred))
        )
    return sum((a - b) ** 2 for a, b in zip(y, ypred)) / len(y)

In [18]:
def getMeanRating(dataTrain):
    """Return the mean of ``d['star_rating']`` over all records in ``dataTrain``."""
    rating_sum = 0
    for record in dataTrain:
        rating_sum += record['star_rating']
    return rating_sum / len(dataTrain)

def getUserAverages(itemsPerUser, ratingDict):
    """Map each user to the average of their ratings.

    Args:
        itemsPerUser: not used by this implementation.
        ratingDict: mapping from ``(user, item)`` pairs to a rating.

    Returns:
        dict: user -> mean rating over that user's entries in ``ratingDict``.
    """
    # user -> [running total, number of ratings]
    tallies = {}
    for (user, _item), rating in ratingDict.items():
        tally = tallies.setdefault(user, [0.0, 0])
        tally[0] += rating
        tally[1] += 1

    return {user: total / count for user, (total, count) in tallies.items()}

def getItemAverages(usersPerItem, ratingDict):
    """Map each item to the average of the ratings it received.

    Args:
        usersPerItem: not used by this implementation.
        ratingDict: mapping from ``(user, item)`` pairs to a rating.

    Returns:
        dict: item -> mean rating over that item's entries in ``ratingDict``.
    """
    # item -> [running total, number of ratings]
    tallies = {}
    for (_user, item), rating in ratingDict.items():
        tally = tallies.setdefault(item, [0.0, 0])
        tally[0] += rating
        tally[1] += 1

    return {item: total / count for item, (total, count) in tallies.items()}

In [19]:
def predictRating(user,item,ratingMean,reviewsPerUser,usersPerItem,itemsPerUser,userAverages,itemAverages):
    """Predict ``user``'s rating of ``item`` with item-based collaborative filtering.

    The prediction is the item's average rating plus the Jaccard-weighted mean
    of the user's deviations ``(rating - item average)`` on the other items
    they reviewed.

    Args:
        user: user ID.
        item: item ID to predict for.
        ratingMean: global mean rating, used for items without an average.
        reviewsPerUser: mapping user -> list of review dicts with
            ``'product_id'`` and ``'star_rating'`` keys.
        usersPerItem: mapping item -> set of users who rated it.
        itemsPerUser: not used by this implementation.
        userAverages: not used by this implementation.
        itemAverages: mapping item -> average rating.

    Returns:
        ``ratingMean`` if ``item`` has no average; the item's average if the
        user has no reviewed item with positive similarity to ``item``
        (including users with no reviews at all); otherwise the
        similarity-weighted prediction.
    """
    if item not in itemAverages:
        return ratingMean  # fallback to global average if item unseen

    itemMean = itemAverages[item]
    # Loop-invariant: the set of users who rated the target item
    users_i = usersPerItem.get(item, set())

    numerator = 0.0
    denominator = 0.0

    # .get() so that an unseen user neither raises KeyError (plain dict) nor
    # inserts an empty entry as a side effect (defaultdict)
    for review in reviewsPerUser.get(user, ()):
        j = review['product_id']
        if j == item:
            continue
        users_j = usersPerItem.get(j, set())
        union = users_i | users_j
        if not union:
            continue
        sim = len(users_i & users_j) / len(union)

        if sim > 0:
            ruj = review['star_rating']
            rj = itemAverages.get(j, ratingMean)
            numerator += (ruj - rj) * sim
            denominator += sim

    if denominator == 0:
        return itemMean  # fallback to item average

    return itemMean + numerator / denominator

In [20]:
### Question 7

In [22]:
def predictRatingQ7(user, item, ratingMean, reviewsPerUser, usersPerItem, itemsPerUser, userAverages, itemAverages):
    """Predict ``user``'s rating of ``item``, with extra fallbacks over ``predictRating``.

    Fallback order:

    1. ``item`` has no average: return the user's average, or ``ratingMean``
       if the user has none either.
    2. At least one other item reviewed by the user has positive Jaccard
       similarity to ``item``: return the item's average plus the
       similarity-weighted mean deviation.
    3. No similarity signal but the user has an average: return the midpoint of
       the item and user averages.
    4. Otherwise: return the item's average.

    Args:
        user: user ID.
        item: item ID to predict for.
        ratingMean: global mean rating.
        reviewsPerUser: mapping user -> list of review dicts with
            ``'product_id'`` and ``'star_rating'`` keys.
        usersPerItem: mapping item -> set of users who rated it.
        itemsPerUser: not used by this implementation.
        userAverages: mapping user -> average rating.
        itemAverages: mapping item -> average rating.

    Returns:
        The predicted rating.
    """
    itemAvg = itemAverages.get(item, None)
    userAvg = userAverages.get(user, None)

    # If item is unseen, fall back to user average or global mean
    if itemAvg is None:
        return userAvg if userAvg is not None else ratingMean

    # Loop-invariant: the set of users who rated the target item
    users_i = usersPerItem.get(item, set())

    numerator = 0.0
    denominator = 0.0

    # Similarity-weighted deviation from item averages. .get() so that an
    # unseen user neither raises KeyError (plain dict) nor inserts an empty
    # entry as a side effect (defaultdict).
    for review in reviewsPerUser.get(user, ()):
        j = review['product_id']
        if j == item:
            continue
        users_j = usersPerItem.get(j, set())
        union = users_i | users_j
        if not union:
            continue
        sim = len(users_i & users_j) / len(union)

        if sim > 0:
            ruj = review['star_rating']
            rj = itemAverages.get(j, ratingMean)
            numerator += (ruj - rj) * sim
            denominator += sim

    # Any positive similarity is enough to use the collaborative estimate
    if denominator > 0:
        return itemAvg + numerator / denominator

    # Otherwise, blend item and user averages if both exist
    if userAvg is not None:
        return 0.5 * itemAvg + 0.5 * userAvg

    # Final fallback
    return itemAvg
