In [1]:
from collections import defaultdict
from sklearn import linear_model, metrics
import numpy
import math

In [2]:
### Question 1

In [3]:
def getMaxLen(dataset):
    """Return the character count of the longest 'review_text' in dataset.

    Raises ValueError when dataset is empty, since max() has nothing to
    compare.
    """
    review_lengths = (len(entry['review_text']) for entry in dataset)
    return max(review_lengths)

In [4]:
def featureQ1(datum, maxLen):
    """Build the Q1 feature vector: [bias term, review length / maxLen].

    datum must provide 'review_text'; maxLen is the divisor used to scale
    the character count.
    """
    n_chars = len(datum['review_text'])
    return [1, n_chars / maxLen]

In [5]:
def Q1(dataset):
    """Fit rating ~ theta0 + theta1 * (scaled review length) by least squares.

    dataset is iterated more than once, so it must be a re-iterable
    collection of records providing 'review_text' and 'rating'.

    Returns:
        (theta, MSE): the fitted coefficient vector (numpy array of length 2)
        and the mean squared error of the fit on the same data.
    """
    max_review_length = getMaxLen(dataset)
    X = numpy.array([featureQ1(datum, max_review_length) for datum in dataset])
    Y = numpy.array([datum['rating'] for datum in dataset])

    # Solve the least-squares problem directly instead of forming
    # inv(X^T X) @ X^T @ Y: lstsq is numerically more stable and still
    # returns a (minimum-norm) solution when X is rank deficient, e.g. when
    # every review has the same length and the two columns are identical.
    theta, _, _, _ = numpy.linalg.lstsq(X, Y, rcond=None)

    # Predictions on the training data and their mean squared error
    Y_pred = X @ theta
    MSE = numpy.mean((Y_pred - Y) ** 2)

    return theta, MSE

In [6]:
### Question 2

In [7]:
def featureQ2(datum, maxLen):
    """Build the Q2 feature vector with one-hot weekday and month indicators.

    Layout (19 entries):
        [0]     bias term (always 1)
        [1]     len(review_text) / maxLen
        [2:8]   indicators for weekday() values 1..6 (value 0 is the dropped
                reference level)
        [8:19]  indicators for month values 2..12 (value 1 is the dropped
                reference level)

    datum['parsed_date'] must expose .weekday() and .month.
    """
    length_feature = len(datum['review_text']) / maxLen
    review_date = datum['parsed_date']

    day_of_week = review_date.weekday()  # Monday=0, .., Sunday=6
    weekday_flags = [int(day_of_week == d) for d in range(1, 7)]

    month_number = review_date.month  # January=1, ..., December=12
    month_flags = [int(month_number == m) for m in range(2, 13)]

    return [1, length_feature] + weekday_flags + month_flags

In [8]:
def Q2(dataset):
    """Fit a least-squares rating model on the one-hot (featureQ2) features.

    dataset is iterated more than once, so it must be a re-iterable
    collection of records providing 'review_text', 'parsed_date' and
    'rating'.

    Returns:
        (X2, Y2, MSE2): the feature matrix, the label vector, and the mean
        squared error of the fit on the same data (a scalar, not an array).
    """
    maxLen = getMaxLen(dataset)
    X2 = numpy.array([featureQ2(datum, maxLen) for datum in dataset])
    Y2 = numpy.array([datum['rating'] for datum in dataset])

    # Use lstsq rather than inv(X^T X) @ X^T @ Y. With one-hot columns, any
    # weekday or month that never occurs in the data produces an all-zero
    # column, which makes X^T X singular; lstsq still returns the
    # minimum-norm least-squares solution in that case.
    theta, _, _, _ = numpy.linalg.lstsq(X2, Y2, rcond=None)

    # Predictions on the training data
    Y_pred = X2 @ theta

    # Mean squared error (numpy.mean over a 1-D array yields a scalar)
    MSE2 = numpy.mean((Y_pred - Y2) ** 2)

    return X2, Y2, MSE2

In [9]:
### Question 3

In [10]:
def featureQ3(datum, maxLen):
    """Build the Q3 feature vector using raw (un-encoded) date values.

    Returns [1, len(review_text) / maxLen, weekday, month], where weekday is
    datum['parsed_date'].weekday() and month is datum['parsed_date'].month.
    """
    length_feature = len(datum['review_text']) / maxLen
    review_date = datum['parsed_date']
    return [
        1,                      # bias term
        length_feature,         # scaled review length
        review_date.weekday(),  # 0 = Monday, .., 6 = Sunday
        review_date.month,      # 1 = January, .., 12 = December
    ]

In [11]:
def Q3(dataset):
    """Fit a least-squares rating model on the raw-valued (featureQ3) features.

    dataset is iterated more than once, so it must be a re-iterable
    collection of records providing 'review_text', 'parsed_date' and
    'rating'.

    Returns:
        (X3, Y3, MSE3): the feature matrix, the label vector, and the mean
        squared error of the fit on the same data.
    """
    maxLen = getMaxLen(dataset)
    X3 = numpy.array([featureQ3(datum, maxLen) for datum in dataset])
    Y3 = numpy.array([datum['rating'] for datum in dataset])

    # Use lstsq rather than inv(X^T X) @ X^T @ Y: it avoids the explicit
    # inverse and still returns a solution when the columns are linearly
    # dependent (e.g. every record falls on the same weekday, so the weekday
    # column is a multiple of the bias column).
    theta, _, _, _ = numpy.linalg.lstsq(X3, Y3, rcond=None)

    # Predictions on the training data and their mean squared error
    Y_pred = X3 @ theta
    MSE3 = numpy.mean((Y_pred - Y3) ** 2)

    return X3, Y3, MSE3

In [12]:
### Question 4

In [13]:
def _q4_design_matrix(data, feat_func, maxLen):
    """Stack feat_func(datum, maxLen) rows and 'rating' labels into arrays."""
    X = numpy.array([feat_func(datum, maxLen) for datum in data])
    Y = numpy.array([datum['rating'] for datum in data])
    return X, Y


def _q4_test_mse(train_data, test_data, feat_func, maxLen):
    """Fit least squares on train_data and return the MSE on test_data."""
    X_train, Y_train = _q4_design_matrix(train_data, feat_func, maxLen)
    X_test, Y_test = _q4_design_matrix(test_data, feat_func, maxLen)

    # lstsq instead of inv(X^T X) @ X^T @ Y: a weekday or month that is
    # absent from the training half gives an all-zero one-hot column and a
    # singular X^T X; lstsq still yields the minimum-norm solution.
    theta, _, _, _ = numpy.linalg.lstsq(X_train, Y_train, rcond=None)

    Y_pred_test = X_test @ theta
    return numpy.mean((Y_pred_test - Y_test) ** 2)


def Q4(dataset):
    """Compare one-hot and raw date encodings on a 50/50 train/test split.

    The first half of dataset (by position) is used for training and the
    second half for testing. dataset must support len() and slicing, and its
    records must provide 'review_text', 'parsed_date' and 'rating'.

    Returns:
        (test_mse2, test_mse3): test-set mean squared error of the one-hot
        model (featureQ2) and of the raw-value model (featureQ3).
    """
    split = len(dataset) // 2
    train_data = dataset[:split]
    test_data = dataset[split:]

    # The length feature must be scaled by the SAME constant for both halves.
    # The coefficients are learned on features divided by the training
    # maximum, so dividing test lengths by a different (test-set) maximum
    # would feed the model inputs on a different scale than it was fit on.
    maxLen_train = getMaxLen(train_data)

    # CASE: one-hot encoding
    test_mse2 = _q4_test_mse(train_data, test_data, featureQ2, maxLen_train)

    # CASE: no encoding (raw weekday / month values)
    test_mse3 = _q4_test_mse(train_data, test_data, featureQ3, maxLen_train)

    return test_mse2, test_mse3

In [14]:
### Question 5

In [15]:
def featureQ5(datum):
    """Return the one-element feature vector [len(datum['review/text'])]."""
    text = datum['review/text']
    return [len(text)]

In [16]:
def Q5(dataset, feat_func):
    """Train a class-balanced logistic regressor and score it on its own training data.

    Features come from feat_func(datum); the label is 1 when
    datum['review/overall'] >= 4.0 and 0 otherwise.

    Returns:
        (TP, TN, FP, FN, BER): confusion-matrix counts and the balanced
        error rate, i.e. the mean of the false-positive and false-negative
        rates.
    """
    # Features first, then labels, each in its own pass over the dataset
    features = list(map(feat_func, dataset))
    labels = [int(datum['review/overall'] >= 4.0) for datum in dataset]

    # Fit the classifier and predict on the same data
    classifier = linear_model.LogisticRegression(class_weight='balanced')
    classifier.fit(features, labels)
    predictions = classifier.predict(features)

    # Rows of the confusion matrix are true labels, columns are predictions
    (TN, FP), (FN, TP) = metrics.confusion_matrix(labels, predictions)

    # Balanced error rate: average of the two per-class error rates
    false_positive_rate = FP / (FP + TN)
    false_negative_rate = FN / (FN + TP)
    BER = (false_positive_rate + false_negative_rate) / 2.0

    return TP, TN, FP, FN, BER

In [17]:
### Question 6

In [18]:
def Q6(dataset):
    """Compute precision@K (K = 1, 100, 1000, 10000) for the review-length classifier.

    A class-balanced logistic regressor is trained on featureQ5 features
    with label 1 when datum['review/overall'] >= 4.0. Records are ranked by
    the second column of predict_proba, highest first, and precision@K is
    the number of label-1 records among the first K ranked, divided by K.

    Returns:
        list of four precision values, in the order of K listed above.
    """
    features = [featureQ5(datum) for datum in dataset]
    labels = [int(datum['review/overall'] >= 4.0) for datum in dataset]

    classifier = linear_model.LogisticRegression(class_weight='balanced')
    classifier.fit(features, labels)

    # Rank record indices from highest to lowest predicted probability
    positive_probs = classifier.predict_proba(features)[:, 1]
    ranking = positive_probs.argsort()[::-1]

    # The denominator is always K, even if fewer than K records exist
    return [
        sum(labels[idx] for idx in ranking[:K]) / K
        for K in (1, 100, 1000, 10000)
    ]

In [19]:
### Question 7

In [20]:
def featureQ7(datum):
    """Return [review length, 'beer/ABV' value, 'review/aroma' value] for datum.

    Review length is the character count of datum['review/text']; the other
    two entries are passed through unchanged.
    """
    return [
        len(datum['review/text']),
        datum['beer/ABV'],
        datum['review/aroma'],
    ]