In [14]:
from collections import defaultdict
from sklearn import linear_model
import numpy
import math

ModuleNotFoundError: No module named 'sklearn'

In [None]:
### Question 1

In [None]:
def getMaxLen(dataset):
    """Return the character length of the longest review in ``dataset``.

    For each datum the review text is taken from the first key of
    'review/text', 'text', 'review' that is present. This is the same lookup
    rule the feature functions in this file use, so the value returned here
    is the maximum of exactly the lengths those functions will scale.

    Args:
        dataset: iterable of dict-like records.

    Returns:
        The maximum text length found, or 0 when ``dataset`` is empty or no
        datum carries any of the text keys.
    """
    text_fields = ['review/text', 'text', 'review']
    longest = 0
    for datum in dataset:
        # Only the first text field present counts; a datum without any text
        # field contributes length 0.
        text_len = next((len(datum[field]) for field in text_fields if field in datum), 0)
        if text_len > longest:
            longest = text_len
    return longest

In [None]:
def featureQ1(datum, maxLen):
    """Build the Q1 feature vector ``[1, scaled_length]`` for one datum.

    The text length comes from the first key of 'review/text', 'text',
    'review' present in ``datum`` (0 if none is present) and is divided by
    ``maxLen``. When ``maxLen`` is not positive the scaled length is 0.
    """
    review_len = 0
    for key in ('review/text', 'text', 'review'):
        if key in datum:
            review_len = len(datum[key])
            break

    # Guard against division by zero when the reference length is 0.
    if maxLen > 0:
        return [1, review_len / maxLen]
    return [1, 0]

In [None]:
def Q1(dataset):
    """Q1: linear regression of rating on scaled review length.

    Features come from ``featureQ1`` (constant 1 plus length scaled by the
    longest review in ``dataset``). The label is the first of 'rating',
    'overall', 'stars', 'review/overall' present in the datum, or 0 when
    none is present.

    Returns:
        (theta, MSE) where ``theta`` is ``[intercept_, coef_[1]]`` of the
        fitted model (``coef_[1]`` belongs to the scaled-length column) and
        ``MSE`` is the mean squared error on the same data used for fitting.
    """
    longest = getMaxLen(dataset)
    label_keys = ['rating', 'overall', 'stars', 'review/overall']

    features, labels = [], []
    for datum in dataset:
        features.append(featureQ1(datum, longest))
        # First rating key present wins; 0 if the datum has none of them.
        labels.append(next((datum[key] for key in label_keys if key in datum), 0))

    X = numpy.array(features)
    Y = numpy.array(labels)

    model = linear_model.LinearRegression()
    model.fit(X, Y)

    residuals = Y - model.predict(X)
    MSE = numpy.mean(residuals ** 2)
    theta = [model.intercept_, model.coef_[1]]
    return theta, MSE

In [None]:
### Question 2

In [None]:
def featureQ2(datum, maxLen):
    """Build the Q2 feature vector: constant, scaled length, one-hot date.

    Layout: ``[1, scaled_length] + weekday one-hot (7) + month one-hot (12)``.

    The text length is read from the first of 'review/text', 'text',
    'review' present in ``datum`` (0 if none) and divided by ``maxLen``
    (0 when ``maxLen`` is not positive). Weekday and month are read from
    ``datum['parsed_date']`` via ``.weekday()`` and ``.month``; when that
    entry is missing or falsy, weekday 0 and month 1 are used.
    """
    review_len = 0
    for key in ('review/text', 'text', 'review'):
        if key in datum:
            review_len = len(datum[key])
            break
    length_feature = review_len / maxLen if maxLen > 0 else 0

    when = datum.get('parsed_date')
    if when:
        day_index, month_number = when.weekday(), when.month
    else:
        # No usable date: fall back to weekday 0 / month 1.
        day_index, month_number = 0, 1

    # Weekday slots are indexed 0..6, month slots 1..12.
    day_onehot = [int(day_index == slot) for slot in range(7)]
    month_onehot = [int(month_number == slot) for slot in range(1, 13)]

    vector = [1, length_feature]
    vector.extend(day_onehot)
    vector.extend(month_onehot)
    return vector

In [None]:
def Q2(dataset):
    """Q2: linear regression on length plus one-hot weekday/month features.

    Features come from ``featureQ2`` with lengths scaled by the longest
    review in ``dataset``. The label is the first of 'rating', 'overall',
    'stars', 'review/overall' present in the datum, or 0 when none is.

    Returns:
        (X2, Y2, MSE2): the feature matrix and label vector as numpy arrays,
        and the mean squared error of the fitted model on that same data.
    """
    longest = getMaxLen(dataset)
    label_keys = ['rating', 'overall', 'stars', 'review/overall']

    def rating_of(datum):
        # First rating key present wins; 0 if the datum has none of them.
        for key in label_keys:
            if key in datum:
                return datum[key]
        return 0

    X2 = numpy.array([featureQ2(datum, longest) for datum in dataset])
    Y2 = numpy.array([rating_of(datum) for datum in dataset])

    model = linear_model.LinearRegression()
    model.fit(X2, Y2)

    squared_errors = (Y2 - model.predict(X2)) ** 2
    MSE2 = numpy.mean(squared_errors)
    return X2, Y2, MSE2

In [None]:
### Question 3

In [None]:
def featureQ3(datum, maxLen):
    """Build the Q3 feature vector ``[1, scaled_length, weekday, month]``.

    Unlike the one-hot variant, weekday and month are passed through as
    plain numbers. The text length is read from the first of 'review/text',
    'text', 'review' present in ``datum`` (0 if none) and divided by
    ``maxLen`` (0 when ``maxLen`` is not positive). Weekday and month come
    from ``datum['parsed_date']`` via ``.weekday()`` and ``.month``; when
    that entry is missing or falsy, weekday 0 and month 1 are used.
    """
    review_len = 0
    for key in ('review/text', 'text', 'review'):
        if key in datum:
            review_len = len(datum[key])
            break
    length_feature = review_len / maxLen if maxLen > 0 else 0

    when = datum.get('parsed_date')
    if not when:
        # No usable date: fall back to weekday 0 / month 1.
        return [1, length_feature, 0, 1]
    return [1, length_feature, when.weekday(), when.month]

In [None]:
def Q3(dataset):
    """Q3: linear regression on length plus numeric weekday/month features.

    Features come from ``featureQ3`` with lengths scaled by the longest
    review in ``dataset``. The label is the first of 'rating', 'overall',
    'stars', 'review/overall' present in the datum, or 0 when none is.

    Returns:
        (X3, Y3, MSE3): the feature matrix and label vector as numpy arrays,
        and the mean squared error of the fitted model on that same data.
    """
    longest = getMaxLen(dataset)
    label_keys = ['rating', 'overall', 'stars', 'review/overall']

    rows = []
    for datum in dataset:
        rows.append(featureQ3(datum, longest))

    targets = []
    for datum in dataset:
        # First rating key present wins; 0 if the datum has none of them.
        targets.append(next((datum[key] for key in label_keys if key in datum), 0))

    X3 = numpy.array(rows)
    Y3 = numpy.array(targets)

    model = linear_model.LinearRegression()
    model.fit(X3, Y3)

    errors = Y3 - model.predict(X3)
    MSE3 = numpy.mean(errors ** 2)
    return X3, Y3, MSE3

In [None]:
### Question 4

In [None]:
def Q4(dataset):
    """Q4: compare the Q2 and Q3 feature sets on a held-out half of the data.

    The first ``len(dataset) // 2`` records are used for training and the
    remainder for testing. Review lengths are scaled by the longest review
    in the training half only, for both halves. The label is the first of
    'rating', 'overall', 'stars', 'review/overall' present in the datum, or
    0 when none is.

    Returns:
        (test_mse2, test_mse3): test-set mean squared error of the model
        trained on ``featureQ2`` features and of the model trained on
        ``featureQ3`` features, in that order.
    """
    half = len(dataset) // 2
    train_part = dataset[:half]
    test_part = dataset[half:]

    train_max_len = getMaxLen(train_part)
    label_keys = ['rating', 'overall', 'stars', 'review/overall']

    def labels_of(records):
        # First rating key present wins; 0 if the datum has none of them.
        return numpy.array(
            [next((rec[key] for key in label_keys if key in rec), 0) for rec in records]
        )

    def design_matrix(feature_fn, records):
        return numpy.array([feature_fn(rec, train_max_len) for rec in records])

    # Both models share the same labels; only the feature encoding differs.
    y_train = labels_of(train_part)
    y_test = labels_of(test_part)

    def held_out_mse(feature_fn):
        model = linear_model.LinearRegression()
        model.fit(design_matrix(feature_fn, train_part), y_train)
        predictions = model.predict(design_matrix(feature_fn, test_part))
        return numpy.mean((y_test - predictions) ** 2)

    test_mse2 = held_out_mse(featureQ2)
    test_mse3 = held_out_mse(featureQ3)
    return test_mse2, test_mse3

In [None]:
### Question 5

In [None]:
def featureQ5(datum):
    """Build the Q5 feature vector ``[1, length]`` for one datum.

    The length is the unscaled character count of the first of
    'review/text', 'text', 'review' present in ``datum``, or 0 when none of
    those keys is present.
    """
    for key in ('review/text', 'text', 'review'):
        if key in datum:
            return [1, len(datum[key])]
    return [1, 0]

In [None]:
def Q5(dataset, feat_func):
    """Q5: logistic-regression classifier for positive vs. negative reviews.

    A datum is labelled 1 when its rating is at least 4 and 0 otherwise. The
    rating is the first of 'rating', 'overall', 'stars', 'review/overall'
    present in the datum, or 0 when none is. The model is fit with
    ``class_weight='balanced'`` and evaluated on the same data.

    Args:
        dataset: iterable of dict-like records.
        feat_func: callable mapping one datum to its feature vector.

    Returns:
        (TP, TN, FP, FN, BER): confusion-matrix counts and the balanced
        error rate. BER is reported as 0.5 if either the actual-negative or
        the actual-positive count is zero.
    """
    label_keys = ['rating', 'overall', 'stars', 'review/overall']

    labels = []
    for datum in dataset:
        rating = next((datum[key] for key in label_keys if key in datum), 0)
        labels.append(1 if rating >= 4 else 0)

    features = [feat_func(datum) for datum in dataset]
    X = numpy.array(features)
    y = numpy.array(labels)

    model = linear_model.LogisticRegression(class_weight='balanced', random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)

    # Boolean masks for actual / predicted class membership.
    actual_pos, actual_neg = (y == 1), (y == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    TP = numpy.sum(actual_pos & pred_pos)
    TN = numpy.sum(actual_neg & pred_neg)
    FP = numpy.sum(actual_neg & pred_pos)
    FN = numpy.sum(actual_pos & pred_neg)

    negatives = FP + TN
    positives = FN + TP
    if negatives > 0 and positives > 0:
        # Mean of false-positive rate and false-negative rate.
        BER = 0.5 * (FP / negatives + FN / positives)
    else:
        BER = 0.5

    return TP, TN, FP, FN, BER

In [None]:
### Question 6

In [None]:
def Q6(dataset):
    """Q6: Precision@K of the Q5 classifier for K in {1, 100, 1000, 10000}.

    A datum is labelled 1 when its rating is at least 4 and 0 otherwise. The
    rating is the first of 'rating', 'overall', 'stars', 'review/overall'
    present in the datum, or 0 when none is. Features come from
    ``featureQ5``. The model is fit with ``class_weight='balanced'`` and the
    data are ranked by predicted probability of the positive class.

    Precision@K is the fraction of positives among the K highest-ranked
    items. When the dataset holds fewer than K items, the fraction is taken
    over the items actually available rather than over K, so a small dataset
    does not artificially deflate the score.

    Returns:
        List of four precision values, in the order K = 1, 100, 1000, 10000.
    """
    rating_fields = ['rating', 'overall', 'stars', 'review/overall']
    y = [1 if next((datum[field] for field in rating_fields if field in datum), 0) >= 4 else 0 for datum in dataset]
    X = [featureQ5(datum) for datum in dataset]
    X, y = numpy.array(X), numpy.array(y)

    model = linear_model.LogisticRegression(class_weight='balanced', random_state=42)
    model.fit(X, y)

    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X)[:, 1]

    # Rank once (ascending by probability); every K reuses the same ordering.
    ranking = numpy.argsort(y_proba)
    n_items = len(y)

    K_values = [1, 100, 1000, 10000]
    precs = []
    for K in K_values:
        # Never count more items than exist: the tail slice would silently
        # return fewer than K entries while the denominator stayed at K.
        k_eff = min(K, n_items)
        top_k_labels = y[ranking[-k_eff:]]
        precs.append(numpy.sum(top_k_labels) / k_eff)

    return precs

In [None]:
### Question 7

In [None]:
def featureQ7(datum):
    """Build the enriched Q7 feature vector for one datum.

    Layout (17 values):
        [1, text_len, abv, appearance, aroma, taste, palate,
         log(text_len + 1), sqrt(text_len)] + 8 beer-style flags

    * ``text_len`` is the character count of the first of 'review/text',
      'text', 'review' present in ``datum`` (0 if none).
    * ``abv`` and the four sub-scores are read from 'beer/ABV',
      'review/appearance', 'review/aroma', 'review/taste', 'review/palate',
      each defaulting to 0 when absent.
    * Each style flag is 1 when the corresponding keyword ('IPA', 'Stout',
      'Porter', 'Lager', 'Ale', 'Wheat', 'Pilsner', 'Saison') occurs as a
      case-sensitive substring of ``datum['beer/style']``, e.g. the style
      'American IPA' sets the 'IPA' flag. A missing or non-string style sets
      no flag.
    """
    text_fields = ['review/text', 'text', 'review']
    text_len = next((len(datum[field]) for field in text_fields if field in datum), 0)

    style = datum.get('beer/style', 'Unknown')
    if not isinstance(style, str):
        # e.g. an explicit None: treat as "no style information".
        style = ''
    common_styles = ['IPA', 'Stout', 'Porter', 'Lager', 'Ale', 'Wheat', 'Pilsner', 'Saison']
    # The keyword must be contained in the style name, not the other way
    # round; the reversed test only matched exact keywords (or empty styles).
    style_features = [1 if common_style in style else 0 for common_style in common_styles]

    abv = datum.get('beer/ABV', 0)

    appearance = datum.get('review/appearance', 0)
    aroma = datum.get('review/aroma', 0)
    taste = datum.get('review/taste', 0)
    palate = datum.get('review/palate', 0)

    # Non-linear transforms of the raw length; +1 keeps log defined at 0.
    log_length = numpy.log(text_len + 1)
    sqrt_length = numpy.sqrt(text_len)

    return [1, text_len, abv, appearance, aroma, taste, palate, log_length, sqrt_length] + style_features