In [59]:
import numpy as np
from sklearn.linear_model import LinearRegression
from dateutil import parser as _dateparser
from collections import defaultdict
from sklearn import linear_model
import numpy
import math
from sklearn.linear_model import LogisticRegression


In [60]:
### Question 1

In [61]:
def getMaxLen(dataset):
    """Return the character length of the longest review text in ``dataset``.

    For each record the text is taken from the first truthy value among the
    keys ``'reviewText'``, ``'review_text'`` and ``'text'``; a record with
    none of them counts as length 0. The value is passed through ``str()``
    before measuring. A falsy ``dataset`` (``None`` or empty) yields 0.
    """
    if not dataset:
        return 0
    text_lengths = (
        len(str(rec.get('reviewText') or rec.get('review_text') or rec.get('text') or ''))
        for rec in dataset
    )
    # Lengths are never negative, so default=0 only matters for an empty iterable.
    return max(text_lengths, default=0)

In [62]:
def featureQ1(datum, maxLen):
    """Build the Q1 feature vector ``[1, len(text) / maxLen]`` for one record.

    The text comes from the first truthy value among ``'reviewText'``,
    ``'review_text'`` and ``'text'`` (empty string if none). When ``maxLen``
    is falsy (e.g. 0) the normalised length is 0.0, so no division happens.

    Returns a float numpy array of length 2 (bias term, normalised length).
    """
    review = datum.get('reviewText') or datum.get('review_text') or datum.get('text') or ''
    n_chars = len(str(review))
    if maxLen:
        scaled = n_chars / maxLen
    else:
        scaled = 0.0
    return np.array([1.0, scaled], dtype=float)

In [63]:
def Q1(dataset):
    """Fit rating ~ theta0 + theta1 * normalised_length and report training MSE.

    Records are used only if a rating is found under ``'rating'``,
    ``'overall'`` or ``'stars'`` (looked up in that order). The bias is
    carried by the constant-1 column from ``featureQ1``, which is why the
    regression is fitted with ``fit_intercept=False``.

    Returns
    -------
    (theta, MSE)
        ``theta`` is the fitted coefficient array and ``MSE`` the mean squared
        error on the same data. With no usable records the result is
        ``(np.zeros(2), nan)``.
    """
    longest = getMaxLen(dataset)

    # Collect (feature row, target) pairs for every record that has a rating.
    pairs = []
    for rec in dataset or []:
        score = rec.get('rating', rec.get('overall', rec.get('stars')))
        if score is not None:
            pairs.append((featureQ1(rec, longest), float(score)))

    if not pairs:
        return np.zeros(2), float('nan')

    X = np.vstack([row for row, _ in pairs])
    y = np.array([target for _, target in pairs], dtype=float)

    model = LinearRegression(fit_intercept=False)
    model.fit(X, y)

    theta = model.coef_
    residuals = X @ theta - y
    MSE = float(np.mean(residuals ** 2))
    return theta, MSE

In [64]:
### Question 2

In [65]:
def featureQ2(datum, maxLen):
    """Build the 19-dimensional Q2 feature vector for one record.

    Layout: ``[1, normalised_length, 6 weekday indicators, 11 month indicators]``.

    * The date is ``datum['parsed_date']`` if present; otherwise the first
      truthy value among ``'date_added'``, ``'reviewTime'``, ``'review_time'``,
      ``'date'`` and ``'review_date'`` is parsed with ``_dateparser.parse``.
      Any parsing failure is treated as "no date".
    * Weekday values 0..5 each get an indicator; any other value (i.e. 6)
      leaves all six at zero. Months 1..11 each get an indicator; month 12
      leaves all eleven at zero.
    * A record with no usable date is also encoded as all zeros, so it is
      indistinguishable from weekday 6 / month 12.

    NOTE(review): assumes the date object exposes ``weekday()`` and ``month``
    like ``datetime.datetime`` -- confirm against how ``parsed_date`` is set.
    """
    text = str(datum.get('reviewText') or datum.get('review_text') or datum.get('text') or '')
    scaled_len = len(text) / maxLen if maxLen else 0.0

    when = datum.get('parsed_date')
    if when is None:
        raw_date = (datum.get('date_added') or datum.get('reviewTime') or
                    datum.get('review_time') or datum.get('date') or datum.get('review_date'))
        if raw_date:
            try:
                when = _dateparser.parse(str(raw_date))
            except Exception:
                # Best effort: an unparseable date is handled like a missing one.
                when = None

    weekday_slots = [0.0 for _ in range(6)]
    month_slots = [0.0 for _ in range(11)]
    if when is not None:
        weekday_idx = when.weekday()
        month_no = when.month
        # The last category of each variable is the all-zero reference level.
        if not (weekday_idx < 0 or weekday_idx > 5):
            weekday_slots[weekday_idx] = 1.0
        if not (month_no < 1 or month_no > 11):
            month_slots[month_no - 1] = 1.0

    return np.array([1.0, float(scaled_len), *weekday_slots, *month_slots], dtype=float)

In [66]:
def Q2(dataset):
    """Fit a linear model on the one-hot date features and report training MSE.

    Every record with a rating (``'rating'``, ``'overall'`` or ``'stars'``,
    in that order) is encoded with ``featureQ2``; records without one are
    skipped. The model has no separate intercept because the feature vector
    already carries a constant-1 column.

    Returns
    -------
    (X2, Y2, MSE2)
        The design matrix, the target vector and the mean squared error on
        the same data. With no usable records: an empty ``(0, 19)`` matrix,
        an empty vector and ``nan``.
    """
    longest = getMaxLen(dataset)

    rows, targets = [], []
    for rec in dataset or []:
        score = rec.get('rating', rec.get('overall', rec.get('stars')))
        if score is not None:
            rows.append(featureQ2(rec, longest))
            targets.append(float(score))

    if rows:
        X2 = np.vstack(rows)
    else:
        X2 = np.zeros((0, 19))
    Y2 = np.array(targets, dtype=float)

    if Y2.size == 0:
        return X2, Y2, float('nan')

    model = LinearRegression(fit_intercept=False)
    model.fit(X2, Y2)
    errors = X2 @ model.coef_ - Y2
    MSE2 = float(np.mean(errors ** 2))
    return X2, Y2, MSE2

In [67]:
### Question 3

In [68]:
def featureQ3(datum, maxLen):
    """Build the 4-dimensional Q3 feature vector for one record.

    Layout: ``[1, normalised_length, weekday, month]``, with weekday and month
    used directly as numbers rather than one-hot encoded.

    The date is ``datum['parsed_date']`` if present; otherwise the first
    truthy value among ``'date_added'``, ``'reviewTime'``, ``'review_time'``,
    ``'date'`` and ``'review_date'`` is parsed with ``_dateparser.parse``.
    If no date can be obtained, weekday defaults to 0 and month to 1.

    NOTE(review): assumes the date object exposes ``weekday()`` and ``month``
    like ``datetime.datetime`` -- confirm against how ``parsed_date`` is set.
    """
    text = str(datum.get('reviewText') or datum.get('review_text') or datum.get('text') or '')
    scaled_len = len(text) / maxLen if maxLen else 0.0

    when = datum.get('parsed_date')
    if when is None:
        raw_date = (datum.get('date_added') or datum.get('reviewTime') or
                    datum.get('review_time') or datum.get('date') or datum.get('review_date'))
        if raw_date:
            try:
                when = _dateparser.parse(str(raw_date))
            except Exception:
                # Best effort: an unparseable date is handled like a missing one.
                when = None

    # Fallback values used when the record carries no usable date.
    weekday_val, month_val = 0, 1
    if when is not None:
        weekday_val = when.weekday()
        month_val = when.month

    return np.array(
        [1.0, float(scaled_len), float(weekday_val), float(month_val)],
        dtype=float,
    )

In [69]:
def Q3(dataset):
    """Fit a linear model on the direct weekday/month features and report training MSE.

    Every record with a rating (``'rating'``, ``'overall'`` or ``'stars'``,
    in that order) is encoded with ``featureQ3``; records without one are
    skipped. The model has no separate intercept because the feature vector
    already carries a constant-1 column.

    Returns
    -------
    (X3, Y3, MSE3)
        The design matrix, the target vector and the mean squared error on
        the same data. With no usable records: an empty ``(0, 4)`` matrix,
        an empty vector and ``nan``.
    """
    longest = getMaxLen(dataset)

    rows, targets = [], []
    for rec in dataset or []:
        score = rec.get('rating', rec.get('overall', rec.get('stars')))
        if score is not None:
            rows.append(featureQ3(rec, longest))
            targets.append(float(score))

    if rows:
        X3 = np.vstack(rows)
    else:
        X3 = np.zeros((0, 4))
    Y3 = np.array(targets, dtype=float)

    if Y3.size == 0:
        return X3, Y3, float('nan')

    model = LinearRegression(fit_intercept=False)
    model.fit(X3, Y3)
    errors = X3 @ model.coef_ - Y3
    MSE3 = float(np.mean(errors ** 2))
    return X3, Y3, MSE3

In [70]:
### Question 4

In [71]:
def Q4(dataset):
    """Compare the Q2 and Q3 encodings by test MSE on a positional 50/50 split.

    The first ``len(dataset) // 2`` records form the training half and the
    rest the test half. Both models are fitted on the training half only, and
    the length normalisation constant is also taken from the training half so
    that no test information leaks into the features.

    Records without a rating (``'rating'``, ``'overall'`` or ``'stars'``, in
    that order) are dropped from whichever half they fall in, matching what
    ``Q1``/``Q2``/``Q3`` do. Previously such a record raised ``TypeError``
    from ``float(None)``. ``dataset=None`` is treated as empty.

    Returns
    -------
    (test_mse2, test_mse3) : tuple of float
        Test-set mean squared error for the ``featureQ2`` and ``featureQ3``
        models. Each is ``nan`` if the test half has no rated records. If the
        training half has no rated records the models are left unfitted and
        all-zero coefficients are used for prediction.
    """
    def _rating(rec):
        # Same lookup order as the other Q* functions.
        return rec.get('rating', rec.get('overall', rec.get('stars')))

    records = [] if dataset is None else list(dataset)
    split = len(records) // 2
    train_half, test_half = records[:split], records[split:]

    # Taken from the whole training half (rated or not), the same way Q2/Q3
    # call getMaxLen on their unfiltered dataset.
    maxLen_tr = getMaxLen(train_half)

    # Filter after splitting so the split point is the same as before for
    # datasets in which every record is rated.
    train = [d for d in train_half if _rating(d) is not None]
    test = [d for d in test_half if _rating(d) is not None]

    y_tr = np.array([float(_rating(d)) for d in train], dtype=float)
    y_te = np.array([float(_rating(d)) for d in test], dtype=float)

    def _design(rows, feat, width):
        # Stack feature rows; keep a well-shaped empty matrix when there are none.
        if not rows:
            return np.zeros((0, width))
        return np.vstack([feat(d, maxLen_tr) for d in rows])

    test_mses = []
    for feat, width in ((featureQ2, 19), (featureQ3, 4)):
        X_tr = _design(train, feat, width)
        X_te = _design(test, feat, width)

        model = LinearRegression(fit_intercept=False)
        if len(y_tr):
            model.fit(X_tr, y_tr)
        # An unfitted model has no coef_; fall back to zeros so prediction still works.
        theta = getattr(model, 'coef_', np.zeros(width))

        if len(y_te):
            test_mses.append(float(np.mean((X_te @ theta - y_te) ** 2)))
        else:
            test_mses.append(float('nan'))

    test_mse2, test_mse3 = test_mses
    return test_mse2, test_mse3

In [72]:
### Question 5

In [73]:
def featureQ5(datum):
    """Build the Q5 feature vector ``[1, text length, number of '!']``.

    The text comes from the first truthy value among ``'reviewText'``,
    ``'review_text'`` and ``'text'`` (empty string if none). The length is
    the raw character count, not normalised.
    """
    review = str(datum.get('reviewText') or datum.get('review_text') or datum.get('text') or '')
    n_chars = len(review)
    n_exclaims = review.count('!')
    return np.array([1.0, float(n_chars), float(n_exclaims)], dtype=float)

In [74]:
def Q5(dataset, feat_func):
    """Train a logistic regression on ``feat_func`` features and score it on the same data.

    The label is read from ``'label'``, ``'y'`` or ``'target'`` (in that
    order); records without one are skipped and ``feat_func`` is not called
    for them. String labels map to 1 when their lower-cased form is one of
    ``'1'``, ``'pos'``, ``'positive'``, ``'true'``, ``'yes'`` and to 0
    otherwise; other labels go through ``int()``.

    Returns
    -------
    (TP, TN, FP, FN, BER)
        Confusion-matrix counts and the balanced error rate
        ``0.5 * (FN / P + FP / N)``, where ``P`` and ``N`` are the positive
        and negative label counts, each floored at 1 to avoid dividing by
        zero. With no labelled records the result is ``(0, 0, 0, 0, nan)``.
    """
    positive_words = ('1','pos','positive','true','yes')

    rows, labels = [], []
    for rec in dataset or []:
        raw_label = rec.get('label', rec.get('y', rec.get('target')))
        if raw_label is None:
            continue
        if isinstance(raw_label, str):
            raw_label = 1 if raw_label.lower() in positive_words else 0
        # Label is converted before the features are built, as in the original order.
        labels.append(int(raw_label))
        rows.append(feat_func(rec))

    if not rows:
        return 0, 0, 0, 0, float('nan')

    X = np.vstack(rows)
    y = np.array(labels, dtype=int)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y)
    yp = clf.predict(X)

    pred_pos, pred_neg = (yp == 1), (yp == 0)
    true_pos, true_neg = (y == 1), (y == 0)

    TP = int(np.sum(pred_pos & true_pos))
    TN = int(np.sum(pred_neg & true_neg))
    FP = int(np.sum(pred_pos & true_neg))
    FN = int(np.sum(pred_neg & true_pos))

    # Floor the class sizes at 1 so a missing class cannot divide by zero.
    P = max(int(np.sum(true_pos)), 1)
    N = max(int(np.sum(true_neg)), 1)
    BER = 0.5 * ((FN / P) + (FP / N))
    return TP, TN, FP, FN, float(BER)

In [75]:
### Question 6

In [76]:
def Q6(dataset):
    """Return precision@k for k = 1..min(100, n) of a logistic model on ``featureQ5``.

    Labels are read and normalised the same way as in ``Q5`` (keys
    ``'label'``, ``'y'``, ``'target'``; string labels mapped to 0/1).
    The model is trained and ranked on the same data: records are sorted by
    descending score (probability of class index 1 when ``predict_proba``
    exists, otherwise ``decision_function``), and entry ``k-1`` of the result
    is the fraction of label-1 records among the top ``k``.

    Returns an empty list when no record carries a label.
    """
    positive_words = ('1','pos','positive','true','yes')

    rows, labels = [], []
    for rec in dataset or []:
        raw_label = rec.get('label', rec.get('y', rec.get('target')))
        if raw_label is None:
            continue
        if isinstance(raw_label, str):
            raw_label = 1 if raw_label.lower() in positive_words else 0
        labels.append(int(raw_label))
        rows.append(featureQ5(rec))

    if not rows:
        return []

    X = np.vstack(rows)
    y = np.array(labels, dtype=int)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y)
    if hasattr(clf,'predict_proba'):
        scores = clf.predict_proba(X)[:,1]
    else:
        scores = clf.decision_function(X)

    # Rank by descending score and keep at most the first 100 labels.
    ranking = np.argsort(-scores)
    top_labels = y[ranking][:100]

    precs = []
    hits = 0
    for rank, lab in enumerate(top_labels, start=1):
        if lab == 1:
            hits += 1
        precs.append(hits / rank)
    return precs

In [77]:
### Question 7

In [78]:
def featureQ7(datum):
    """Build the 9-dimensional Q7 feature vector from simple text statistics.

    Layout::

        [1, length, '!' count, '?' count,
         positive-word count, negative-word count, positive - negative,
         digit count, uppercase ratio]

    Words are whitespace-separated tokens with surrounding punctuation
    stripped and lower-cased, matched against two small fixed lexicons.
    The uppercase ratio is (uppercase alphabetic characters) / (1 + length),
    so an empty text gives 0 rather than a division by zero.
    """
    text = str(datum.get('reviewText') or datum.get('review_text') or datum.get('text') or '')

    positive_words = {"good","great","excellent","amazing","love","loved","awesome",
                      "fantastic","perfect","best","wonderful","favorite","happy"}
    negative_words = {"bad","terrible","awful","hate","hated","worst","poor",
                      "disappointing","boring","broken","sad","angry"}

    strip_chars = ".,!?;:()[]{}'\""
    # str.split() never yields empty strings, so no extra filtering is needed.
    words = [w.strip(strip_chars).lower() for w in text.split()]

    pos_cnt = float(len([w for w in words if w in positive_words]))
    neg_cnt = float(len([w for w in words if w in negative_words]))
    balance = pos_cnt - neg_cnt

    # One pass over the characters for both character-level counts.
    n_upper = 0
    n_digits = 0
    for ch in text:
        if ch.isalpha() and ch.isupper():
            n_upper += 1
        if ch.isdigit():
            n_digits += 1
    caps_ratio = n_upper / (1.0 + len(text))

    features = [
        1.0,
        float(len(text)),
        float(text.count('!')),
        float(text.count('?')),
        pos_cnt,
        neg_cnt,
        balance,
        float(n_digits),
        float(caps_ratio),
    ]
    return np.array(features, dtype=float)