Homework 4: Sentiment Analysis - Task 4
----

Names
----
Names: __YOUR NAMES HERE__ (Write these in every notebook you submit)

Task 4: Neural Networks (20 points)
----

Next, we'll train a feedforward neural net to work with this data. You'll train one neural net which takes the same input as your Logistic Regression model - a sparse vector representing documents as bags of words.

Take a look at these videos to understand forward and backward propagation in neural networks - 
* https://www.youtube.com/watch?v=HHbjpDHcJVw
* https://youtu.be/-Lavz_I4l2U?si=zi20DB3qKPLMEPt1

You will be implementing **binarized** (presence/absence of word) and **multinomial** (counts of word) BoW representations of your data
  
**10 points in Task 5 will be allocated for all 9 graphs (including the one generated here in Task 4 for Neural Networks) being:**
- Legible
- Present below
- Properly labeled
     - x and y axes labeled
     - Legend for accuracy measures plotted
     - Plot Title with which model and run number the graph represents

In [13]:
import sentiment_utils as sutils
import numpy as np

from keras.models import Sequential
from keras.layers import Dense

# you can experiment with having some Dropout layers if you'd like to
# this is not required
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD

# if you want to use this again
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer


Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


AttributeError: `np.complex_` was removed in the NumPy 2.0 release. Use `np.complex128` instead.

In [None]:
# define constants for the files we are using
TRAIN_FILE = "movie_reviews_train.txt"
DEV_FILE = "movie_reviews_dev.txt"

# load in your data and make sure you understand the format
# Do not print out too much so as to impede readability of your notebook
# Each call is unpacked into two parallel values: the documents (X) and
# their labels (y).
# NOTE(review): presumably each document is a sequence of string tokens,
# because vectorize() re-joins documents with " ".join — confirm against
# sentiment_utils.generate_tuples_from_file.
train_X, train_y = sutils.generate_tuples_from_file(TRAIN_FILE)
dev_X,   dev_y   = sutils.generate_tuples_from_file(DEV_FILE)

# you may use either your sparse vectors or sklearn's CountVectorizer's sparse vectors
# you will experiment with multinomial and binarized representations later

# Build the vocabulary index from the training set (only needed when the
# custom, non-CountVectorizer vectorization path is used).
vocab = sutils.create_index(train_X, min_freq=1)

def vectorize(use_count_vectorizer: bool = True, binary: bool = False, sub_X=None):
    """
    Turn the training documents and the dev documents into bag-of-words matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the module-level ``dev_X``.

    Args:
        use_count_vectorizer: when True, use sklearn's ``CountVectorizer``;
            otherwise use ``sutils.featurize`` (with the module-level
            ``vocab``) followed by a ``DictVectorizer``.
        binary: False -> multinomial (counts); True -> binarized (presence = 1).
        sub_X: training documents to fit on; ``None`` means the full
            module-level ``train_X``.

    Returns:
        ``(X_train, X_dev, vectorizer)`` where the first two items are the
        matrices produced by the fitted vectorizer (third item).
    """
    fit_docs = train_X if sub_X is None else sub_X

    if not use_count_vectorizer:
        # Optional custom path: featurize with the helpers in
        # sentiment_utils.py, then let DictVectorizer assign the columns.
        fit_feats = sutils.featurize(vocab, fit_docs, binary=binary)
        dev_feats = sutils.featurize(vocab, dev_X, binary=binary)
        dict_vec = DictVectorizer(sparse=True)
        fit_matrix = dict_vec.fit_transform(fit_feats)
        dev_matrix = dict_vec.transform(dev_feats)
        return fit_matrix, dev_matrix, dict_vec

    # CountVectorizer works on strings, so glue each document's tokens back
    # together with single spaces.
    fit_text = list(map(" ".join, fit_docs))
    dev_text = list(map(" ".join, dev_X))
    count_vec = CountVectorizer(binary=binary)
    fit_matrix = count_vec.fit_transform(fit_text)
    dev_matrix = count_vec.transform(dev_text)
    return fit_matrix, dev_matrix, count_vec

In [8]:
# Create a feedforward neural network model
# that takes a sparse BoW representation of the data as input
# and makes a binary classification of positive/negative sentiment as output
# you may use any number of hidden layers >= 1 and any number of units in each hidden layer (we recommend between 50-200)
# you may use any activation function on the hidden layers 
# you should use a sigmoid activation function on the output layer
# you should use binary cross-entropy as your loss function
# sgd is an appropriate optimizer for this task
# you should report accuracy as your metric
# you may add Dropout layers if you'd like to

# create/compile your model in this cell

def build_model(input_dim: int,
                hidden_units: int = 128,
                hidden_layers: int = 1,
                dropout: float = 0.2,
                learning_rate: float = 0.05,
                momentum: float = 0.9):
    """
    Build and compile a feedforward binary classifier.

    Architecture:
    - hidden layers: ReLU ``Dense`` layers (default: one layer of 128 units),
      each optionally followed by ``Dropout``
    - output layer: a single sigmoid unit (binary classification)
    - loss: binary_crossentropy
    - optimizer: SGD with momentum
    - metric: accuracy

    Args:
        input_dim: number of input features (the bag-of-words vocabulary size).
        hidden_units: number of units in every hidden layer.
        hidden_layers: number of hidden layers; values below 1 still produce
            one hidden layer, as the network always has at least one.
        dropout: dropout rate applied after each hidden layer; ``0``/``None``
            disables dropout.
        learning_rate: SGD learning rate (default keeps the previous
            hard-coded value of 0.05).
        momentum: SGD momentum (default keeps the previous hard-coded 0.9).

    Returns:
        A freshly initialised, compiled ``Sequential`` model.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    from keras.layers import Input

    use_dropout = bool(dropout) and dropout > 0

    model = Sequential()
    # Declare the input with an explicit Input layer; passing `input_shape`
    # to Dense is deprecated in Keras 3 and triggers a warning.
    model.add(Input(shape=(input_dim,)))
    # At least one hidden layer is always created, matching the original
    # "first layer + (hidden_layers - 1) extra layers" structure.
    for _ in range(max(1, hidden_layers)):
        model.add(Dense(hidden_units, activation='relu'))
        if use_dropout:
            model.add(Dropout(dropout))
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    # Compile
    model.compile(
        loss='binary_crossentropy',
        optimizer=SGD(learning_rate=learning_rate, momentum=momentum),
        metrics=['accuracy']
    )
    return model

# Demo: use the bag-of-words dimensionality of the full training set to show
# the model summary (the model is rebuilt again before the real training).
_tmp_Xtr, _tmp_Xdv, _tmp_vec = vectorize(use_count_vectorizer=True, binary=False, sub_X=train_X)
model = build_model(input_dim=_tmp_Xtr.shape[1])


# put in an output layer
# (build_model already appends the sigmoid output layer)


model.summary()
# call compile here
# (build_model already compiles the model, so nothing more is needed)


NameError: name 'CountVectorizer' is not defined

How many trainable parameters does your model have? __YOUR ANSWER HERE__

In [1]:
# train your model
# Single training + evaluation run (defaults: multinomial counts via CountVectorizer).
# Tunable switches:
USE_COUNTVECT = True   # True = CountVectorizer; False = custom DictVectorizer path
BINARY_FEATS = False   # False = multinomial (counts); True = binarized (presence = 1)
EPOCHS = 8
BATCH_SIZE = 32

# (Optional) sub-sample the training set; 100 means "use all of the training data".
PERCENT = 100
sub_train_X, sub_train_y = sutils.take_percent(
    train_X,
    train_y,
    pct=PERCENT,
    shuffle=True,
    seed=1,
)

# Vectorize: fit on the (sub-sampled) training documents, then transform dev.
X_tr, X_dv, vec = vectorize(
    use_count_vectorizer=USE_COUNTVECT,
    binary=BINARY_FEATS,
    sub_X=sub_train_X,
)

# Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})
# indicates you should change a list into a numpy array

# Keras training needs ndarrays: densify both sparse matrices as float32 and
# turn the label list into a float32 ndarray.
X_tr_arr, X_dv_arr = (
    sparse_matrix.astype('float32').toarray() for sparse_matrix in (X_tr, X_dv)
)
y_tr_arr = np.array(sub_train_y, dtype='float32')

In [None]:
# make a prediction on the dev set
# then make a classification decision based on that prediction

# Rebuild the model first (important: every experiment must start from an
# untrained network).
model = build_model(
    input_dim=X_tr_arr.shape[1],
    hidden_units=128,
    hidden_layers=1,
    dropout=0.2,
)

# Train
history = model.fit(
    X_tr_arr,
    y_tr_arr,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=0,
)

# Predict, then threshold the sigmoid outputs at 0.5 to get 0/1 decisions.
dev_probs = model.predict(X_dv_arr, verbose=0).reshape(-1)
dev_preds = [int(prob >= 0.5) for prob in dev_probs]

# Evaluate
prec, rec, f1, acc = sutils.get_prfa(dev_y, dev_preds, verbose=True)
print(f"[Dev] P={prec:.4f} R={rec:.4f} F1={f1:.4f} Acc={acc:.4f}")

In [None]:
# use the model.evaluate function to report the loss and accuracy on the dev set
# Report the dev-set loss/accuracy via model.evaluate; the dev labels are
# converted to a float32 ndarray first, like the training labels were.
eval_loss, eval_acc = model.evaluate(X_dv_arr, np.array(dev_y, dtype='float32'), verbose=0)
print(f"[Dev] Keras evaluate -> loss={eval_loss:.4f} acc={eval_acc:.4f}")

# Parameter count of the model (answers the Task 4 question).
# NOTE(review): count_params() counts every weight in the model, not only the
# trainable ones; presumably the two are equal here because build_model only
# stacks Dense and Dropout layers — confirm with model.summary().
print("Trainable params:", model.count_params())

<span style="color: red;">__Expected Behavior__ </span>

**Neural Networks**:
Neural networks initialize their weights randomly and learn through iterative stochastic optimization, which introduces non-determinism by design. Even with the same data and parameters, different runs may lead to slightly different learned weights and therefore different graphs. In this case, variation between your three runs is expected and desired: it shows how the model’s training process can vary due to randomness.

<span style="color: red;">__Note on Training Data Increments__ </span>

When varying the amount of training data, choose increments that are meaningful and reasonable, so that you can observe clear trends without making the experiment unnecessarily long. You may increment the training data percentage by **5%**, **10%**, or **20%**.

**Make sure that one of your experiments includes 10% of the training data, as you will need this result to answer a question in Task 5.**

In [2]:
# create the same graph as with NB and LR, with your neural network model instead!
# make sure to re-create your model each time you train it — you don't want to start with
# an already trained network!

# you should experiment with different numbers of epochs to see how performance varies
# you need not create an experiment that takes > 10 min to run (gradescope will run out of computing resources and give you a 0)
# Generate the same four-metric curve plot as for NB/LR
import matplotlib.pyplot as plt

def train_eval_nn_for_percent(percent: int,
                              use_count_vectorizer: bool,
                              binary_feats: bool,
                              epochs: int,
                              batch_size: int,
                              run_id: int = 1):
    """
    Train a fresh network on ``percent`` % of the training data and score it on dev.

    A new model is built on every call so that no weights carry over from a
    previous run.

    Args:
        percent: percentage of the training data passed to ``sutils.take_percent``.
        use_count_vectorizer: forwarded to ``vectorize``.
        binary_feats: forwarded to ``vectorize`` as ``binary``.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``fit``.
        run_id: used as the ``seed`` of ``sutils.take_percent``.

    Returns:
        Whatever ``sutils.get_prfa`` returns for the dev predictions, i.e.
        (P, R, F1, Acc).
    """
    subset_docs, subset_labels = sutils.take_percent(
        train_X, train_y, pct=percent, shuffle=True, seed=run_id
    )
    train_sparse, dev_sparse, _ = vectorize(
        use_count_vectorizer=use_count_vectorizer,
        binary=binary_feats,
        sub_X=subset_docs,
    )

    # Keras is fed dense float32 arrays.
    train_dense = train_sparse.astype('float32').toarray()
    dev_dense = dev_sparse.astype('float32').toarray()
    label_arr = np.array(subset_labels, dtype='float32')

    net = build_model(
        input_dim=train_dense.shape[1],
        hidden_units=128,
        hidden_layers=1,
        dropout=0.2,
    )
    net.fit(train_dense, label_arr, batch_size=batch_size, epochs=epochs, verbose=0)

    # Threshold the sigmoid outputs at 0.5 to obtain 0/1 decisions.
    dev_scores = net.predict(dev_dense, verbose=0).reshape(-1)
    decisions = [int(score >= 0.5) for score in dev_scores]
    return sutils.get_prfa(dev_y, decisions, verbose=False)  # (P,R,F1,Acc)

# —— Experiment configuration (edit as needed)
USE_COUNTVECT = True   # True = CountVectorizer; False = custom vectorization
BINARY_FEATS  = False  # False = multinomial; True = binarized
EPOCHS        = 8
BATCH_SIZE    = 32
RUN_ID        = 1      # remember to repeat with 1 / 2 / 3 (also used as the sub-sampling seed)

percents = [10, 20, 40, 60, 80, 100]  # includes 10%, as required by Task 5
# The title names the model, the feature representation and the run number.
title = f"Neural Net ({'CountVectorizer' if USE_COUNTVECT else 'Custom'} | {'Binarized' if BINARY_FEATS else 'Multinomial'}) — Run {RUN_ID}"
savepath = f"NeuralNet_{'cv' if USE_COUNTVECT else 'custom'}_{'bin' if BINARY_FEATS else 'multi'}_run{RUN_ID}.png"

# metrics_fun is called with a training percentage and trains a brand-new
# network for it; the lambda reads the configuration globals at call time.
sutils.create_training_graph(
    metrics_fun=lambda p: train_eval_nn_for_percent(
        p, use_count_vectorizer=USE_COUNTVECT, binary_feats=BINARY_FEATS,
        epochs=EPOCHS, batch_size=BATCH_SIZE, run_id=RUN_ID
    ),
    percents=percents,
    title=title,
    savepath=savepath,
    verbose=False
)
print("Saved:", savepath)



Report the f1 scores for your model with the following settings, using the same number of epochs to train in both cases:
- number of epochs used: __YOUR ANSWER HERE__
- multinomial features: __YOUR ANSWER HERE__ 
- binarized features: __YOUR ANSWER HERE__

In [18]:
# -*- coding: utf-8 -*-
# Task 4: Feedforward Neural Network on BoW
import numpy as np
import sentiment_utils as sutils

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

# from keras.models import Sequential
from tensorflow.keras.models import Sequential
# from keras.layers import Dense, Dropout
from tensorflow.keras.layers import Dense, Dropout
# from keras.optimizers import SGD
from tensorflow.keras.optimizers import SGD

# Input files for the training and dev splits.
TRAIN_FILE = "movie_reviews_train.txt"
DEV_FILE   = "movie_reviews_dev.txt"

# Each call is unpacked into the documents (X) and their labels (y).
train_X, train_y = sutils.generate_tuples_from_file(TRAIN_FILE)
dev_X,   dev_y   = sutils.generate_tuples_from_file(DEV_FILE)

# Vocabulary index built from the full training set; only the custom
# (non-CountVectorizer) branch of vectorize() reads it.
vocab = sutils.create_index(train_X, min_freq=1)

def vectorize(use_cv: bool, binary: bool, sub_X=None):
    """
    Turn the training documents and the dev documents into bag-of-words matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the module-level ``dev_X``.

    Args:
        use_cv: when True, use sklearn's ``CountVectorizer``; otherwise use
            ``sutils.featurize`` (with the module-level ``vocab``) followed
            by a ``DictVectorizer``.
        binary: False -> multinomial (counts); True -> binarized (presence = 1).
        sub_X: training documents to fit on; ``None`` means the full
            module-level ``train_X``.

    Returns:
        ``(Xtr, Xdv, vectorizer)`` where ``vectorizer`` is the fitted
        vectorizer that produced both matrices.
    """
    if sub_X is None:
        sub_X = train_X

    if use_cv:
        # CountVectorizer works on strings, so re-join each document's tokens.
        docs_train = [" ".join(t) for t in sub_X]
        docs_dev = [" ".join(t) for t in dev_X]
        vec = CountVectorizer(binary=binary)
        Xtr = vec.fit_transform(docs_train)
        # The fitted vectorizer already holds the training vocabulary, so it
        # transforms dev directly; no second vectorizer is needed.
        Xdv = vec.transform(docs_dev)
        return Xtr, Xdv, vec

    feats_tr = sutils.featurize(vocab, sub_X, binary=binary)
    feats_dv = sutils.featurize(vocab, dev_X, binary=binary)
    dv = DictVectorizer(sparse=True)
    Xtr = dv.fit_transform(feats_tr)
    Xdv = dv.transform(feats_dv)
    return Xtr, Xdv, dv

def build_model(input_dim: int, hidden_units: int = 128, hidden_layers: int = 1, dropout: float = 0.2,
                learning_rate: float = 0.05, momentum: float = 0.9):
    """
    Build and compile a feedforward binary classifier.

    The network is a stack of ReLU ``Dense`` hidden layers (each optionally
    followed by ``Dropout``) and a single sigmoid output unit, compiled with
    binary cross-entropy loss, SGD with momentum, and the accuracy metric.

    Args:
        input_dim: number of input features (the bag-of-words vocabulary size).
        hidden_units: number of units in every hidden layer.
        hidden_layers: number of hidden layers; values below 1 still produce
            one hidden layer, as the network always has at least one.
        dropout: dropout rate applied after each hidden layer; ``0``/``None``
            disables dropout.
        learning_rate: SGD learning rate (default keeps the previous
            hard-coded value of 0.05).
        momentum: SGD momentum (default keeps the previous hard-coded 0.9).

    Returns:
        A freshly initialised, compiled ``Sequential`` model.
    """
    # Local import from the same package this cell imports Dense/Dropout
    # from, so the cell's import lines need no change.
    from tensorflow.keras.layers import Input

    # None-safe check so a disabled dropout can be spelled as 0 or None.
    use_dropout = bool(dropout) and dropout > 0

    model = Sequential()
    # Declare the input with an explicit Input layer; passing `input_shape`
    # to Dense is deprecated in Keras 3 and triggers a warning.
    model.add(Input(shape=(input_dim,)))
    # At least one hidden layer is always created, matching the original
    # "first layer + (hidden_layers - 1) extra layers" structure.
    for _ in range(max(1, hidden_layers)):
        model.add(Dense(hidden_units, activation='relu'))
        if use_dropout:
            model.add(Dropout(dropout))
    # Output
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        loss='binary_crossentropy',
        optimizer=SGD(learning_rate=learning_rate, momentum=momentum),
        metrics=['accuracy'],
    )
    return model

def train_eval_nn(percent: int, use_cv: bool, binary: bool, epochs: int, batch_size: int, run_id: int = 1):
    """
    Train a fresh network on ``percent`` % of the training data and score it on dev.

    Args:
        percent: percentage of the training data passed to ``sutils.take_percent``.
        use_cv: forwarded to ``vectorize``.
        binary: forwarded to ``vectorize``.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``fit``.
        run_id: used as the ``seed`` of ``sutils.take_percent``.

    Returns:
        ``(precision, recall, f1, accuracy, param_count)`` where the first
        four values come from ``sutils.get_prfa`` and the last one from
        ``model.count_params()``.
    """
    subset_docs, subset_labels = sutils.take_percent(
        train_X, train_y, percent, shuffle=True, seed=run_id
    )
    train_sparse, dev_sparse, _ = vectorize(use_cv=use_cv, binary=binary, sub_X=subset_docs)

    # A new model per call, sized to this run's vocabulary.
    net = build_model(
        input_dim=train_sparse.shape[1],
        hidden_units=128,
        hidden_layers=1,
        dropout=0.2,
    )

    # Keras is fed dense float32 arrays, so densify the sparse matrices.
    train_dense = train_sparse.astype('float32').toarray()
    dev_dense = dev_sparse.astype('float32').toarray()
    label_arr = np.array(subset_labels, dtype='float32')

    net.fit(train_dense, label_arr, batch_size=batch_size, epochs=epochs, verbose=0)

    # Threshold the sigmoid outputs at 0.5 to obtain 0/1 decisions.
    dev_scores = net.predict(dev_dense, verbose=0).reshape(-1)
    decisions = [int(score >= 0.5) for score in dev_scores]
    precision, recall, f_score, accuracy = sutils.get_prfa(dev_y, decisions, verbose=False)
    return precision, recall, f_score, accuracy, net.count_params()

def plot_runs_nn(use_cv: bool, binary: bool, epochs: int, batch_size: int, run_id: int, percents=None):
    """
    Draw and save the training-size graph for one neural-network run.

    Args:
        use_cv: forwarded to ``train_eval_nn``; also selects the
            "CountVectorizer"/"Custom" label in the title and file name.
        binary: forwarded to ``train_eval_nn``; also selects the
            "Binarized"/"Multinomial" label in the title and file name.
        epochs: number of training epochs per data point.
        batch_size: mini-batch size per data point.
        run_id: run number shown in the title and file name, and forwarded to
            ``train_eval_nn``.
        percents: training percentages to plot; defaults to
            ``[10, 20, 40, 60, 80, 100]``.
    """
    plot_percents = [10, 20, 40, 60, 80, 100] if percents is None else percents

    def metrics_for(pct):
        # create_training_graph only needs the four scores, so the trailing
        # parameter count returned by train_eval_nn is sliced off.
        outcome = train_eval_nn(
            pct,
            use_cv=use_cv,
            binary=binary,
            epochs=epochs,
            batch_size=batch_size,
            run_id=run_id,
        )
        return outcome[:4]

    title = f"Neural Net ({'CountVectorizer' if use_cv else 'Custom'} | {'Binarized' if binary else 'Multinomial'}) — Run {run_id}"
    savepath = f"NeuralNet_{'cv' if use_cv else 'custom'}_{'bin' if binary else 'multi'}_run{run_id}.png"
    sutils.create_training_graph(
        metrics_fun=metrics_for,
        percents=plot_percents,
        title=title,
        savepath=savepath,
    )

# Script-style driver: one full-data run for the reported numbers, then the
# three required training-size graphs.
if __name__ == "__main__":
    # Example default: multinomial with CountVectorizer
    epochs = 8
    batch = 32
    # Report parameter count once @100%
    # (train_eval_nn returns precision, recall, f1, accuracy, param_count;
    # only f1 and the parameter count are kept here)
    _,_,f1,_, params = train_eval_nn(100, use_cv=True, binary=False, epochs=epochs, batch_size=batch, run_id=1)
    print(f"Model trainable parameters: {params}")
    print(f"F1 (epochs={epochs}) multinomial: {f1:.4f}")

    # One graph per run; run_id is shown in the title and used as the
    # sub-sampling seed inside train_eval_nn.
    for run in (1,2,3):
        plot_runs_nn(use_cv=True, binary=False, epochs=epochs, batch_size=batch, run_id=run)


ModuleNotFoundError: No module named 'tensorflow.keras'