In [1]:
import re

import pandas as pd
# Load the three tab-separated review files into DataFrames.
# quoting=3 is csv.QUOTE_NONE: quote characters inside the review text are
# treated as ordinary characters instead of field delimiters.
train = pd.read_csv( "data/labeledTrainData.tsv", delimiter="\t", quoting=3 )
test = pd.read_csv( "data/testData.tsv", delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "data/unlabeledTrainData.tsv",delimiter="\t", quoting=3 )

# Report how many reviews each file contributed.
print(f"Read {train['review'].size} labeled train reviews, {test['review'].size} labeled test reviews, and {unlabeled_train['review'].size} unlabeled reviews")

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews


训练 word2vec 时不移除停用词：保留完整的上下文有助于模型学习更广泛的词间关系特征。

In [23]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
_ENGLISH_STOPWORDS = None  # lazily-built cache of NLTK's English stop-word set


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or one sentence of it) into a list of lowercase words.

    Args:
        review: raw review text; may contain HTML markup.
        remove_stopwords: when True, drop NLTK's English stop words from the
            result. Defaults to False.

    Returns:
        list[str]: the lowercase words of the text, in their original order.
    """
    # Strip HTML markup and keep only the text content.
    review_text = BeautifulSoup(review, "lxml").text
    # Replace every character that is not an ASCII letter (digits, punctuation,
    # ...) with a space, so such characters act as word separators.
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)
    # Lowercase, then split on whitespace.
    words = review_text.lower().split()
    if remove_stopwords:
        # The stop-word set is cached: this function runs once per review, and
        # rebuilding the set from the corpus on every call is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

word2vec 期望的输入是一个个独立的句子（每个句子是一个词列表），所以要先把段落拆分成句子。如何判断句子在哪里结束？这里使用 NLTK 的 punkt 句子分割器。

In [3]:
import nltk.data
# Download the NLTK "punkt" and "punkt_tab" tokenizer packages; the punkt
# sentence tokenizer is loaded from this data in a later cell.
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /Users/wangyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/wangyu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [24]:
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences( review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and return each sentence as a word list.

    Args:
        review: raw review text.
        tokenizer: sentence tokenizer exposing ``tokenize(text)``.
        remove_stopwords: forwarded to ``review_to_wordlist``.

    Returns:
        list[list[str]]: one word list per non-empty sentence.
    """
    # Trim leading/trailing whitespace before handing the text to the
    # sentence tokenizer.
    pieces = tokenizer.tokenize(review.strip())
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [5]:
# Flat list of tokenized sentences gathered from the labeled and the
# unlabeled training reviews (in that order).
sentences = []

# extend() splices each review's sentences into the flat list; append() would
# instead nest the whole per-review list as a single element.
for corpus in (train['review'], unlabeled_train['review']):
    for review in corpus:
        sentences.extend(review_to_sentences(review, tokenizer))



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  review_text = BeautifulSoup(review, "lxml").text


In [6]:
len(sentences)

796172

In [7]:
sentences[0]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

In [8]:
# Configure INFO-level logging with timestamps for the training run below.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count: words seen fewer than min_count times are dropped (saves memory, reduces noise)
num_workers = 4       # number of worker threads run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsampling threshold for frequent words (the `sample` parameter): occurrences of very frequent words are randomly dropped so they dominate less

from gensim.models import word2vec
# Train the model on the tokenized sentences with the hyperparameters above.
model = word2vec.Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)
# With negative sampling (presumably gensim's default here, since no training-mode
# arguments are overridden -- verify), suppose the sampled negatives are "dog"
# and "apple"; the objective is then to maximize:
# log σ(v_cat·v_say) + log σ(v_meow·v_say) + log σ(-v_dog·v_say) + log σ(-v_apple·v_say)

2025-10-18 09:45:25,550 : INFO : collecting all words and their counts
2025-10-18 09:45:25,551 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-10-18 09:45:25,567 : INFO : PROGRESS: at sentence #10000, processed 225664 words, keeping 17775 word types
2025-10-18 09:45:25,585 : INFO : PROGRESS: at sentence #20000, processed 451738 words, keeping 24945 word types
2025-10-18 09:45:25,602 : INFO : PROGRESS: at sentence #30000, processed 670859 words, keeping 30027 word types
2025-10-18 09:45:25,620 : INFO : PROGRESS: at sentence #40000, processed 896841 words, keeping 34335 word types
2025-10-18 09:45:25,638 : INFO : PROGRESS: at sentence #50000, processed 1116082 words, keeping 37751 word types
2025-10-18 09:45:25,655 : INFO : PROGRESS: at sentence #60000, processed 1337544 words, keeping 40711 word types
2025-10-18 09:45:25,672 : INFO : PROGRESS: at sentence #70000, processed 1560307 words, keeping 43311 word types
2025-10-18 09:45:25,689 : INFO : PROGRESS: 

In [9]:
# The older approach normalized the word vectors and passed replace=True to
# save memory by overwriting the raw vectors; that approach is deprecated.
# Normalization makes similarity scores reflect meaning rather than word
# frequency or vector length.
model.wv.fill_norms()  # compute and cache the L2 norm of every word vector

In [10]:
# Save the trained model to disk. The file name records the hyperparameters
# used: 300 features, minimum word count 40, context window 10.
model_name = "300features_40minwords_10context.model"
model.save(model_name)


2025-10-18 09:45:58,814 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-10-18T09:45:58.814170', 'gensim': '4.3.2', 'python': '3.10.18 (main, Jun  5 2025, 08:37:47) [Clang 14.0.6 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'saving'}
2025-10-18 09:45:58,814 : INFO : not storing attribute cum_table
2025-10-18 09:45:58,832 : INFO : saved 300features_40minwords_10context.model


In [11]:
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [12]:
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [13]:
model.wv.doesnt_match("paris berlin london austria".split())

'austria'

In [14]:
model.wv.most_similar("man")

[('woman', 0.5995623469352722),
 ('lady', 0.5918226838111877),
 ('lad', 0.5749083161354065),
 ('men', 0.5204932689666748),
 ('chap', 0.5144463181495667),
 ('guy', 0.5101121068000793),
 ('bloke', 0.507328450679779),
 ('doctor', 0.4993882477283478),
 ('farmer', 0.49911344051361084),
 ('monk', 0.49789494276046753)]

In [15]:
model.wv.most_similar("queen")

[('princess', 0.6741946339607239),
 ('bride', 0.6582250595092773),
 ('victoria', 0.6034635305404663),
 ('latifah', 0.603192925453186),
 ('showgirl', 0.5710378885269165),
 ('stepmother', 0.5692368745803833),
 ('maid', 0.5689412355422974),
 ('prince', 0.5601257085800171),
 ('mistress', 0.5599981546401978),
 ('belle', 0.5588557720184326)]

In [16]:
model.wv.most_similar("awful")

[('terrible', 0.760650634765625),
 ('horrible', 0.7319118976593018),
 ('atrocious', 0.7287877202033997),
 ('abysmal', 0.7120822668075562),
 ('dreadful', 0.6972820162773132),
 ('appalling', 0.6770805716514587),
 ('horrendous', 0.6694055199623108),
 ('horrid', 0.6627129912376404),
 ('lousy', 0.6256157159805298),
 ('amateurish', 0.5981056094169617)]

## part_3

In [17]:
from gensim.models import Word2Vec
# Load a Word2Vec model from the given file; this rebinds `model`.
model = Word2Vec.load("300features_40minwords_10context.model")

2025-10-18 09:45:59,592 : INFO : loading Word2Vec object from 300features_40minwords_10context.model
2025-10-18 09:45:59,608 : INFO : loading wv recursively from 300features_40minwords_10context.model.wv.* with mmap=None
2025-10-18 09:45:59,612 : INFO : setting ignored attribute cum_table to None
2025-10-18 09:45:59,691 : INFO : Word2Vec lifecycle event {'fname': '300features_40minwords_10context.model', 'datetime': '2025-10-18T09:45:59.691445', 'gensim': '4.3.2', 'python': '3.10.18 (main, Jun  5 2025, 08:37:47) [Clang 14.0.6 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'loaded'}


In [18]:
model.wv.vectors

array([[-0.46203578, -1.1892741 ,  0.41406223, ...,  1.8321238 ,
        -0.32884738,  1.3248438 ],
       [-0.64237416, -0.9830256 ,  0.7323948 , ...,  0.28813797,
        -0.59826034, -0.01477962],
       [ 2.3623862 , -0.33042914, -0.33293054, ...,  0.11848633,
         0.7027185 ,  0.8033076 ],
       ...,
       [ 0.2277323 ,  0.17956766,  0.073678  , ..., -0.09059525,
         0.10255343, -0.00902587],
       [-0.03246737,  0.10530712, -0.09102537, ...,  0.1781935 ,
         0.01185587, -0.08071371],
       [-0.12659827,  0.0107701 , -0.17256647, ..., -0.18027577,
        -0.17464   , -0.10304833]], dtype=float32)

In [19]:
model.wv.vectors.shape

(16490, 300)

In [20]:
type(model.wv.vectors)

numpy.ndarray

In [22]:
model.wv["flower"]

array([ 2.45072499e-01,  2.62092233e-01,  1.96153913e-02,  2.21510842e-01,
       -3.40615883e-02,  2.07704619e-01, -2.44022463e-03,  3.67342085e-01,
        4.79519516e-01, -5.22090137e-01,  1.62662044e-01,  4.31865007e-01,
        1.58299997e-01, -8.22339579e-02, -2.61161238e-01,  5.76466545e-02,
       -2.00415507e-01, -5.80791295e-01, -4.38209385e-01, -1.05509900e-01,
        1.09054632e-01,  9.70642269e-02,  2.02213839e-01,  1.14739798e-01,
        4.44840193e-01, -4.28412156e-03,  1.56124271e-02, -1.40784860e-01,
       -1.11332096e-01,  9.98667181e-02,  4.90062475e-01, -5.52447028e-02,
       -1.60927176e-01, -3.06963831e-01, -2.41093040e-01, -2.55013764e-01,
        3.18381071e-01, -7.32264280e-01,  1.98628649e-01, -1.94106191e-01,
        1.91836730e-02,  1.97823420e-01,  3.75492662e-01,  2.20785379e-01,
       -3.21409702e-01, -1.26897497e-02,  2.29085654e-01,  4.98462379e-01,
        5.01539767e-01,  4.51061308e-01,  5.84751070e-01,  3.36822242e-01,
       -3.34707379e-01,  

In [33]:
import numpy as np
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0
    # 转换为集合提高速度
    index2word_set = set(model.wv.index_to_key)
    # 加和取平均
    for word in words:
        if word in index2word_set:
            nwords+=1
            featureVec = np.add(featureVec,model.wv[word])

    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Args:
        reviews: sequence of tokenized reviews (each a list of words).
        model: trained Word2Vec model, passed through to ``makeFeatureVec``.
        num_features: dimensionality of the word vectors.

    Returns:
        float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    # Preallocate the 2-D result; row i receives the average vector of review i.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    for row, review in enumerate(reviews):
        # Progress message every 1000 reviews.
        if row % 1000 == 0:
            print(f"Review {row} of {total}")
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [34]:

# Tokenize the training reviews with stop words removed, then average the
# word vectors of each review.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

# Same two steps for the test reviews.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 1900

In [35]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest, and therefore the submission
# file, reproducible across reruns of this cell.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])

# Test & extract results
result = forest.predict(testDataVecs)

# Write the test results (quoting=3 is csv.QUOTE_NONE, matching how the input
# files were read)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)
# Possible improvement: weight the averaged vectors by TF-IDF.

Fitting a random forest to labeled training data...


k-means聚类

In [37]:
from sklearn.cluster import KMeans
import time

start = time.time()

# One row per vocabulary word.
word_vectors = model.wv.vectors
# Number of clusters = vocabulary size / 5, i.e. about 5 words per cluster on
# average (integer division).
num_clusters = word_vectors.shape[0] // 5

# A fixed random_state makes the cluster assignments (and every feature built
# from them) reproducible across reruns of this cell.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=42)
# idx[i] is the cluster index assigned to row i of word_vectors.
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  45.873730182647705 seconds.


In [38]:
# Map each vocabulary word to its cluster index by pairing index_to_key with
# idx position by position (assumes index_to_key is in the same order as the
# rows of model.wv.vectors that were clustered -- verify).
word_centroid_map = dict(zip( model.wv.index_to_key, idx ))

In [40]:
# Show the member words of the first 10 clusters
for cluster in range(10):
    print (f"Cluster {cluster}")
    # Collect every word whose assigned cluster index equals this cluster.
    members = [
        word
        for word, cluster_id in word_centroid_map.items()
        if cluster_id == cluster
    ]
    print(members)

Cluster 0
['ensues', 'courtroom', 'unexplained', 'resolved', 'abruptly', 'inexplicable', 'rapidly', 'unanswered', 'solved', 'complications', 'awry', 'ensue', 'unravel', 'arise', 'solving', 'occurring', 'unresolved']
Cluster 1
['headed', 'jerk', 'loser', 'drunken', 'wannabe', 'bumbling', 'macho', 'horny', 'nerd', 'mute', 'bully', 'nut', 'brat', 'token', 'geek', 'biker', 'pimp', 'redneck', 'slimy', 'thug', 'bald', 'puppy', 'midget', 'jock', 'bastard', 'hunk', 'nerdy', 'overweight', 'goth', 'bimbo', 'butch', 'frat', 'cheerleader', 'geeky']
Cluster 2
['precious', 'failing', 'limit', 'akin', 'ensure', 'enhance', 'qualify', 'sustain', 'warrant', 'salvage', 'compensate', 'increase', 'breathe', 'analyze', 'emphasize', 'retain', 'tackle', 'invest', 'recreate', 'imitate', 'elevate', 'expand', 'indulge', 'muster', 'digest', 'emulate', 'render', 'adjust', 'provoke', 'elicit', 'recapture', 'induce', 'extend', 'undermine', 'absorb', 'reduce', 'replicate']
Cluster 3
['demand', 'grip', 'click', 'table

In [41]:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of a review fall into each word cluster.

    Args:
        wordlist: iterable of word strings (one tokenized review).
        word_centroid_map: dict mapping each vocabulary word to its integer
            cluster index.
        num_centroids: length of the returned vector. When None (the default)
            it is derived as the largest cluster index in
            ``word_centroid_map`` plus one. Passing it explicitly avoids
            rescanning the whole map on every call and guarantees a fixed
            length even if the highest-numbered clusters contain no words.

    Returns:
        float32 array of shape ``(num_centroids,)``; element ``i`` is the
        number of words in ``wordlist`` assigned to cluster ``i``. Words
        missing from the map are ignored.
    """
    if num_centroids is None:
        # default=-1 gives a zero-length vector for an empty map instead of
        # letting max() raise ValueError.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1

    # Preallocate the count vector.
    bag_of_centroids = np.zeros(int(num_centroids), dtype="float32")

    # For every word that has a cluster, increment that cluster's count.
    for word in wordlist:
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [42]:
# Bag-of-centroids matrix for the training reviews: one row per review, one
# column per cluster.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Repeat for test reviews
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [43]:
# Fit a 100-tree random forest on the bag-of-centroids features. A fixed
# random_state makes the fitted forest, and therefore the submission file,
# reproducible across reruns of this cell.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids, train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results (quoting=3 is csv.QUOTE_NONE)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)

Fitting a random forest to labeled training data...


Paragraph Vector（段落向量）是更好的选择：向量平均和聚类都会丢失词序，而段落向量则保留了词序信息。