In [None]:
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# Toy product-review corpus: each row pairs a review text with its sentiment
# label ("好评" = positive review, "差评" = negative review).
# X collects the review texts and y the matching labels, in the same order.
X, y = map(list, zip(
    ("这个商品非常好，值得推荐！", "好评"),
    ("质量很差，不值得购买。", "差评"),
    ("很喜欢这个商品，已经买了好几次了。", "好评"),
))

In [None]:
# Split the reviews into a training part and a held-out test part.
# The fixed random_state makes the shuffle reproducible; test_size=0.25 is the
# library's default hold-out fraction, written out here so it is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# 构建词向量模型
sentences = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

In [None]:
# 计算文本向量
def average_vector(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X_train_vec = np.array([average_vector(text.split(), word2vec_model) for text in X_train])
X_test_vec = np.array([average_vector(text.split(), word2vec_model) for text in X_test])

In [None]:
# Fit the sentiment classifier on the averaged review embeddings.
clf = LogisticRegression()
clf.fit(X=X_train_vec, y=y_train)

In [None]:
# Predict sentiment labels for the held-out test reviews.
y_pred = clf.predict(X=X_test_vec)

In [None]:
# Compute the fraction of test reviews whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("准确率:", accuracy)

In [None]:
# Classify previously unseen reviews.
# Note: str.split() only breaks on whitespace and these comments contain none,
# so each comment reaches average_vector as a single chunk.
new_comments = ["这个商品真的很不错！", "太差了，不会再买了。"]
new_comments_vec = np.array(
    [
        average_vector(comment.split(), word2vec_model)
        for comment in new_comments
    ]
)
predicted_labels = clf.predict(new_comments_vec)
print("新评论的预测类别:", predicted_labels)