In [1]:
import string

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the Online News Popularity CSV and preview its first rows.
news = pd.read_csv(filepath_or_buffer="OnlineNewsPopularity.csv")
news.head()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words from NLTK, kept in a set so membership checks are fast.
# The NLTK stop-word corpus has to be available locally for this lookup to succeed.
stop_words = {*stopwords.words('english')}

# Tokenization + stop-word removal helper; the NLTK corpora it relies on must be downloaded first.
def text_segmentation(text):
    """Strip punctuation, tokenize, lowercase and drop English stop words.

    Args:
        text: Raw text to clean. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        The surviving lowercase tokens joined by single spaces, or ``""`` when
        ``text`` is not a string or no token survives the filtering.
    """
    # `Series.apply` hands missing cells over as NaN (a float), which has no
    # `.translate`; return an empty result instead of crashing the whole apply.
    if not isinstance(text, str):
        return ""

    # Remove punctuation marks (the ASCII set in string.punctuation).
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize, lowercase each token once, then drop stop words.
    lowered = (token.lower() for token in word_tokenize(text))
    return " ".join(token for token in lowered if token not in stop_words)

# Store the cleaned text in a new "Comment Seg" column by running
# text_segmentation over every "Comment (Actual)" value.
# NOTE(review): `testdf` is not defined in any cell visible here — confirm it is
# loaded before this line runs, otherwise a fresh-kernel run fails with NameError.
testdf["Comment Seg"] = testdf["Comment (Actual)"].apply(text_segmentation)

In [None]:
# Read the news dataset and preprocess it into a bag-of-words matrix.
# NOTE(review): this rebinds `news`, which an earlier cell loaded from
# "OnlineNewsPopularity.csv" — confirm which file is the intended input.
news = pd.read_csv(filepath_or_buffer='news.csv')
# Token counts per document, with scikit-learn's built-in English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=news['text'])

In [None]:
# Train the LDA model on the document-term matrix X.
lda_model = LatentDirichletAllocation(
    n_components=10,            # number of topics
    doc_topic_prior=0.1,        # prior on the document-topic distribution
    topic_word_prior=0.01,      # prior on the topic-word distribution
    learning_method='online',
    max_iter=10,
    random_state=0,             # fixed seed so the fit is reproducible
)
# Fit and keep the per-document topic weights of the training corpus.
lda_Z = lda_model.fit_transform(X)

# Print the top keywords of each topic; use them to decide which category a topic represents.
n_top_words = 10  # how many highest-weighted words to show per topic

# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back to the legacy method only on
# releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda_model.components_):
    print("Topic %d:" % (topic_idx))
    # argsort is ascending, so reverse it and keep the first n_top_words indices.
    top_word_ids = topic.argsort()[::-1][:n_top_words]
    print(" ".join(feature_names[i] for i in top_word_ids))

# Use the trained LDA topic model to assign topics to previously unseen news.
# The three strings below are placeholders for real article text.
new_news = ["xxx", "yyy", "zzz"]
# Reuse the already-fitted vectorizer so the new documents are encoded with the training vocabulary.
new_X = vectorizer.transform(new_news)
# One row of topic weights per new document.
new_lda_Z = lda_model.transform(new_X)
print(new_lda_Z)