In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import jieba

# 加载并分词处理中文停用词
def load_stopwords(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        stopwords = f.read().splitlines()
    return [" ".join(jieba.cut(word)) for word in stopwords]

# 加载预处理后的停用词
stopwords = load_stopwords('/hongyi/stream/stopwords/baidu_stopwords.txt')

# 使用 CountVectorizer 加载处理后的中文停用词
vectorizer = CountVectorizer(stop_words=stopwords)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.524 seconds.
Prefix dict has been built successfully.


In [4]:
stopwords

['--',
 '?',
 '“',
 '”',
 '》',
 '－ －',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain ' t",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren ' t",
 'around',
 'as',
 "a ' s",
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'came',
 'can',
 'cannot',
 'cant',
 "can ' t",
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 "c ' mon",
 'co

In [1]:
from stream_topic.models import KmeansTM
from stream_topic.utils import TMDataset

dataset = TMDataset()
dataset.fetch_dataset("BBC_News")
dataset.preprocess(model_type="KmeansTM")

model = KmeansTM("/hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/")
model.fit(dataset, n_topics=5)

topics = model.get_topics()
print(topics)

  from tqdm.autonotebook import tqdm, trange
[32m2024-11-21 09:37:04.164[0m | [1mINFO    [0m | [36mstream_topic.utils.dataset[0m:[36mfetch_dataset[0m:[36m120[0m - [1mFetching dataset: BBC_News[0m
[32m2024-11-21 09:45:01.386[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m334[0m - [1mDownloading dataset from github[0m
[32m2024-11-21 09:45:04.135[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m336[0m - [1mDataset downloaded successfully at ~/stream_topic_data/[0m
[32m2024-11-21 09:45:05.013[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m364[0m - [1mDownloading dataset info from github[0m
[32m2024-11-21 09:45:06.724[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m366[0m - [1mDataset info downloaded successfull

[['match', 'england', 'cup', 'champion', 'win', 'coach', 'team', 'injury', 'ireland', 'season'], ['labour', 'election', 'party', 'tory', 'blair', 'lord', 'minister', 'brown', 'prime', 'howard'], ['growth', 'bank', 'economy', 'oil', 'price', 'share', 'market', 'economic', 'china', 'rate'], ['award', 'film', 'actor', 'best', 'star', 'band', 'album', 'song', 'oscar', 'actress'], ['technology', 'mobile', 'phone', 'user', 'computer', 'digital', 'software', 'site', 'network', 'net']]


In [3]:
dataset.dataframe.text[10]

'ask jeeves third leading online firm week internet advertising firm revenue fourth quarter exceeding ask jeeves among relatively modest profit quarter announced google earlier week quarter online advertising taken relatively late last year marketing company doubleclick one leading online advertising warned business would sale thursday announced sharp brought increase profit ask jeeves doubleclick profit news however analyst fall quarter google'

In [2]:
from stream_topic.metrics import ISIM, INT, ISH,Expressivity, NPMI, Embedding_Coherence, Embedding_Topic_Diversity
from sentence_transformers import SentenceTransformer
from stream_topic.metrics.metrics_config import MetricsConfig
MetricsConfig.set_PARAPHRASE_embedder("/hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/")
MetricsConfig.set_SENTENCE_embedder("/hongyi/stream/sentence-transformers/all-MiniLM-L6-v2/")

In [5]:
import numpy as np
score_list=[]
metric = ISIM()
for i in range(100):    
    scores = metric.score(topics) #值越小越好
    score_list.append(scores)
print("ISIM scores:", np.mean(score_list))

Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
ISIM scores: 0.17864638064056637


In [8]:
metric.get_info()

AttributeError: 'INT' object has no attribute 'metric_embedder'

In [7]:
score_list=[]
metric = INT()
for i in range(100):    
    scores = metric.score(topics) #值越小越好
    score_list.append(scores)
print("INT scores:", scores)

Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
INT scores: 0.33999999999999997


In [14]:
score_list=[]
metric = ISH()
for i in range(100):    
    scores = metric.score(topics) #值越小越好
    score_list.append(scores)
print("ISH scores:", scores)

Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
ISH scores: 0.2354140043258667


In [16]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import gensim
from nltk.corpus import stopwords
GENSIM_STOPWORDS = gensim.parsing.preprocessing.STOPWORDS
NLTK_STOPWORDS = stopwords.words("english")
STOPWORDS = list(
    set(list(NLTK_STOPWORDS) + list(GENSIM_STOPWORDS) + list(ENGLISH_STOP_WORDS))
)

In [20]:
import numpy as np
# topics = [["apple", "banana", "cherry", "date", "fig","apple", "banana", "cherry", "date", "fig","apple", "banana", "cherry", "date", "fig"],
#           ["dog", "cat", "rabbit", "hamster", "gerbil","dog", "cat", "rabbit", "hamster", "gerbil","dog", "cat", "rabbit", "hamster", "gerbil"]]
expressivity_metric = Expressivity(
n_words=5,
custom_stopwords=STOPWORDS
)
beta = np.random.rand(5, 384)
info = expressivity_metric.get_info()
print("Metric Info:", info)
scores = expressivity_metric.score(topics, beta)  #值越小越好
print("Expressivity scores:", scores)

Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/all-MiniLM-L6-v2/
Metric Info: {'metric_name': 'Expressivity', 'n_words': 5, 'embedding_model_name': SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), 'metric_range': '0 to 1, smaller is better', 'description': 'The expressivity metric measures the distance between the mean vector of the top words in a topic and the mean vector of the embeddings of the stop words.'}
Expre

In [21]:
import numpy as np
beta = np.random.rand(5, 384)
diversity_metric = Embedding_Topic_Diversity()
info = diversity_metric.get_info()
print("Metric Info:", info)
scores = diversity_metric.score(topics, beta)  #值越小越好
print("Diversity score:", scores)

Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Loading model from local path: /hongyi/stream/sentence-transformers/paraphrase-MiniLM-L3-v2/
Metric Info: {'metric_name': 'Embedding Topic Diversity', 'n_words': 10, 'embedding_model_name': SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), 'metric_range': '0 to 1, smaller is better', 'description': 'The diversity metric measures the mean cosine similarity of the mean vectors of the top words of all topics.'}
Diversity score: 0.43775419298273144


In [22]:
metric = NPMI(dataset,language = "english") #值越大越好
metric.score(topics)

0.09477

In [45]:
metric.score_per_topic(topics)

{'england, rugby, ireland, france, coach': 0.16290000081062317,
 'tax, pension, council, brown, tory': 0.4258599877357483,
 'mobile, phone, technology, digital, music': 0.12892000377178192,
 'airline, country, aid, economic, budget': 0.3128199875354767,
 'band, album, song, music, rock': 0.46094000339508057,
 'series, celebrity, comedy, audience, viewer': 0.2819400131702423,
 'race, olympic, indoor, championship, holmes': 0.17599999904632568,
 'search, broadband, blog, google, net': 0.12077999860048294,
 'share, profit, shareholder, financial, executive': 0.22439000010490417,
 'chelsea, liverpool, club, league, arsenal': 0.12272000312805176,
 'yukos, oil, russian, gazprom, russia': 0.07117000222206116,
 'roddick, seed, open, match, nadal': 0.24886000156402588,
 'lord, police, law, hunting, trial': 0.4142799973487854,
 'labour, party, election, blair, prime': 0.1724500060081482,
 'rate, growth, economy, economist, dollar': 0.14369000494480133,
 'virus, software, program, email, security

In [11]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

model_output = {"topics": model.get_topics(), "topic-word-matrix": model.get_beta(), "topic-document-matrix": model.get_theta()}

metric = TopicDiversity(topk=10) # Initialize metric
topic_diversity_score = metric.score(model_output)


In [12]:
predictions = model.predict(dataset.texts)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2225/2225 [01:09<00:00, 31.94it/s]


In [14]:
from sklearn.metrics import confusion_matrix
label_mapping = {
    'business': 3,
    'entertainment': 2,
    'politics': 4,
    'sport': 1,
    'tech': 0
}

In [10]:
predictions

array([2, 2, 2, ..., 3, 3, 3], dtype=int32)

In [15]:
import pandas as pd
# 假设 dataset.labels 包含所有样本的真实标签
true_labels = dataset.labels

# 将汉字类别转换为数值类别
true_labels_numeric = [label_mapping[label] for label in true_labels]

# 将预测结果和真实标签转换为 DataFrame
df_predictions = pd.DataFrame(predictions, columns=['predicted_labels'])
df_true_labels = pd.DataFrame(true_labels_numeric, columns=['true_labels'])

# 计算混淆矩阵
conf_matrix = confusion_matrix(df_true_labels, df_predictions)

In [11]:
# 计算每个类别的准确率
accuracies = {}
for i in range(conf_matrix.shape[0]):
    accuracy = conf_matrix[i, i] / conf_matrix.sum(axis=1)[i]
    accuracies[f'Topic {i}'] = accuracy

# 打印每个类别的准确率
for topic, accuracy in accuracies.items():
    print(f'{topic} accuracy: {accuracy:.2f}')

Topic 0 accuracy: 0.00
Topic 1 accuracy: 0.99
Topic 2 accuracy: 0.01
Topic 3 accuracy: 0.03
Topic 4 accuracy: 0.92


In [16]:
# 如果你想要更详细的信息，可以打印整个混淆矩阵
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[  2   1   1   9 388]
 [  1 507   0   2   1]
 [ 14   1 348   2  21]
 [ 15   0   0 480  15]
 [384   4   1  26   2]]
