# Import and load dataset

If you have not installed the package and are working inside the repository, use the following two lines of code to make the example work:

import sys
sys.path.append("..")

In [1]:
from stream.utils import TMDataset
from stream.models import NMFTM

# Create an empty dataset container, then load the corpus registered under
# the name "20NewsGroup" into it.
dataset = TMDataset()
dataset.fetch_dataset("20NewsGroup")

  from .autonotebook import tqdm as notebook_tqdm
  _dash_comm = Comm(target_name="dash")


In [2]:
# Display the language recorded on the loaded dataset (a bare last expression
# is rendered as the cell output).
dataset.language

'en'

In [3]:
# Run the dataset's preprocessing with the settings selected for the NMF model.
# Presumably this is what fills the "tokens" column inspected in the next
# cell -- TODO confirm against TMDataset.preprocess.
dataset.preprocess(model_type="NMF")

Preprocessing documents: 100%|██████████| 2500/2500 [00:15<00:00, 159.37it/s]


In [4]:
# Look up the first row of the dataset's dataframe, then show its token list.
first_document = dataset.dataframe.iloc[0]
print(first_document["tokens"])

['anyone', 'car', 'car', 'anyone', 'car', 'car']


In [5]:
# Look up the first row of the dataset's dataframe, then show its text field.
first_document = dataset.dataframe.iloc[0]
print(first_document["text"])

anyone car car anyone car car


# Train the model
If embeddings for the model have been created before, they are reused instead of being created again, which makes training faster.

In [6]:
# Instantiate the NMF topic model with its default settings.
model = NMFTM()  
# Fit the model on the preprocessed dataset; this must run before topics can
# be read back.
model.fit(dataset)  
# Retrieve the learned topics (printed in the next cell as a dict mapping
# topic id to a list of (word, weight) pairs).
topics = model.get_topics()

Index(['predictions', 'text'], dtype='object')
0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
Name: predictions, dtype: object




In [7]:
# Print the complete topic dictionary: topic id -> list of (word, weight)
# pairs. The output is long because every topic is printed in full.
print(topics)

{0: [('like', 0.01778463406770215), ('think', 0.012829159567616088), ('said', 0.012061456572159825), ('time', 0.011975083662851407), ('say', 0.011244012761108077), ('karina', 0.011057991924920883), ('know', 0.01037644101572665), ('patient', 0.010291494449921423), ('really', 0.009794810358940963), ('people', 0.009727681240959353)], 1: [('gun', 0.015370704284626373), ('right', 0.012566082217764634), ('people', 0.012237137369128424), ('law', 0.011830632323819163), ('state', 0.010004811114279154), ('government', 0.009861724786197427), ('weapon', 0.009098744673566384), ('think', 0.009042216355081708), ('child', 0.009020639481129048), ('moral', 0.008806606930614712)], 2: [('car', 0.08547279137816774), ('engine', 0.026084546528355568), ('auto', 0.023390567847810746), ('speed', 0.021905926345329972), ('mustang', 0.021448376516026407), ('model', 0.020022628479791194), ('insurance', 0.017987015673823905), ('rec', 0.017855635847995006), ('mph', 0.017546260554605585), ('driving', 0.016475490161933

# Evaluate your model
Use any of the metrics available in OCTIS or the ExpandedTM metrics (ISIM, INT, Expressivity, Embedding_Coherence, Embedding_Topic_Diversity), as well as the classical NPMI.

In [8]:
from stream.metrics import NPMI
# Build the NPMI metric from the dataset loaded and preprocessed above.
metric = NPMI(dataset)

In [9]:
# Re-read the topics from the fitted model and score them with the metric.
topics = model.get_topics()
score = metric.score(topics)
# NOTE(review): the recorded output of this cell is -1.0, which is the lower
# bound of NPMI. Worth confirming that the metric and the topics use the same
# vocabulary/tokenization before trusting this value.
print(score)

-1.0


In [10]:
# Run the fitted model on the dataset's texts; the cell displays the returned
# integer array (presumably one topic index per document -- TODO confirm).
model.predict(dataset.texts)

array([ 2,  4, 12, ..., 11,  9,  0], dtype=int64)

# Visualize your fit model
Use a port that is not already in use. The default is 8050.

In [11]:
from stream.visuals import visualize_topic_model, visualize_topics
# Visualize the fitted model together with its dataset. Port 8051 is passed
# explicitly; pick any port that is free on your machine.
visualize_topic_model(model, dataset=dataset, port=8051)