# Import and load dataset

If you have not installed the package and are working inside the repository, use the following two lines of code to make the example work:

import sys
sys.path.append("..")

In [1]:
# Import the dataset class and the KmeansTM topic model from the stream package.
from stream.utils import TMDataset
from stream.models import KmeansTM

# Create the dataset object, then fetch the dataset registered under the name "20NewsGroup".
dataset = TMDataset()
dataset.fetch_dataset("20NewsGroup")

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()
  _dash_comm = Comm(target_name="dash")


In [2]:
# Bare expression: the cell displays the language code stored on the dataset.
dataset.language

'en'

In [3]:
# Preprocess the documents with the settings selected by model_type="KmeansTM".
# NOTE(review): presumably this updates the dataset's dataframe in place — confirm before re-running this cell.
dataset.preprocess(model_type="KmeansTM")

Preprocessing documents:   0%|          | 0/2500 [00:00<?, ?it/s]

Preprocessing documents: 100%|██████████| 2500/2500 [00:39<00:00, 64.01it/s]


In [4]:
# Print the "tokens" entry of the first row (positional index 0) of the dataset's dataframe.
print(dataset.dataframe.iloc[0]["tokens"])

['I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'I', 'saw', 'the', 'other', 'day.', 'It', 'was', 'a', '2door', 'sports', 'car,', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'doors', 'were', 'really', 'small.', 'In', 'addition,', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body.', 'This', 'is', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'tellme', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'of', 'production,', 'where', 'this', 'car', 'is', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car,', 'please', 'email.']


In [5]:
# Print the "text" entry of the same first row, for comparison with its "tokens" entry.
print(dataset.dataframe.iloc[0]["text"])

wondering anyone could enlighten car saw day door car looked late early called bricklin door really small addition front bumper separate rest body know anyone tellme model name engine production car made history whatever info funky looking car please email


# Train the model
If embeddings for the model have been created before, they are loaded instead of being created again, which speeds up computation.

In [6]:
# Create a KmeansTM model without arguments, train it on the dataset,
# and keep the resulting topics in `topics` for the cells below.
model = KmeansTM()  
model.fit(dataset)  
topics = model.get_topics()

[32m2024-06-23 11:00:34.330[0m | [1mINFO    [0m | [36mstream.models.KmeansTM[0m:[36mfit[0m:[36m262[0m - [1m--- Training KmeansTM topic model ---[0m
[32m2024-06-23 11:00:34.338[0m | [1mINFO    [0m | [36mstream.models.KmeansTM[0m:[36m_prepare_embeddings[0m:[36m161[0m - [1m--- Loading pre-computed paraphrase-MiniLM-L3-v2 embeddings ---[0m
[32m2024-06-23 11:00:34.342[0m | [1mINFO    [0m | [36mstream.models.KmeansTM[0m:[36m_dim_reduction[0m:[36m194[0m - [1m--- Reducing dimensions ---[0m
[32m2024-06-23 11:00:46.663[0m | [1mINFO    [0m | [36mstream.models.KmeansTM[0m:[36m_clustering[0m:[36m214[0m - [1m--- Creating document cluster ---[0m
[32m2024-06-23 11:00:47.279[0m | [1mINFO    [0m | [36mstream.models.KmeansTM[0m:[36mfit[0m:[36m295[0m - [1m--- Training completed successfully. ---[0m


In [7]:
# Print all topics at once; the output is a list of topics, each a list of words.
print(topics)

[['printer', 'font', 'truetype', 'laser', 'postscript', 'window', 'color', 'print', 'image', 'atm'], ['gun', 'weapon', 'fbi', 'koresh', 'crime', 'firearm', 'child', 'atf', 'batf', 'kid'], ['max', 'b8f', 'a86', '145', '0t', 'pl', '1d9', '1t', 'giz', '2di'], ['game', 'team', 'player', 'play', 'hockey', 'goal', 'season', 'playoff', 'period', 'leaf'], ['health', 'patient', 'tobacco', 'doctor', 'food', 'medical', 'disease', 'msg', 'treatment', 'infection'], ['car', 'bike', 'engine', 'mile', 'speed', 'mph', 'mustang', 'wheel', 'tire', 'diesel'], ['moral', 'objective', 'morality', 'homosexual', 'sex', 'gay', 'men', 'murder', 'sexual', 'immoral'], ['circuit', 'mouse', 'input', 'digital', 'output', 'audio', 'signal', 'monitor', 'screen', 'amp'], ['space', 'nasa', 'shuttle', 'mission', 'orbit', 'satellite', 'launch', 'earth', 'astronaut', 'solar'], ['file', 'window', 'widget', 'image', 'program', 'motif', 'available', 'server', 'set', 'version'], ['key', 'encryption', 'clipper', 'chip', 'governm

# Evaluate your model
Use any of the metrics available in octis or among the ExpandedTM metrics: ISIM, INT, Expressivity, Embedding_Coherence, Embedding_Topic_Diversity, and classical NPMI.

In [8]:
# Import the NPMI metric and construct it from the dataset.
from stream.metrics import NPMI
metric = NPMI(dataset)

In [9]:
# Read the topics from the fitted model again, score them with the NPMI metric,
# and print the resulting score.
topics = model.get_topics()
score = metric.score(topics)
print(score)

-0.1961


# Visualize your fitted model
Use a port that is not already in use. The default is 8050.

In [None]:
# Import both visualization helpers, then visualize the fitted model together with
# its dataset on port 8051.
# NOTE(review): presumably this starts a local web app that keeps the cell busy — confirm.
from stream.visuals import visualize_topic_model, visualize_topics
visualize_topic_model(model, dataset=dataset, port=8051)

In [None]:
# Visualize the model's topics on port 8052, a different port from the one passed to visualize_topic_model.
visualize_topics(model, port=8052)