[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)
[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)

# Quickstart


**Note**: Make sure the required `nltk` resources are downloaded. If they are not, run the following code:
```python
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
```

In [1]:
# Uncomment the line below if running in Colab.
# The package needs to be installed for the notebook to run.

# ! pip install -U stream_topic

In [2]:
import warnings
# Silence all warnings from here on so the quickstart output stays readable.
# NOTE(review): this also hides deprecation/runtime warnings that can matter
# when debugging — remove this line if something behaves unexpectedly.
warnings.filterwarnings("ignore")

In [3]:
from stream_topic.models import CEDC
from stream_topic.utils import TMDataset

## CEDC model

In [4]:
# Create an empty dataset container, fetch the "BBC_News" dataset into it,
# then preprocess it with the setting for the CEDC model.
# Per the logged output below, the data is downloaded from GitHub.
dataset = TMDataset()
dataset.fetch_dataset("BBC_News")
dataset.preprocess(model_type="CEDC")

[32m2024-08-09 15:35:15.170[0m | [1mINFO    [0m | [36mstream_topic.utils.dataset[0m:[36mfetch_dataset[0m:[36m118[0m - [1mFetching dataset: BBC_News[0m
[32m2024-08-09 15:35:15.244[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m331[0m - [1mDownloading dataset from github[0m
[32m2024-08-09 15:35:15.518[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m333[0m - [1mDataset downloaded successfully at ~/stream_topic_data/[0m
[32m2024-08-09 15:35:15.663[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m361[0m - [1mDownloading dataset info from github[0m
[32m2024-08-09 15:35:15.795[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m363[0m - [1mDataset info downloaded successfully at ~/stream_topic_data/[0m
Preprocessing d

In [5]:
# Instantiate CEDC with its default settings and fit it on the preprocessed
# dataset, requesting 10 topics. The value returned by fit() is kept in `output`.
model = CEDC()
output = model.fit(dataset, n_topics=10)

[32m2024-08-09 15:35:27.056[0m | [1mINFO    [0m | [36mstream_topic.models.CEDC[0m:[36mfit[0m:[36m241[0m - [1m--- Training CEDC topic model ---[0m
[32m2024-08-09 15:35:27.122[0m | [1mINFO    [0m | [36mstream_topic.models.abstract_helper_models.base[0m:[36mprepare_embeddings[0m:[36m215[0m - [1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---[0m
[32m2024-08-09 15:35:27.191[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m302[0m - [1mDownloading embeddings from github[0m
[32m2024-08-09 15:35:27.416[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m304[0m - [1mEmbeddings  downloaded successfully at ~/stream_topic_data/[0m
[32m2024-08-09 15:35:27.423[0m | [1mINFO    [0m | [36mstream_topic.models.abstract_helper_models.base[0m:[36mdim_reduction[0m:[36m196[0m - [1m--- Reducing dimensions ---[0m
OMP: Info #276: omp

In [6]:
from stream_topic.visuals import visualize_topic_model

# Visualize the fitted CEDC model from the previous cell.
# NOTE(review): `port=8052` suggests the visualization is served as a local
# web app — confirm, and pick another port if 8052 is already in use.
visualize_topic_model(model, reduce_first=True, port=8052)

## CTMNeg model

In [7]:
from stream_topic.models import CTMNeg
# Start from a fresh TMDataset and preprocess it with the CTMNeg setting
# (the `dataset` from the CEDC section was preprocessed with model_type="CEDC").
# NOTE: `dataset`, `model` and `output` rebind the names used in the CEDC section.
dataset = TMDataset()
dataset.fetch_dataset("BBC_News")
dataset.preprocess(model_type="CTMNeg")
model = CTMNeg(encoder_dim=64, dropout=0.3)
# Only 5 topics and 2 epochs — presumably chosen to keep the demo fast;
# TODO confirm suitable values before using this for a real analysis.
output = model.fit(dataset, n_topics=5, max_epochs=2)

[32m2024-08-09 15:35:45.415[0m | [1mINFO    [0m | [36mstream_topic.utils.dataset[0m:[36mfetch_dataset[0m:[36m118[0m - [1mFetching dataset: BBC_News[0m
[32m2024-08-09 15:35:45.492[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m331[0m - [1mDownloading dataset from github[0m
[32m2024-08-09 15:35:45.691[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m333[0m - [1mDataset downloaded successfully at ~/stream_topic_data/[0m
[32m2024-08-09 15:35:45.786[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m361[0m - [1mDownloading dataset info from github[0m
[32m2024-08-09 15:35:45.926[0m | [1mINFO    [0m | [36mstream_topic.utils.data_downloader[0m:[36mload_custom_dataset_from_url[0m:[36m363[0m - [1mDataset info downloaded successfully at ~/stream_topic_data/[0m
Preprocessing d

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[32m2024-08-09 15:35:59.005[0m | [1mINFO    [0m | [36mstream_topic.models.ctmneg[0m:[36mfit[0m:[36m473[0m - [1m--- Training completed successfully. ---[0m
