In [1]:
import os
from datetime import datetime
from typing import List
from streaming_pipeline.models import WikipediaArticle

from streaming_pipeline import initialize

from streaming_pipeline.wikipedia_stream import WikipediaArticlesStreamSource

initialize()

w = WikipediaArticlesStreamSource(title_prefixes=["A", "B"])
print(w.next())


2024-05-23 01:21:03,586 - INFO - Initializing env vars...
2024-05-23 01:21:03,586 - INFO - Loading environment variables from: .env


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


{'id': '1', 'url': 'https://simple.wikipedia.org/wiki/April', 'title': 'April', 'text': 'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn commo

In [2]:
from pydantic import parse_obj_as


messages = [w.next() for _ in range(10)]
print(messages[0])


{'id': '2', 'url': 'https://simple.wikipedia.org/wiki/August', 'title': 'August', 'text': 'August (Aug.) is the eighth month of the year in the Gregorian calendar, coming between July and September. It has 31 days. It is named after the Roman emperor Augustus Caesar.\n\nAugust does not begin on the same day of the week as any other month in common years, but begins on the same day of the week as February in leap years. August always ends on the same day of the week as November.\n\nThe Month \n\nThis month was first called Sextilis in Latin, because it was the sixth month in the old Roman calendar. The Roman calendar began in March about 735\xa0BC with Romulus. October was the eighth month. August was the eighth month when January or February were added to the start of the year by King Numa Pompilius about 700\xa0BC. Or, when those two months were moved from the end to the beginning of the year by the decemvirs about 450\xa0BC (Roman writers disagree). In 153 BC January 1 was determined

In [3]:
documents = parse_obj_as(List[WikipediaArticle], messages)

for doc in documents:
    doc.transform()

In [4]:
doc = documents[0]
len(doc.doc_text), len(doc.doc_chunks), doc.doc_text

(2,
 0,
 ['August',
  'August (Aug.) is the eighth month of the year in the Gregorian calendar, coming between July and September. It has 31 days. It is named after the Roman emperor Augustus Caesar.\n\nAugust does not begin on the same day of the week as any other month in common years, but begins on the same day of the week as February in leap years. August always ends on the same day of the week as November.\n\nThe Month \n\nThis month was first called Sextilis in Latin, because it was the sixth month in the old Roman calendar. The Roman calendar began in March about 735BC with Romulus. October was the eighth month. August was the eighth month when January or February were added to the start of the year by King Numa Pompilius about 700BC. Or, when those two months were moved from the end to the beginning of the year by the decemvirs about 450BC (Roman writers disagree). In 153 BC January 1 was determined as the beginning of the year.\n\nAugust is named for Augustus Caesar who became

In [5]:
from streaming_pipeline.embeddings import EmbeddingModelSingleton


model = EmbeddingModelSingleton(cache_dir=None)

doc.compute_chunks(model)
len(doc.doc_text), len(doc.doc_chunks)



item: August has 1 chunks
item: August (Au has 5 chunks


(2, 6)

In [6]:
len(doc.doc_chunks[0]), len(doc.doc_chunks[1])

(6, 1761)

In [7]:
doc.compute_embeddings(model)
len(doc.doc_embeddings[0]), len(doc.doc_embeddings[1])

(384, 384)

In [8]:


ids, payloads = doc.to_payloads()
ids

['41ba70891fb6f39327d8ccb9b1dafb84',
 '045bd6595b39c583bd9e78852862383c',
 '301a490d72a3a84964d93e65b4b2c08d',
 '0d1d3b80927dc0854f3575a07febedac',
 '5cbce73d4a43a1b959087ec1496d222a',
 '7416149702deea4927f01279676fbfcf']

In [9]:
doc.doc_chunks[0], doc.doc_chunks[1]

('August',
 "August (Aug.) is the eighth month of the year in the Gregorian calendar, coming between July and September. It has 31 days. It is named after the Roman emperor Augustus Caesar.\n\nAugust does not begin on the same day of the week as any other month in common years, but begins on the same day of the week as February in leap years. August always ends on the same day of the week as November.\n\nThe Month \n\nThis month was first called Sextilis in Latin, because it was the sixth month in the old Roman calendar. The Roman calendar began in March about 735BC with Romulus. October was the eighth month. August was the eighth month when January or February were added to the start of the year by King Numa Pompilius about 700BC. Or, when those two months were moved from the end to the beginning of the year by the decemvirs about 450BC (Roman writers disagree). In 153 BC January 1 was determined as the beginning of the year.\n\nAugust is named for Augustus Caesar who became Roman con

In [10]:
payloads

[{'title': 'August',
  'url': 'https://simple.wikipedia.org/wiki/August',
  'text': 'August'},
 {'title': 'August',
  'url': 'https://simple.wikipedia.org/wiki/August',
  'text': "August (Aug.) is the eighth month of the year in the Gregorian calendar, coming between July and September. It has 31 days. It is named after the Roman emperor Augustus Caesar.\n\nAugust does not begin on the same day of the week as any other month in common years, but begins on the same day of the week as February in leap years. August always ends on the same day of the week as November.\n\nThe Month \n\nThis month was first called Sextilis in Latin, because it was the sixth month in the old Roman calendar. The Roman calendar began in March about 735BC with Romulus. October was the eighth month. August was the eighth month when January or February were added to the start of the year by King Numa Pompilius about 700BC. Or, when those two months were moved from the end to the beginning of the year by the decem

In [11]:
from qdrant_client.models import PointStruct

PointStruct?

[0;31mInit signature:[0m
[0mPointStruct[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mid[0m[0;34m:[0m [0mForwardRef[0m[0;34m([0m[0;34m'ExtendedPointId'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvector[0m[0;34m:[0m [0mForwardRef[0m[0;34m([0m[0;34m'VectorStruct'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpayload[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mInit docstring:[0m
Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.
[0;31mFile:[0m           ~/.cache/pypoetry/virtualenvs/streaming-pipeline-fcAEtgtX-py3.10/lib/python3.10/site-packages/qdrant_client/htt

In [12]:
points = [
    PointStruct(id=idx, vector=vector, payload=_payload)
    for idx, vector, _payload in zip(ids, doc.doc_embeddings, payloads)
]

points[0]

PointStruct(id='41ba70891fb6f39327d8ccb9b1dafb84', vector=[0.013589659705758095, 0.04040486738085747, 0.1208416223526001, 0.04845317825675011, -0.02448905259370804, -0.14554302394390106, 0.08605343103408813, -0.21176566183567047, -0.05545730143785477, -0.1702665090560913, 0.1890988051891327, 0.01176438108086586, -0.060689859092235565, -0.292618066072464, -0.21539917588233948, 0.1633121520280838, 0.03325789049267769, 0.006001024041324854, -0.079757921397686, -0.07979514449834824, -0.3036176264286041, -0.4202597737312317, -0.28942328691482544, -0.14775525033473969, -0.08228223025798798, 0.37307921051979065, -0.12502269446849823, 0.10064108669757843, -0.14801974594593048, -0.7686576843261719, -0.041444212198257446, -0.1987249106168747, -0.05472305789589882, -0.4456913471221924, 0.14495524764060974, 0.27766281366348267, -0.10336945205926895, -0.33234643936157227, -0.06951180100440979, 0.13542895019054413, -0.1583510786294937, -0.2899067997932434, 0.19585156440734863, -0.026067811995744705,