In [3]:
import weaviate

# Client handle for the Weaviate instance at the local URL below; every later
# `client.*` call in this notebook goes through it.
client = weaviate.Client("http://localhost:8080")


In [4]:
# Ask the client whether the Weaviate instance reports itself as ready before
# sending it any work.
client.is_ready()

True

In [7]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably needed by newspaper's `article.nlp()` step — confirm.
import nltk # it is a dependency of newspaper3k
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/eason/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import newspaper
import uuid
import json
from tqdm import tqdm

def get_articles_from_newspaper(
        news_url: str,
        max_articles: int = 100
    ) -> list:
    """
    Download articles from a news site and convert them to plain dicts.

    Each article is downloaded, parsed and run through newspaper's NLP step.
    Articles with a missing title, summary or author list are skipped, and so
    are articles whose processing raises an ordinary exception.

    Parameters
    ----------
    news_url : str
        URL of the news site to build the article list from.
    max_articles : int, optional
        Upper bound on the number of articles to return (default 100).
        Capped at the number of articles the built source exposes.

    Returns
    -------
    list of dict
        One dict per accepted article with the keys ``'id'`` (a UUID string
        derived from the article URL), ``'title'``, ``'summary'`` and
        ``'authors'``.
    """

    objects = []

    # Build the actual newspaper
    news_builder = newspaper.build(news_url, memoize_articles=False)

    # size() does not change while we iterate, so query it once
    available = news_builder.size()
    max_articles = min(max_articles, available)

    # The context manager closes the progress bar even if the loop is
    # interrupted (e.g. Ctrl-C during a long scrape).
    with tqdm(total=max_articles) as pbar:
        pbar.set_description(f"{news_url}")

        for article in news_builder.articles[:available]:
            if len(objects) >= max_articles:
                break

            try:
                article.download()
                article.parse()
                article.nlp()

                # keep only articles with a title, a summary and authors
                if article.title and article.summary and article.authors:

                    # create an UUID for the article using its URL, so the
                    # same URL always maps to the same id
                    article_id = uuid.uuid3(uuid.NAMESPACE_DNS, article.url)

                    # create the object
                    objects.append({
                        'id': str(article_id),
                        'title': article.title,
                        'summary': article.summary,
                        'authors': article.authors
                    })

                    pbar.update(1)

            except Exception:
                # Best effort: something went wrong with getting this article,
                # so skip it. `Exception` (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue

    return objects
# Gather the scraped articles from both news sites into one flat list.
data = []
data.extend(get_articles_from_newspaper('https://www.theguardian.com/international'))
data.extend(get_articles_from_newspaper('http://cnn.com'))

https://www.theguardian.com/international: 100%|████████████████████████| 100/100 [00:43<00:00,  2.32it/s]
http://cnn.com: 100%|███████████████████████████████████████████████████| 100/100 [03:41<00:00,  2.22s/it]


In [45]:
# Schema definition for the "Article" class: two primitive properties
# (title, summary) plus `hasAuthors`, whose data type names the "Author" class
# and is therefore a reference rather than a stored value.
article_class_schema = {
    "class": "Article",
    "description": "An Article class to store the article summary and its authors",
    "properties": [
        {"name": "title", "dataType": ["string"], "description": "The title of the article"},
        {"name": "summary", "dataType": ["text"], "description": "The summary of the article"},
        {"name": "hasAuthors", "dataType": ["Author"], "description": "The authors this article has"},
    ],
}

In [46]:
# Schema definition for the "Author" class: the author's name plus
# `wroteArticles`, whose data type names the "Article" class.
author_class_schema = {
    "class": "Author",
    "description": "An Author class to store the author information",
    "properties": [
        {"name": "name", "dataType": ["string"], "description": "The name of the author"},
        {"name": "wroteArticles", "dataType": ["Article"], "description": "The articles of the author"},
    ],
}

In [51]:
# Start from an empty schema so the classes can be (re)created from scratch.
# NOTE(review): this is destructive — per the Weaviate client docs it presumably
# also removes the objects stored under those classes; confirm before running
# against a shared instance.
client.schema.delete_all()

In [52]:
# Register both classes in a single call; Author and Article reference each
# other through `wroteArticles` / `hasAuthors`.
client.schema.create(
    {'classes': [author_class_schema, article_class_schema]}
)

In [53]:
def prettify(json_dict):
    """Print ``json_dict`` serialized as JSON with a two-space indent."""
    formatted = json.dumps(json_dict, indent=2)
    print(formatted)

In [56]:
# Peek at the first scraped article to see the dict layout
# (id, title, summary, authors).
data[0]

{'id': '32195ed2-252b-3bf4-8b9f-b0abe19537d6',
 'title': 'Maxwell prosecutors: ‘sexualized’ photo of young girl displayed outside Epstein bedroom',
 'summary': 'The prosecution also revealed that “sexually suggestive photograph of a very young girl” was displayed outside Epstein’s bedroom at his Palm Beach mansion.\nA green foldable massage table is displayed in court during the Ghislaine Maxwell trial.\nOne depicted a young girl pulling down her underwear, exposing her buttocks.\nAn image depicted a young girl on Epstein’s lap.\n“In order to get into that room, you have to get by a sexually suggestive photograph of a very young girl,” the prosecution said.',
 'authors': ['Victoria Bekiempis']}

In [57]:
# id that was generated for this article when it was scraped
article_id = data[0]['id']

# Only the primitive properties are set here; the `hasAuthors` reference is
# left out and will be created after the Author objects exist.
article_object = dict(
    title=data[0]['title'],
    summary=data[0]['summary'].replace('\n', ''),  # remove newline characters
)

# validate the object against the Article class
result = client.data_object.validate(
    uuid=article_id,
    class_name='Article',
    data_object=article_object,
)

In [58]:
# Display the response returned by client.data_object.validate.
result

{'error': None, 'valid': True}

In [61]:
# create the object
client.data_object.create(
    data_object=article_object,
    class_name='Article',
    uuid=article_id # if not specified, weaviate is going to create an UUID for you.
)

UnexpectedStatusCodeException: Creating object! Unexpected status code: 500, with response body: {'error': [{'message': 'fail with status 500: CUDA error: no kernel image is available for execution on the device'}]}