In [1]:
# Configuration dictionary
WEAVIATE_PROD_CONF = {
  "http_host": "weaviate-new.eu-more-europa-gpu.quinten.io",
  "http_port": 443,
  "http_secure": True,
  "grpc_host": "10.23.3.69",
  "grpc_port": 3051,
  "grpc_secure": False,
  "skip_init_checks": True,
}

In [2]:
import weaviate
weaviate_client = weaviate.connect_to_custom(**WEAVIATE_PROD_CONF)


In [4]:
# python pretty print
from pprint import pprint

prod_collections = weaviate_client.collections.list_all()
# pprint(prod_collections)

In [6]:
import json
from datetime import datetime


# Custom function to handle datetime serialization
def serialize(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()  # Convert datetime to ISO format string
    raise TypeError(f"Type {type(obj)} not serializable")


# Access the Publication_v2 collection
publication_collection = weaviate_client.collections.get("Publication_v2")

# Fetch 5 objects
response = publication_collection.query.fetch_objects(limit=15) # , return_properties=['object_id', 'outcome_measure', 'comparator', 'registry_related', 'geographical_area', 'intervention', 'publishing_date', 'population_follow_up', 'design_model_description', 'data_source_name', 'redirection_link', 'population_sex', 'journal', 'design_model', 'summary', 'keywords', 'chemicals', 'population_age_group', 'pmid', 'authors', 'retrieved_from', 'category', 'medical_condition', 'pdf_link', 'population_size', 'mesh_terms', 'population_description'])

# Print the results
for obj in response.objects:
    print(json.dumps(obj.properties, indent=2, default=serialize))


{
  "object_id": "00003009-e600-5ef0-9317-79db805dd458",
  "outcome_measure": "Admission hyperglycemia, stroke subtypes, and early vascular outcomes (3-month stroke, all-cause mortality, and composite of stroke, myocardial infarction, and all-cause mortality)",
  "comparator": "Not specified",
  "registry_related": true,
  "geographical_area": [
    "Not specified"
  ],
  "intervention": "Not specified",
  "publishing_date": "2023-02-01T00:00:00+00:00",
  "abstract": "Whether admission hyperglycemia is differentially associated with early vascular outcomes in acute ischemic stroke (AIS) depending on stroke subtype has been incompletely delineated. In a multicenter, prospective stroke registry, patients with AIS were categorized based on admission glucose levels into normoglycemia, moderate hyperglycemia, and severe hyperglycemia (<140mg/dl, 140-179mg/dl, and\u00a0\u2265180mg/dl, respectively) groups. Multivariate analysis assessed the interaction between the hyperglycemia and ischemic 

In [10]:
obj.properties["registry_related"]

True

In [9]:
from weaviate.classes.query import Filter

publication_collection.iterator(
    filters=Filter.by_property("registry_related").equal(False)
)

TypeError: Collection.iterator() got an unexpected keyword argument 'filters'

In [None]:
from weaviate.classes.query import Filter

collection = client.collections.get("YourCollectionName")
response = collection.query.fetch_objects(
    filters=Filter.by_property("your_field_name").equal("your_value"),
    limit=5
)

for o in response.objects:
    print(o.properties)  # Inspect returned objects

In [5]:
obj.properties.keys()

dict_keys(['object_id', 'medical_condition', 'design_model', 'authors', 'redirection_link', 'summary', 'journal', 'keywords', 'data_source_name', 'outcome_measure', 'comparator', 'chemicals', 'registry_related', 'pmid', 'population_age_group', 'geographical_area', 'retrieved_from', 'intervention', 'category', 'pdf_link', 'population_description', 'design_model_description', 'population_size', 'mesh_terms', 'population_sex', 'publishing_date', 'population_follow_up'])

In [6]:
print(response.total_count)

217736


In [None]:
from tqdm import tqdm

total_count = publication_collection.aggregate.over_all(total_count=True).total_count

pbar = tqdm(total=total_count)

uuids = []

for item in publication_collection.iterator():
    uuids.append(item.uuid)
    pbar.update(1)


  0%|          | 101/217736 [00:00<03:47, 957.42it/s]

100%|█████████▉| 217601/217736 [01:51<00:00, 2366.86it/s]

100%|██████████| 217736/217736 [02:10<00:00, 2366.86it/s]

In [15]:
import random

random_uuid = random.sample(uuids, 300)


In [20]:
dir(Filter.by_id())

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_property',
 '_target',
 '_target_path',
 'contains_any',
 'equal',
 'not_equal']

In [25]:
from weaviate.classes.query import Filter

response = publication_collection.query.fetch_objects(
    filters=Filter.by_id().contains_any(random_uuid)
)
# dict_objects = [serialize(obj.properties) for obj in response.objects]
# save as jsonl
import io
with io.open("random_publications.jsonl", "w", encoding="utf-8") as f:
    for obj in response.objects:
        f.write(json.dumps(obj.properties, default=serialize) + "\n")



In [23]:
obj.properties

{'object_id': 'df3b4d1f-c532-5836-a89f-83ea9e5411e9',
 'outcome_measure': 'Diagnostic performance of axilla ultrasound (US), breast shear wave elastography (SWE), and their combination in detecting residual metastasis in axillary level III after NAT',
 'comparator': 'Not specified',
 'registry_related': True,
 'geographical_area': ['Not specified'],
 'intervention': 'Not specified',
 'publishing_date': datetime.datetime(2024, 11, 20, 0, 0, tzinfo=datetime.timezone.utc),
 'abstract': "To develop a combined approach using shear wave elastography (SWE) and conventional ultrasound (US) to determine the extent of positive axillary lymph nodes (LNs) following neoadjuvant therapy (NAT) in breast cancer patients with nodal involvement. This prospective, multicenter study was registered on the Chinese Clinical Trial Registry (ChiCTR2400085035). From October 2018 to February 2024, a total of 303 breast cancer patients with biopsy-proven positive LN were enrolled. The conventional US features of 

In [17]:
detailed_response = publication_collection.query.fetch_objects(
    where={"id": {"$in": random_uuid}},
    return_properties=["pmid", "title", "abstract"]  # Add other properties if needed
)


TypeError: _FetchObjectsQueryAsync.fetch_objects() got an unexpected keyword argument 'where'

In [7]:
response = publication_collection.query.fetch_objects(return_properties=['object_id'])


In [10]:
len(response.objects)

1000

In [1]:
from sentence_transformers import SentenceTransformer


ModuleNotFoundError: No module named 'sentence_transformers'