In [3]:
import weaviate
import json
from dotenv import load_dotenv
import os

# Load settings from a .env file (if present) before reading the key.
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')

# Connect to Weaviate; the OpenAI key is passed along as a request header.
client = weaviate.Client(
    additional_headers={"X-OpenAI-Api-Key": api_key},
    url="http://localhost:8080",  # Replace with your Weaviate endpoint
)


            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [24]:
# create manuscript collection
# Settings for the text2vec-openai vectorizer module.
openai_vectorizer_config = {
    "model": "text-embedding-3-small",
    "type": "text",
}

class_obj = {
    "class": "Manuscript",
    # If set to "none" you must always provide vectors yourself.
    # Could be any other "text2vec-*" also.
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": openai_vectorizer_config,
        # Ensure the `generative-openai` module is used for generative queries
        "generative-openai": {},
    },
}

# Drop the class first, then create it again from the definition above.
client.schema.delete_class(class_obj["class"])
client.schema.create_class(class_obj)

In [25]:
# split text into windows
def text_to_windows(text, window_size, overlap):
    """Split ``text`` into overlapping character windows.

    Each window is ``window_size`` characters long, except possibly the last
    one, which holds whatever remains. Consecutive windows start
    ``window_size - overlap`` characters apart.

    Args:
        text: The string to split.
        window_size: Maximum number of characters per window; must be > 0.
        overlap: Number of characters shared by consecutive windows; must be
            smaller than ``window_size``.

    Returns:
        A list of substrings of ``text`` in order. Empty when ``text`` is empty.

    Raises:
        ValueError: If ``window_size`` is not positive, or if ``overlap`` is
            not smaller than ``window_size``.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if overlap >= window_size:
        # A zero or negative step would keep the loop below from ever
        # reaching the end of the text.
        raise ValueError("overlap must be smaller than window_size")

    step = window_size - overlap
    char_length = len(text)
    windows = []
    start = 0
    while start < char_length:
        end = start + window_size
        # Slicing clamps at the end of the string, so the final window is
        # simply shorter when fewer than window_size characters remain.
        windows.append(text[start:end])
        if end >= char_length:
            break
        start += step
    return windows

# Chunking parameters, in characters.
window_size = 2048

overlap = 512

# Number of JSONL records processed so far (blank lines are not counted).
counter = 0

client.batch.configure(batch_size=100)  # Configure batch
# read json lines into database
with open("pubmed_author_manuscripts/small_pubmed_manuscripts.jsonl", "r", encoding="utf-8") as file:
    # Open one batch context for the whole file. Opening it once per record
    # flushes after every document, so the configured batch_size never applies.
    with client.batch as batch:
        for line in file:
            # Skip blank lines (e.g. a trailing newline); json.loads would raise.
            if not line.strip():
                continue

            # parse the json from line
            data = json.loads(line)
            # extract text property; a missing or null value becomes ""
            text = data.get("text") or ""

            # Split text into windows
            text_windows = text_to_windows(text, window_size, overlap)

            counter += 1
            chunk_counter = 0
            for chunk_counter, text_chunk in enumerate(text_windows, start=1):
                properties = {
                    "text": text_chunk,
                }
                batch.add_data_object(
                    data_object=properties,
                    class_name="Manuscript"
                )
                print(f"added line {counter}, chunk {chunk_counter}")

added line 1, chunk 1
added line 1, chunk 2
added line 1, chunk 3
added line 1, chunk 4
added line 1, chunk 5
added line 1, chunk 6
added line 1, chunk 7
added line 1, chunk 8
added line 1, chunk 9
added line 1, chunk 10
added line 1, chunk 11
added line 1, chunk 12
added line 1, chunk 13
added line 1, chunk 14
added line 1, chunk 15
added line 1, chunk 16
added line 1, chunk 17
added line 1, chunk 18
added line 1, chunk 19
added line 1, chunk 20
added line 2, chunk 1
added line 2, chunk 2
added line 2, chunk 3
added line 2, chunk 4
added line 2, chunk 5
added line 2, chunk 6
added line 2, chunk 7
added line 2, chunk 8
added line 2, chunk 9
added line 2, chunk 10
added line 2, chunk 11
added line 2, chunk 12
added line 2, chunk 13
added line 2, chunk 14
added line 2, chunk 15
added line 2, chunk 16
added line 2, chunk 17
added line 2, chunk 18
added line 2, chunk 19
added line 2, chunk 20
added line 2, chunk 21
added line 2, chunk 22
added line 2, chunk 23
added line 2, chunk 24
added 

In [34]:
# check total number of entries
# Raw GraphQL Aggregate query asking for the object count of "Manuscript".
query = """
    {
      Aggregate {
        Manuscript {
          meta {
            count
          }
        }
      }
    }
    """
result = client.query.raw(query)
manuscript_aggregate = result['data']['Aggregate']['Manuscript']
print(manuscript_aggregate[0]['meta']['count'])

3658


In [33]:
# test query
# Concepts used for the nearText search against the "Manuscript" class.
near_text_filter = {
    "concepts": ["estradiol impact on women's fear"]
}

# Build the Get query step by step: text property, top 5 hits, plus the
# distance and certainty values for each hit.
manuscript_query = client.query.get("Manuscript", ["text"])
manuscript_query = manuscript_query.with_near_text(near_text_filter)
manuscript_query = manuscript_query.with_limit(5)
manuscript_query = manuscript_query.with_additional(["distance", "certainty"])
response = manuscript_query.do()

matches = response["data"]["Get"]["Manuscript"]
print(json.dumps(matches))

[{"_additional": {"certainty": 0.8103974461555481, "distance": 0.3792051}, "text": "heir contributions to fear extinction behavior.\n\nClinical translation\n\nThe present study is the first to examine the network-level effects of estradiol on fear extinction from extinction learning to extinction recall; as such, it provides critical insight into the brain networks underlying anxiety and fear-based disorders in women. Our data support the growing literature indicating that given the relationship between naturally fluctuating endogenous estradiol and fear extinction memory, the success of psychiatric treatment (efficacy, duration, efficiency) may depend on gonadal hormone status (Wegerer et al., 2014; Glover et al., 2015; Pineles et al., 2016). Based on these data, estradiol may strengthen the consolidation of extinction memory via modulation of the amygdalar nuclei by the IL. This has powerful implications for improved treatment targets and enhanced efficacy of treatments dependent on 