In [30]:
import weaviate
import os
import pandas as pd
import numpy as np

In [31]:
# Both settings are read from the environment so that no secret is stored in
# the notebook; a missing variable raises KeyError right here.
api_key = os.environ["OPENAI_API_KEY"]
weaviate_url = os.environ["WEAVIATE_URL"]

# `url` is your own instance, e.g. "http://localhost:8080/" or
# "https://your-endpoint.weaviate.network/". The OpenAI key is sent along as
# an extra request header.
client = weaviate.Client(
    additional_headers={"X-OpenAI-API-Key": api_key},
    url=weaviate_url,
)

In [4]:
# Destructive: uncomment the next line to wipe your instance clean (removes the whole schema).
# client.schema.delete_all()

In [32]:
def jprint(json_in):
    """Print ``json_in`` to stdout as JSON indented by two spaces.

    ``json_in`` must be serialisable by ``json.dumps``; nothing is returned.
    """
    from json import dumps

    rendered = dumps(json_in, indent=2)
    print(rendered)

# Connectivity check: fetch the server's metadata and show it.
server_meta = client.get_meta()
jprint(server_meta)

{
  "hostname": "http://[::]:8080",
  "modules": {
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "name": "OpenAI Module"
    }
  },
  "version": "1.24.17"
}


In [33]:
# Show the schema as it stands before the jobs class is created.
schema_before_create = client.schema.get()
jprint(schema_before_create)

{
  "classes": [
    {
      "class": "Document",
      "description": "A document class to store documents used for knowledge base",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "The title of the document",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "title",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The entire content of the document",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "content",
          "tokenization": "w

In [34]:
def dataloader(jobsdata_path="../data/all_nov_jobs.csv"):
    """Yield one dict per row of the job-postings CSV.

    Parameters
    ----------
    jobsdata_path : str or path-like, optional
        CSV file to read. Defaults to the path this notebook has always used,
        so existing ``dataloader()`` calls behave as before.

    Yields
    ------
    dict
        The twelve posting fields of one row, in the order listed in
        ``column_for_field``. Missing (NaN) and infinite values are replaced
        by the string ``"None"``.

    Raises
    ------
    KeyError
        While iterating, if the CSV lacks one of the expected columns.
    """
    # Output key -> CSV column. The two differ only for "employment_type",
    # which is read from the column spelled "employmnet_type"; that is the
    # spelling this loader has always looked up, so it is kept as is.
    column_for_field = {
        "title": "title",
        "company": "company",
        "company_link": "company_link",
        "place": "place",
        "date": "date",
        "apply_link": "apply_link",
        "post_link": "post_link",
        "seniority_level": "seniority_level",
        "employment_type": "employmnet_type",
        "description": "description",
        "job_title_id": "job_title_id",
        "job_desc_id": "job_desc_id",
    }

    jobs_raw = pd.read_csv(jobsdata_path)
    # Chained, non-mutating cleanup: NaN first, then +/-inf, both -> "None".
    jobs_clean = jobs_raw.fillna("None").replace([np.inf, -np.inf], "None")

    for _, row in jobs_clean.iterrows():
        yield {field: row[column] for field, column in column_for_field.items()}

In [35]:
# Name of the Weaviate class that the job postings are created under and
# queried from. The server responses further down report it as "All_nov_jobs".
class_name = "all_nov_jobs"

In [36]:
# Minimal class definition: only the name and the vectorizer module are given.
# No properties are declared here (presumably left for the server to infer on
# import; confirm against your instance's settings).
class_obj = {"class": class_name}
class_obj["vectorizer"] = "text2vec-openai"

# Not guarded: this call is made unconditionally on every run of the cell.
client.schema.create_class(class_obj)

In [37]:
# Show the schema again now that the create_class call has run.
schema_after_create = client.schema.get()
jprint(schema_after_create)

{
  "classes": [
    {
      "class": "Document",
      "description": "A document class to store documents used for knowledge base",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "The title of the document",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "title",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The entire content of the document",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "content",
          "tokenization": "w

In [38]:
# Import every row produced by dataloader() into `class_name`.
# The context manager manages batch flushing.
with client.batch as batch:
    # Batch settings are applied before any object is queued.
    batch.batch_size = 50
    batch.dynamic = True
    # `data_obj` keeps its name on purpose: a later cell prints it.
    for data_obj in dataloader():
        batch.add_data_object(data_object=data_obj, class_name=class_name)

In [39]:
# `data_obj` is the loop variable left over from the import cell above, so this
# prints the last object that was handed to the batch. It raises NameError if
# that cell has not been run, or if its loop never executed.
jprint(data_obj)

{
  "title": "Software Engineer II - C/C++/OS",
  "company": "Microsoft",
  "company_link": "https://www.linkedin.com/company/1035/",
  "place": "Maine, United States Remote",
  "date": "2022-11-04",
  "apply_link": "https://careers.microsoft.com/us/en/job/1454432/Software-Engineer-II-C-C-OS?jobsource=linkedin&utm_source=linkedin&utm_medium=linkedin&utm_campaign=linkedin-feed",
  "post_link": "https://www.linkedin.com/jobs/view/3342681548/?eBP=JOB_SEARCH_ORGANIC&recommendedFlavor=IN_NETWORK&refId=Ww0RUXjRPh5ffnUwKmZ9VA%3D%3D&trackingId=vO8wOxIoXKrMhh4Ac9LkGQ%3D%3D&trk=flagship3_search_srp_jobs",
  "seniority_level": " Entry level",
  "employment_type": "Full-time ",
  "description": "Azure Edge + Platform brings together Edge platforms, devices, and services to deliver Edge solutions, operating systems, and engineering systems. Driven by its customers\u2019 needs, Azure Edge + Platform seeks to accelerate growth for Azure, E&D, and Microsoft\u2019s customers worldwide.\u202f\u202f\n\n\

In [40]:
# Ask the server how many objects the class now holds.
object_count_response = (
    client.query
    .aggregate(class_name)
    .with_meta_count()
    .do()
)
jprint(object_count_response)

{
  "data": {
    "Aggregate": {
      "All_nov_jobs": [
        {
          "meta": {
            "count": 16838
          }
        }
      ]
    }
  }
}


In [41]:
# Semantic search: use a candidate profile as the near-text concept and fetch
# the 5 closest postings, together with each match's distance and object id.
results = (
    client.query
    .get(class_name, ["title", "place", "description"])
    .with_near_text(
        {"concepts": ["Generative AI Engineer with 2+ years of experience in ML, NLP, and software development. Proficient in fine-tuning LLMs, developing RAG systems, and prompt engineering. Skilled in Python, SQL, React, vector databases, and ML frameworks like TensorFlow and PyTorch. Strong background in full-stack development and MLOps. Adept at integrating ML/LLM models into web apps and optimizing pipelines for performance and efficiency."]}
    )
    .with_additional(["distance", "id"])
    .with_limit(5)
    .do()
)
jprint(results)

{
  "data": {
    "Get": {
      "All_nov_jobs": [
        {
          "_additional": {
            "distance": 0.135472,
            "id": "e1602038-a232-4267-aaa5-4158fa3e4bef"
          },
          "description": "Description\n\n\n\n\nDevelopment and maintenance of software applications in the field of Natural Language Processing (NLP), Machine Learning (ML) and/or Artificial Intelligence (AI).\nTraining of custom machine learning / deep learning models based on structured and unstructured data.\nSelecting features, building and optimizing classifiers using machine learning techniques.\nFollow studies and developments aiming at improving the quality of machine translation (MT) engines for each installed language pair.\nInteract with data stewards and other IT stakeholders to define the data rules.\nCreating automated anomaly detection systems and constant tracking of its performance.\nData mining using state-of-the-art methods.\nProcessing, cleansing, and verifying the integrity of