In [1]:
import weaviate
import os
import pandas as pd
import numpy as np

In [2]:
# Both settings are read from the environment so no secret is stored in the
# notebook; a missing variable raises KeyError right here.
api_key = os.environ["OPENAI_API_KEY"]
weaviate_url = os.environ["WEAVIATE_URL"]

# The OpenAI key is handed to the client as an extra header alongside the
# instance URL (e.g. "http://localhost:8080/" or a hosted endpoint).
client = weaviate.Client(
    url=weaviate_url,
    additional_headers={"X-OpenAI-API-Key": api_key},
)

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [3]:
# client.schema.delete_all()  # To wipe your instance clean

In [4]:
def jprint(json_in):
    """Pretty-print a JSON-serializable object to stdout.

    Args:
        json_in: Any object ``json.dumps`` can serialize (dict, list, str, ...).

    The text is indented by two spaces per nesting level.
    """
    import json

    formatted = json.dumps(json_in, indent=2)
    print(formatted)

# Print the metadata returned by the instance; also serves as a quick check
# that the client can reach it.
jprint(client.get_meta())

{
  "hostname": "http://[::]:8080",
  "modules": {
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "name": "OpenAI Module"
    }
  },
  "version": "1.23.0"
}


In [5]:
# Print the current schema before anything is created in this notebook.
jprint(client.schema.get())

{
  "classes": []
}


In [6]:
def dataloader(jobsdata_path="../data/all_nov_jobs.csv"):
    """Yield one job posting per CSV row as a plain dict.

    Args:
        jobsdata_path: Path of the jobs CSV to read. Defaults to the dataset
            location this notebook has always used, so ``dataloader()`` with
            no arguments behaves as before.

    Yields:
        dict: The posting's fields keyed by property name. Missing (NaN) and
        infinite values are replaced by the string "None".

    Raises:
        KeyError: If one of the expected columns is absent from the CSV.
    """
    # Property name -> CSV column name. Only "employment_type" differs: the
    # column is read under the spelling "employmnet_type".
    # NOTE(review): presumably that typo is in the CSV header itself - verify
    # against the file before "fixing" it.
    column_for_property = {
        "title": "title",
        "company": "company",
        "company_link": "company_link",
        "place": "place",
        "date": "date",
        "apply_link": "apply_link",
        "post_link": "post_link",
        "seniority_level": "seniority_level",
        "employment_type": "employmnet_type",
        "description": "description",
        "job_title_id": "job_title_id",
        "job_desc_id": "job_desc_id",
    }

    df = pd.read_csv(jobsdata_path)
    df = df.fillna("None")  # Replace NaN values with "None"
    # Replace infinite values with "None"; reassign instead of mutating in place.
    df = df.replace([np.inf, -np.inf], "None")

    for _, row in df.iterrows():
        yield {prop: row[column] for prop, column in column_for_property.items()}

In [7]:
# Name of the class that holds the job postings. The schema and query outputs
# recorded in this notebook report it with a capitalized first letter
# ("All_nov_jobs").
class_name = "all_nov_jobs"

In [8]:
# Minimal class definition: only the class name and the vectorizer module are
# given; no properties are declared here.
class_obj = {
    "class": class_name,
    "vectorizer": "text2vec-openai",
}

# NOTE(review): presumably this fails if the class already exists, so a re-run
# may need the class removed first - confirm against the client docs.
client.schema.create_class(class_obj)

In [9]:
# Print the schema again to inspect the class definition as stored by the
# instance.
jprint(client.schema.get())

{
  "classes": [
    {
      "class": "All_nov_jobs",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-openai": {
          "baseURL": "https://api.openai.com",
          "model": "ada",
          "modelVersion": "002",
          "type": "text",
          "vectorizeClassName": true
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [],
      "replicationConfig": {
        "factor": 1
      },
      "shardingConfig": {
        "virtualPerPhysical": 128,
        "desiredCount": 1,
        "actualCount": 1,
        "desiredVirtualCount": 128,
        "actualVirtualCount": 128,
        "key": "_id",
        "strategy": "hash",
        "function": "murmur3"
      },
      "vectorIndexConf

In [10]:
# Import every posting through the client's batch context manager, which takes
# care of flushing the batch.
with client.batch as batch:
    batch.batch_size = 50
    batch.dynamic = True
    # data_obj stays bound to the last yielded posting after the loop ends.
    for data_obj in dataloader():
        batch.add_data_object(data_obj, class_name)

In [11]:
# data_obj is the loop variable left bound by the `for data_obj in dataloader()`
# import loop, i.e. the last object it yielded; this cell depends on that loop
# having run.
jprint(data_obj)

{
  "title": "Software Engineer II - C/C++/OS",
  "company": "Microsoft",
  "company_link": "https://www.linkedin.com/company/1035/",
  "place": "Maine, United States Remote",
  "date": "2022-11-04",
  "apply_link": "https://careers.microsoft.com/us/en/job/1454432/Software-Engineer-II-C-C-OS?jobsource=linkedin&utm_source=linkedin&utm_medium=linkedin&utm_campaign=linkedin-feed",
  "post_link": "https://www.linkedin.com/jobs/view/3342681548/?eBP=JOB_SEARCH_ORGANIC&recommendedFlavor=IN_NETWORK&refId=Ww0RUXjRPh5ffnUwKmZ9VA%3D%3D&trackingId=vO8wOxIoXKrMhh4Ac9LkGQ%3D%3D&trk=flagship3_search_srp_jobs",
  "seniority_level": " Entry level",
  "employment_type": "Full-time ",
  "description": "Azure Edge + Platform brings together Edge platforms, devices, and services to deliver Edge solutions, operating systems, and engineering systems. Driven by its customers\u2019 needs, Azure Edge + Platform seeks to accelerate growth for Azure, E&D, and Microsoft\u2019s customers worldwide.\u202f\u202f\n\n\

In [12]:
# Ask for the number of objects stored in the class to check the import.
jprint(client.query.aggregate(class_name).with_meta_count().do())

{
  "data": {
    "Aggregate": {
      "All_nov_jobs": [
        {
          "meta": {
            "count": 16838
          }
        }
      ]
    }
  }
}


In [13]:
# Near-text search for "software engineer": fetch three properties plus the
# distance and id of each hit, limited to five results.
results = (
    client.query
    .get(class_name, ["title", "place", "description"])
    .with_near_text({"concepts": ["software engineer"]})
    .with_additional(["distance", "id"])
    .with_limit(5)
    .do()
)
jprint(results)

{
  "data": {
    "Get": {
      "All_nov_jobs": [
        {
          "_additional": {
            "distance": 0.122837186,
            "id": "8bb6d017-cd18-425c-8e6f-26713ce9c9dd"
          },
          "description": "A Software Engineer is a highly skilled software developer with expert knowledge in at least one department or system. They work with a high degree of independence and develop information systems to support the business. Engineers are primarily focused on the design or integration of various software systems, databases, and third-party packages.\n\nEssential Job Functions\n\nGenerally, takes leadership role in guiding projects through the complete software development life cycle\nSolid understanding of secure coding techniques and best practices\nResponsible for multiple software applications\nManage tasks and resources while utilizing modern project management principles\nParticipate or lead the development of multiple projects at one time\nConducts code reviews and e