In [None]:
!pip install -U weaviate-client

In [None]:
!pip install Spacy

In [None]:
import sys
!{sys.executable} -m spacy download en

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')


In [None]:
!pip install faker
!pip install nltk
!pip install gensim

In [2]:
import weaviate
import weaviate.classes as wvc
import requests
import json


In [3]:
client = weaviate.connect_to_wcs(
    cluster_url="",
    auth_credentials=weaviate.auth.AuthApiKey("")
    
)


In [4]:
client.is_ready()

True

In [14]:
import weaviate.classes.config as wc
from weaviate.classes.config import Configure, Property, DataType
client.collections.create(
    name="DocumentSearch",
    properties=[
        wc.Property(name="DocText", data_type=wc.DataType.TEXT),
        
    ],
    # Define the vectorizer module (none, as we will add our own vectors)
    vectorizer_config=wc.Configure.Vectorizer.none(),
    #vector_index_config=Configure.VectorIndex.hnsw(),
)

<weaviate.collections.collection.Collection at 0x263a5344c90>

In [6]:
documents = [
    "SQLServer Oracle PowerBI",
    "SQLServer PowerBI Tableau",
    "Tableau Qlikview",
    "Qlikview SnowFlake",
    "Databricks Azure Data Factory"
     ]
DocumentSearch = client.collections.get("DocumentSearch")
with DocumentSearch.batch.dynamic() as batch:
    for i, doc in enumerate(documents):
        properties = {"DocText": doc}
        nlpdoc = nlp(doc)
        vector=list(nlpdoc.vector)
        batch.add_object(properties,vector=vector)

In [7]:
collection = client.collections.get("DocumentSearch")

for item in collection.iterator(
    include_vector=True  # If using named vectors, you can specify ones to include e.g. ['title', 'body'], or True to include all
):
    print(item.properties)
    print(item.vector)

{'docText': 'Qlikview SnowFlake'}
{'default': [-0.8692803978919983, -1.2522401809692383, 0.5525250434875488, 0.9107866287231445, -0.050938963890075684, 0.14994093775749207, 0.43748974800109863, 0.818988025188446, -0.16257141530513763, 0.09473907947540283, 1.772829294204712, 0.9506538510322571, -1.2128441333770752, -0.8976781964302063, -0.9202388525009155, -0.1891864687204361, -0.3299974501132965, 0.17715154588222504, -0.25794848799705505, -0.17378157377243042, -0.6509526968002319, -0.6286700367927551, -0.9204490184783936, 0.02123124897480011, 0.32411131262779236, 0.5413473844528198, 0.8470343351364136, 1.116644024848938, 0.2638291120529175, 0.5469329357147217, -0.4800737202167511, -0.8007828593254089, -0.029460713267326355, 0.5881748199462891, -1.0660278797149658, 0.1396113932132721, 0.17329102754592896, 0.7320780158042908, -0.3590658903121948, -0.70733642578125, -1.0256309509277344, 0.9948383569717407, -0.8657605648040771, 0.6959241628646851, -0.4245043396949768, 0.1685459017753601, -

In [8]:
import weaviate.classes.query as wq
query_text = "Databricks"
doc = nlp(query_text)
query_vector= list(doc.vector)
# Get the collection
SkillSet= client.collections.get("DocumentSearch")

# Perform query
response = SkillSet.query.near_vector(
    near_vector=query_vector,  # A list of floating point numbers
    limit=5,
    return_metadata=wq.MetadataQuery(distance=True),
)

# Inspect the response
for o in response.objects:
    print(
        o.properties["docText"] 
    )  
    print(
        f"Distance to query: {o.metadata.distance:.3f}\n"
    )  # P

Qlikview SnowFlake
Distance to query: 0.476

Tableau Qlikview
Distance to query: 0.547

Databricks Azure Data Factory
Distance to query: 0.582

SQLServer Oracle PowerBI
Distance to query: 0.653

SQLServer PowerBI Tableau
Distance to query: 0.683



In [9]:
import weaviate
from weaviate.classes.query import Filter
from weaviate.collections import Collection
from typing import List

# Instantiate your client (not shown). e.g.:
# client = weaviate.connect_to_wcs(...) or
# client = weaviate.connect_to_local()

collection = client.collections.get("DocumentSearch")

# Get property names
property_names = list()
for p in collection.config.get().properties:
    property_names.append(p.name)

query_strings = ["DataBricks:"]


def filter_demo(collection: Collection, property_names: List[str], query_strings: List[str]):
    from weaviate.classes.query import Filter

    for query_string in query_strings:
        print("\n" + "=" * 40 + f"\nHits for: '{query_string}'" + "\n" + "=" * 40)
        for property_name in property_names:
            response = collection.query.fetch_objects(
                filters=Filter.by_property(property_name).equal(query_string),
            )
            if len(response.objects) > 0:
                print(f">> '{property_name}' matches")
                for obj in response.objects:
                    print(obj.properties[property_name])


filter_demo(collection, property_names, query_strings)


Hits for: 'DataBricks:'
>> 'docText' matches
Databricks Azure Data Factory


In [10]:
import weaviate.classes.config as wc
from weaviate.classes.config import Configure, Property, DataType
client.collections.create(
    name="NameSearch",
    properties=[
        wc.Property(name="PersonName", data_type=wc.DataType.TEXT),
        
    ],
    # Define the vectorizer module (none, as we will add our own vectors)
    vectorizer_config=wc.Configure.Vectorizer.none(),
    vector_index_config=Configure.VectorIndex.hnsw(),
)

<weaviate.collections.collection.Collection at 0x263a1098090>

In [11]:
from faker import Faker  
import spacy
nlp = spacy.load('en_core_web_sm')
import time

t0 = time.time()

fake = Faker()  
DocumentSearch = client.collections.get("NameSearch")
with DocumentSearch.batch.dynamic() as batch:
    for i in range(0, 100000): 
        fakename =fake.name()
        if (i==1):
            print(fakename)  
        properties = {"personName": fakename}
        nlpdoc = nlp(fakename)
        vector=nlpdoc.vector
        batch.add_object(properties,vector=vector)
t1 = time.time()
total = t1-t0
print(total)  

Stacey Brady
575.4583711624146


In [12]:
import weaviate.classes.query as wq
query_text = "Stacey Brady"
import time
doc = nlp(query_text)
query_vector= list(doc.vector)
# Get the collection
SkillSet= client.collections.get("NameSearch")


t0 = time.time()
# Perform query
response = SkillSet.query.near_vector(
    near_vector=query_vector,  # A list of floating point numbers
    limit=5,
    return_metadata=wq.MetadataQuery(distance=True),
)

# Inspect the response
for o in response.objects:
    print(
        o.properties["personName"] 
    )  
    print(
        f"Distance to query: {o.metadata.distance:.3f}\n"
    )  # P
t1 = time.time()
total = t1-t0
print('Time Taken')
print(total)  

Stacey Brady
Distance to query: -0.000

Stacey Patterson
Distance to query: 0.050

Stacey Garcia
Distance to query: 0.056

Stacey Frank
Distance to query: 0.063

Stacey Nelson
Distance to query: 0.064

Time Taken
0.6067109107971191


In [13]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import FastText
nltk.download('punkt')
documents = [
    "SQLServer Oracle PowerBI",
    "SQLServer PowerBI Tableau",
    "Tableau Qlikview",
    "Qlikview SnowFlake",
    "Databricks Azure Data Factory"
     ]

df = pd.DataFrame(columns=['Skills'])
for i, doc in enumerate(documents):
    df.loc[len(df.index)] = doc

#display(df)
Sentences=[word_tokenize(skill.lower()) for skill in df.Skills]
print(Sentences)
model=FastText(Sentences,vector_size=128,window=5,min_count=1,workers=4,epochs=10,seed=42,sg=1)
model.save('Skills.ft')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['sqlserver', 'oracle', 'powerbi'], ['sqlserver', 'powerbi', 'tableau'], ['tableau', 'qlikview'], ['qlikview', 'snowflake'], ['databricks', 'azure', 'data', 'factory']]


In [15]:
ftext=model.wv
documents = [
    "SQLServer Oracle PowerBI",
    "SQLServer PowerBI Tableau",
    "Tableau Qlikview",
    "Qlikview SnowFlake",
    "Databricks Azure Data Factory"
     ]
DocumentSearch = client.collections.get("DocumentSearch")
with DocumentSearch.batch.dynamic() as batch:
    for i, doc in enumerate(documents):
        properties = {"DocText": doc}
        vector=list(ftext[doc])
        batch.add_object(properties,vector=vector)


In [16]:
collection = client.collections.get("DocumentSearch")

for item in collection.iterator(
    include_vector=True  # If using named vectors, you can specify ones to include e.g. ['title', 'body'], or True to include all
):
    print(item.properties)
    print(item.vector)

{'docText': 'SQLServer Oracle PowerBI'}
{'default': [0.00043863116297870874, 0.0003017366980202496, 0.00026605691527947783, -0.00014129586634226143, -0.00013196001236792654, 0.00037896199501119554, 0.0008747750543989241, 0.0004178764938842505, -0.0001820102334022522, 0.00031479416065849364, 0.0002034965145867318, -0.0004460813361220062, 0.00014140472922008485, 3.23128369927872e-05, 0.0006345723522827029, 0.0004042675136588514, -0.0001430584379704669, 0.0005137426196597517, 0.0007221846026368439, -0.0008656759164296091, -0.0004029560077469796, -0.00032190437195822597, 0.00016971862351056188, -0.0005283651407808065, -0.000613134412560612, -6.58315620967187e-05, 0.0002679212193470448, 0.0007777400896884501, -1.3988744285597932e-05, 0.00030850659823045135, -0.00019997403433080763, -6.0770296840928495e-05, -0.00019799279107246548, -0.00040843471651896834, 3.447963081271155e-06, -0.00019615291967056692, -0.0009281935053877532, 4.961047670803964e-05, -0.0003391017089597881, 0.0004347361682448

In [17]:
import weaviate.classes.query as wq
query_text = "Databricks"
query_vector= list(ftext[query_text])
# Get the collection
SkillSet= client.collections.get("DocumentSearch")

# Perform query
response = SkillSet.query.near_vector(
    near_vector=query_vector,  # A list of floating point numbers
    limit=5,
    return_metadata=wq.MetadataQuery(distance=True),
)

# Inspect the response
for o in response.objects:
    print(
        o.properties["docText"] 
    )  
    print(
        f"Distance to query: {o.metadata.distance:.3f}\n"
    )  # P

Databricks Azure Data Factory
Distance to query: 0.551

SQLServer PowerBI Tableau
Distance to query: 0.809

Qlikview SnowFlake
Distance to query: 0.942

SQLServer Oracle PowerBI
Distance to query: 0.947

Tableau Qlikview
Distance to query: 1.046

