In [1]:
from pymongo import MongoClient
from datetime import datetime
import random
import string
from sent2vec.vectorizer import Vectorizer

In [2]:
def search_embedding(search_term):
    """
    Turns a search term into its Sent2Vec embedding.

    A new Vectorizer is constructed on every call.

    :param search_term: Text to embed.
    :return: The embedding of the search term, converted with ``tolist()``.
    """
    encoder = Vectorizer()
    encoder.run([search_term])  # submit the term as a one-sentence batch

    # Only one sentence was submitted, so its vector sits at position 0.
    embedding = encoder.vectors[0]
    return embedding.tolist()

In [3]:
def generate_random_string(length):
    """
    Builds a random string of lowercase ASCII letters.

    :param length: Number of characters to generate.
    :return: A string of ``length`` randomly chosen lowercase letters.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)

def generate_sample_data(index):
    """
    Builds one sample document for the search collection.

    :param index: Integer used to pick the record's title. Values beyond the
        end of the built-in title list wrap around, so any integer is accepted.
    :return: A dict with the keys "title", "content", "added", "url",
        "vector" and "relevance".
    """
    titles = ["Python", "Mongo", "Web Development", "Machine Learning", "Data Science", "Test"]
    # Wrap around instead of indexing directly: asking for more records than
    # there are titles previously raised IndexError.
    title = titles[index % len(titles)]
    content = " ".join(["Lorem ipsum"] * random.randint(5, 10))

    return {
        "title": title,
        "content": content,
        "added": datetime.utcnow(),  # naive datetime holding the current UTC time
        "url": f"http://{generate_random_string(8)}.com",
        "vector": search_embedding(title),  # embedding of the title only, not the content
        "relevance": random.uniform(0, 1)
    }

# Connection string used when the caller does not supply one (server at 127.0.0.1:27017).
_DEFAULT_MONGO_URI = (
    "mongodb://127.0.0.1:27017/"
    "?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.0.2"
)


def insert_sample_data(database_name, collection_name, num_records, mongo_uri=_DEFAULT_MONGO_URI):
    """
    Generates sample documents and inserts them into a MongoDB collection.

    All documents are generated before the connection is opened, then written
    with a single ``insert_many`` call. If generation fails, nothing is
    inserted.

    :param database_name: Name of the target database.
    :param collection_name: Name of the target collection.
    :param num_records: Number of documents to generate (indices
        ``0 .. num_records - 1`` are passed to ``generate_sample_data``).
        Zero or a negative value inserts nothing.
    :param mongo_uri: MongoDB connection string; defaults to the local server.
    :return: None
    """
    documents = [generate_sample_data(i) for i in range(num_records)]

    # insert_many rejects an empty document list, so skip the round trip entirely.
    if not documents:
        return

    # The context manager closes the client (and its connection pool) even if
    # the insert raises; previously the client was never closed.
    with MongoClient(mongo_uri) as client:
        client[database_name][collection_name].insert_many(documents)

In [4]:
# Target MongoDB database and collection for the sample documents
database_name, collection_name = "searchDatabase", "Website"

# How many sample records to generate and insert
num_records = 6

# Populate the collection, then report what was requested
insert_sample_data(database_name, collection_name, num_records)
print(f"{num_records} sample records inserted into the '{collection_name}' collection.")

Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
6 sample records inserted into the 'Website' collection.
