In [1]:
!pip install -qU datasets pandas pymongo sentence-transformers transformers
# Install below if using GPU
!pip install -qU accelerate

In [13]:
# Load Dataset
from datasets import load_dataset
import pandas as pd

# Dataset card: https://huggingface.co/datasets/AIatMongoDB/embedded_movies
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Materialise the "train" split as a pandas dataframe
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Report how many records were loaded
record_count = len(dataset_df)
print(f"Number of records in dataframe: {record_count}")

Number of records in dataframe: 1500


In [14]:
# Data Preparation

# Remove data points whose "fullplot" column is missing
dataset_df = dataset_df.dropna(subset=["fullplot"])
print(f"Number of records after dropping empty plots: {len(dataset_df)}")

# Drop the precomputed plot_embedding column: new embeddings are generated
# from "fullplot" with the sentence-transformers model "thenlper/gte-large".
# errors="ignore" keeps this cell re-runnable once the column is already gone.
dataset_df = dataset_df.drop(columns=["plot_embedding"], errors="ignore")
dataset_df.head(1)

Number of records after dropping empty plots: 1452


Unnamed: 0,rated,fullplot,imdb,metacritic,num_mflix_comments,type,plot,runtime,countries,title,writers,awards,languages,cast,genres,poster,directors
0,,Young Pauline is left a lot of money when her ...,"{'id': 4465, 'rating': 7.6, 'votes': 744}",,0,movie,Young Pauline is left a lot of money when her ...,199.0,[USA],The Perils of Pauline,"[Charles W. Goddard (screenplay), Basil Dickey...","{'nominations': 0, 'text': '1 win.', 'wins': 1}",[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",[Action],https://m.media-amazon.com/images/M/MV5BMzgxOD...,"[Louis J. Gasnier, Donald MacKenzie]"


In [15]:
from sentence_transformers import SentenceTransformer

# Model card: https://huggingface.co/thenlper/gte-large
EMBEDDING_MODEL_NAME = "thenlper/gte-large"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the module-level ``embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The embedding converted to a plain Python list, or an empty list when
        ``text`` is not a string (e.g. ``None`` or a pandas NaN) or contains
        only whitespace.
    """
    # Missing values (None / float NaN) have no .strip(); treat them like
    # empty text instead of failing with AttributeError.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    # Convert to a plain list: the value is stored in a dataframe column and
    # also used as the "queryVector" of the search pipeline.
    return embedding.tolist()


# Compute one embedding per record from its "fullplot" text, then store the
# resulting vectors in a new "embedding" column.
plot_embeddings = dataset_df["fullplot"].apply(get_embedding)
dataset_df["embedding"] = plot_embeddings

dataset_df.head(1)

Unnamed: 0,rated,fullplot,imdb,metacritic,num_mflix_comments,type,plot,runtime,countries,title,writers,awards,languages,cast,genres,poster,directors,embedding
0,,Young Pauline is left a lot of money when her ...,"{'id': 4465, 'rating': 7.6, 'votes': 744}",,0,movie,Young Pauline is left a lot of money when her ...,199.0,[USA],The Perils of Pauline,"[Charles W. Goddard (screenplay), Basil Dickey...","{'nominations': 0, 'text': '1 win.', 'wins': 1}",[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",[Action],https://m.media-amazon.com/images/M/MV5BMzgxOD...,"[Louis J. Gasnier, Donald MacKenzie]","[-0.009285839274525642, -0.005062091629952192,..."


In [20]:
import pymongo
import getpass


def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server is reachable.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: The client when the server answered a
        ping, or None when a ConnectionFailure was raised.
    """
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient() connects lazily, so constructing it does not prove the
        # server is reachable. A cheap "ping" forces a round trip so that
        # ConnectionFailure surfaces here rather than on the first real query.
        client.admin.command("ping")
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        return None


# Prompt for the connection string so it is never stored in the notebook
mongo_uri = getpass.getpass("MONGO_URI")
if not mongo_uri:
    # The value comes from the prompt above (not from the environment);
    # continuing without it cannot succeed, so stop with a clear error.
    raise ValueError("MONGO_URI was not provided")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client() returns None on connection failure; indexing None
    # below would otherwise fail with an unrelated-looking TypeError.
    raise ConnectionError("Could not connect to MongoDB with the supplied MONGO_URI")

# Ingest data into MongoDB
db = mongo_client["movies"]
collection = db["movie_collection"]

MONGO_URI········
Connection to MongoDB successful


In [21]:
# Start from an empty collection: an empty filter matches every document
delete_result = collection.delete_many({})
delete_result

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff00000000000000f6'), 'opTime': {'ts': Timestamp(1710265561, 249), 't': 246}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1710265561, 249), 'signature': {'hash': b'4\x9fw+\xa3\x01\xb4\x12?\x06~H\x1a\xd26\x96\x0b\xc6\xf2|', 'keyId': 7299545392000008318}}, 'operationTime': Timestamp(1710265561, 249)}, acknowledged=True)

In [22]:
# Turn every dataframe row into a dict, i.e. one MongoDB document per record
documents = dataset_df.to_dict(orient="records")

# Write all documents to the collection in a single bulk call
collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [23]:
def vector_search(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches to consider (default 150).
    limit (int): Maximum number of matches to return (default 4).

    Returns:
    list: A list of matching documents, each with "fullplot", "title",
    "genres" and "score". Empty when no embedding could be generated
    for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding() signals failure with an empty list (never None), and an
    # empty queryVector is not a usable search input. Return an empty list so
    # callers can still iterate over the result.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    # NOTE(review): assumes a vector search index named "vector_index" exists
    # on the "embedding" field of this collection — confirm before running.
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return the top `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "fullplot": 1,  # Include the full plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor into a list
    results = collection.aggregate(pipeline)
    return list(results)

In [24]:
def get_search_result(query, collection):
    """Run a vector search and reduce each hit to its title and plot text.

    Args:
        query (str): The user's query string.
        collection: The MongoDB collection to search.

    Returns:
        list[dict]: One ``{"title": ..., "text": ...}`` dict per hit, with
        "N/A" substituted for a missing "title" or "fullplot".
    """
    hits = vector_search(query, collection)

    return [
        {"title": hit.get("title", "N/A"), "text": hit.get("fullplot", "N/A")}
        for hit in hits
    ]

In [26]:
# Conduct query with retrieval of sources
query = "What is the best romantic movie to watch and why?"
source_information = get_search_result(query, collection)

# Show each retrieved source followed by a separator line
for source in source_information:
    print(source, "--------", sep="\n")

{'title': 'Shut Up and Kiss Me!', 'text': "Ryan and Pete are 27-year old best friends in Miami, born on the same day and each searching for the perfect woman. Ryan is a rookie stockbroker living with his psychic Mom. Pete is a slick surfer dude yet to find commitment. Each meets the women of their dreams on the same day. Ryan knocks heads in an elevator with the gorgeous Jessica, passing out before getting her number. Pete falls for the insatiable Tiara, but Tiara's uncle is mob boss Vincent Bublione, charged with her protection. This high-energy romantic comedy asks to what extent will you go for true love?"}
--------
{'title': 'Pearl Harbor', 'text': 'Pearl Harbor is a classic tale of romance set during a war that complicates everything. It all starts when childhood friends Rafe and Danny become Army Air Corps pilots and meet Evelyn, a Navy nurse. Rafe falls head over heels and next thing you know Evelyn and Rafe are hooking up. Then Rafe volunteers to go fight in Britain and Evelyn 

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the tokenizer and the language model
model_id = "CohereForAI/c4ai-command-r-v01"

# trust_remote_code=True allows the tokenizer/model code shipped in the model
# repository to be downloaded and executed.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# NOTE: in the recorded run the weights arrive as 15 shards of roughly 4.9 GB
# each and the download stopped with "No space left on device" — make sure
# enough disk space is available before running this cell.
# CPU Enabled uncomment below 👇🏽
# model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
# GPU Enabled use below 👇🏽
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/7.92k [00:00<?, ?B/s]

tokenization_cohere_fast.py:   0%|          | 0.00/43.7k [00:00<?, ?B/s]

configuration_cohere.py:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/CohereForAI/c4ai-command-r-v01:
- configuration_cohere.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/CohereForAI/c4ai-command-r-v01:
- tokenization_cohere_fast.py
- configuration_cohere.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.json:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

modeling_cohere.py:   0%|          | 0.00/60.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/CohereForAI/c4ai-command-r-v01:
- modeling_cohere.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/26.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00007-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00008-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00009-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00010-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]



model-00011-of-00015.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
# define conversation input:
conversation = [
    {"role": "user", "content": "What is the best romantic movie to watch and why?"}
]

# Render the grounded-generation prompt from the conversation and the
# retrieved documents. With tokenize=True and return_tensors="pt" the result
# is tokenized PyTorch input rather than a prompt string.
grounded_generation_prompt = tokenizer.apply_grounded_generation_template(
    conversation,
    documents=source_information,
    citation_mode="fast",
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)  # follow the model's device instead of hard-coding "cuda"

In [None]:
# The grounded prompt is already tokenized, so it is passed to generate()
# directly after making sure it lives on the same device as the model.
# NOTE(review): assumes apply_grounded_generation_template returned a tensor
# of input ids (tokenize=True, return_tensors="pt") — confirm.
input_ids = grounded_generation_prompt.to(model.device)
response = model.generate(input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))