# $$ Semantic \ Search \ Engine $$

## Importing Libraries and Modules

In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from chromadb import PersistentClient
from chromadb.utils import embedding_functions
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

## Loading English Subtitles Dataset from CSV File

In [3]:
PATH = r"/kaggle/input/eng-subtitle/clean_eng_subtitles_csv.csv"
data = pd.read_csv(PATH) 

In [4]:
data = data.drop('Unnamed: 0',axis=1)
data['file_content'] = data['file_content'].str.lower()
data['name'] = data['name'].str.lower()
data['name'] = data['name'].str.replace('eng 1cd', '').str.strip()
ids = data['num'].astype(str).tolist()

## Initializing ChromaDB Client and Setting Up Embedding Function

In [9]:
chroma_client = PersistentClient(path="my_chromadb")

# Specify the model name
model_name = "all-mpnet-base-v2"

# Create an embedding function for ChromaDB using the model name
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Creating ChromaDB Collection

In [10]:
# Create or get the collection, aka vector database
# Specify the name for the collection and the embedding function to use
collection_name = "subtitles_collection"
try:
    collection = chroma_client.get_collection(name=collection_name, embedding_function=sentence_transformer_ef)
except ValueError:
    print(f"Collection '{collection_name}' does not exist. Creating a new collection.")
    collection = chroma_client.create_collection(name=collection_name, embedding_function=sentence_transformer_ef)

Collection 'subtitles_collection' does not exist. Creating a new collection.


## Chunking Documents

In [13]:
token_size = 500 
overlap_size = 50 
documents = []
metadatas = []


def chunk_document(document, metadata, doc_id, token_size, overlap_size):
    tokens = word_tokenize(document)
    start = 0
    
    while start < len(tokens):
        end = min(start + token_size, len(tokens))
        chunk = ' '.join(tokens[start:end])
        
        documents.append(chunk)
        metadatas.append(metadata)
        
        start = end - overlap_size

for i, row in data.iterrows():
    document = row['file_content']
    metadata = {'num': row['num'], 'name': row['name']}
    
    chunk_document(document, metadata, str(row['num']), token_size, overlap_size)

## Adding Documents, Metadata and ids to ChromaDB Collection

In [16]:
print("Adding documents to the collection...")
with tqdm(total=len(documents), desc="Progress", unit="document", ncols=100) as pbar:
    # Add documents, metadata, IDs, and embeddings in a single operation
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    pbar.update(len(documents))

print("Finished adding documents to the collection.")

Adding documents to the collection...


Progress:   0%|                                                     | 0/24749 [00:00<?, ?document/s]

Batches:   0%|          | 0/774 [00:00<?, ?it/s]

Progress: 100%|███████████████████████████████████████| 24749/24749 [4:00:12<00:00,  1.72document/s]

Finished adding documents to the collection.





## Search Function to Query ChromaDB Collection and retrieving Results

In [24]:
def search(query, collection):
    # Query the collection using the provided query text
    results = collection.query(
        query_texts=[query],
        n_results=10,  # Specify the number of results you want to retrieve
        include=['documents', 'distances', 'metadatas','ids']
    )

    # Iterate through the results and print the movie ID (num), similarity score, and movie name
    for j in range(len(results['ids'][0])):
        movie_id = results["ids"][0][j]
        distance = results['distances'][0][j]
        metadata = results['metadatas'][0][j]
        
        print(f"Movie ID (num): {movie_id}")
        print(f"Similarity Score (distance): {distance:.3f}")
        print(f"Movie Name: {metadata['name']}")

In [25]:
query_text = input("Please enter your search query: ")
search(query_text, collection)

Please enter your search query:  comedy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Movie ID (num): 9253374
Similarity Score (distance): 0.878
Movie Name: pioneers of television s04 e01 standup to sitcom (2014)
Movie ID (num): 9207796
Similarity Score (distance): 0.946
Movie Name: south park the 25th anniversary concert (2022)
Movie ID (num): 9309885
Similarity Score (distance): 1.020
Movie Name: rifftrax the dark power (2015)
Movie ID (num): 9207793
Similarity Score (distance): 1.049
Movie Name: south park the 25th anniversary concert (2022)
Movie ID (num): 9416111
Similarity Score (distance): 1.062
Movie Name: alf tales s01 e09 the princess and the pea (1988)
Movie ID (num): 9298557
Similarity Score (distance): 1.062
Movie Name: the comedy man (1964)
Movie ID (num): 9219912
Similarity Score (distance): 1.064
Movie Name: beavis and butthead s01 e05 nice buttheadhome aide (2022)
Movie ID (num): 9184389
Similarity Score (distance): 1.065
Movie Name: impractical jokers s09 e22 chris jericho (2022)
Movie ID (num): 9209329
Similarity Score (distance): 1.073
Movie Name: th

In [26]:
query_text = input("Please enter your search query: ")
search(query_text, collection)

Please enter your search query:  romantic

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Movie ID (num): 9255305
Similarity Score (distance): 1.134
Movie Name: a love song (2022)
Movie ID (num): 9456842
Similarity Score (distance): 1.136
Movie Name: miraculous tales of ladybug cat noir s01 e10 dark cupid (2015)
Movie ID (num): 9370218
Similarity Score (distance): 1.160
Movie Name: the mindy project s01 e02 hiring and firing (2012)
Movie ID (num): 9327955
Similarity Score (distance): 1.179
Movie Name: coolie no 1 (2020)
Movie ID (num): 9192986
Similarity Score (distance): 1.180
Movie Name: notater om kaerligheden (1989)
Movie ID (num): 9472673
Similarity Score (distance): 1.181
Movie Name: agir romantik (2020)
Movie ID (num): 9289398
Similarity Score (distance): 1.198
Movie Name: romantic killer s01 e01 why is there so much legalese in magic (2022)
Movie ID (num): 9193134
Similarity Score (distance): 1.207
Movie Name: the best of everything (1959)
Movie ID (num): 9477598
Similarity Sc