### 1.Load the cleaned_eng_subtitle.csv file

In [1]:
# ! pip install sentence-transformers

In [2]:
# Importing the required libraries
import pandas as pd
import numpy as np

# Loading the dataset
data = pd.read_csv("Dataset/cleaned_eng_subtitle.csv")
data.head()

Unnamed: 0,Subtitle_Id,Subtitle_Name,Subtitle_Content
0,9450386,person of interest s03 e19 most likely to (2014),script info title default file scripttype vwra...
1,9376836,bang s02 e05 episode 2 5 (2020),watch any video online with opensubtitles free...
2,9511930,criminal minds s03 e06 about face (2007),ducks quacking advertise your product or brand...
3,9381135,the repair shop s07 e18 silver salt and pepper...,welcome to the repair shop where precious but ...
4,9450349,person of interest s03 e11 lethe (2013),script info title default file scripttype vwra...


In [11]:
## Use the sample function to take a random subset of 1k rows
# data = data.sample(n=1000, random_state=42)  # Set random_state for reproducibility
# data = data.reset_index(drop=True)  # Reset index
# data.head()

### Step 2: Generating Text Vectors of Subtitle Content


#### a.Generate BOW / TFIDF sparse vector representations

In [4]:
# Import the required libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Generate BOW / TFIDF sparse vector representations
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(data['Subtitle_Content'])

# Display the shape of the TF-IDF matrix
# print("TF-IDF matrix shape:", tfidf_vectors.shape)

#### b. BERT based "SentenceTransformers" to generate embeddings

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Step 2: Generate BERT-based embeddings
sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_embeddings = np.array([sentence_model.encode(subtitle) for subtitle in data['Subtitle_Content']])

# Display the shape of the BERT embeddings
# print("BERT embeddings shape:", bert_embeddings.shape)


#### Part 2: Retrieving Documents

1. Take the user's search query.
2. Preprocess the query (if required).
3. Create query embedding.
4. Using cosine distance, calculate the similarity score between embeddings of documents and user search query embedding.
5. These cosine similarity scores will help in returning the most relevant candidate documents as per the user’s search query.


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Take the user's search query.
user_query = input("Please Enter Movies and TV Series Name: ")
#user_query = "crime investigation thriller"

# 2. Preprocess the query (if required).
def preprocess_query(query):
    # Perform necessary preprocessing steps
    # Remove timestamps, HTML tags, special characters, etc.
    # Convert text to lowercase
    clean_text = query.lower()
    return clean_text

# 3. Create query embedding.
preprocessed_query = preprocess_query(user_query)

# 4. Using cosine distance, calculate the similarity score between embeddings of documents and user search query embedding.
user_query_embedding = sentence_model.encode(preprocessed_query)

# 5. These cosine similarity scores will help in returning the most relevant candidate documents as per the user’s search query.
def retrieve_documents(user_query_embedding, document_embeddings, subtitle_data, top_n=10):
    # Calculate cosine similarity between user query embedding and document embeddings
    similarity_scores = cosine_similarity(user_query_embedding.reshape(1, -1), document_embeddings).flatten()
    
    # Get indices of top n most similar documents
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    
    # Retrieve top n documents
    top_documents = subtitle_data.iloc[top_indices]
    
    return top_documents, similarity_scores[top_indices]

# Example usage:
top_documents, similarity_scores = retrieve_documents(user_query_embedding, bert_embeddings, data, top_n=10)
print("Top Documents:")
for index, row in top_documents.iterrows():
    print(f"Subtitle Name: {row['Subtitle_Name']}")
    print(f"Subtitle Content: {row['Subtitle_Content']}")
    print("------")
print("Similarity Scores:")
print(similarity_scores)

Top Documents:
Subtitle Name: tokyo ghoul re s02 e06 face effulgence (2018)
Subtitle Content: script info title horriblesubs scripttype vwrapstyle playresx playresy scaledborderandshadow yes vstyles format name fontname fontsize primarycolour secondarycolour outlinecolour backcolour bold italic underline strikeout scalex scaley spacing angle borderstyle outline shadow alignment marginl marginr marginv encoding style defaultopen sans semiboldhffffffhffhhstyle mainopen sans semiboldhffffffhffhhstyle italicsopen sans semiboldhffffffhffhhstyle flashbackopen sans semiboldhffffffhffhhstyle flashbackitalicsopen sans semiboldhffffffhffhhstyle signthosewhohuntopen sans semiboldhffffffhffhhstyle character names rightopen sans semiboldhffffffhffhdhstyle signtokyoghoulreopen sans semiboldhffffffhffhhstyle signbrillianceopen sans semiboldhddhffhhstyle signhiroshimastyleopen sans semiboldhchffhedeehstyle signgrilledsquidopen sans semiboldhahffhaddhstyle signthwardopen sans semiboldhffffffhffhbdhstyl

#### 4. Document Chunker

In [7]:
# Step 3: Document Chunker
def document_chunker(document, chunk_size=500, overlap_size=100):
    chunks = []
    words = document.split()
    for i in range(0, len(words), chunk_size - overlap_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

#### 5. Store embeddings in a ChromaDB database

In [8]:
from chromadb import PersistentClient
from chromadb.utils import embedding_functions

# Initialize lists for documents, metadatas, and ids
subtitle_content = data['Subtitle_Content'].tolist() 
subtitle_name = data[["Subtitle_Id","Subtitle_Name"]].to_dict(orient='records')
subtitle_id = data["Subtitle_Id"].astype(str).tolist() 

# Initialize ChromaDB Client and Set Up Embedding Function
chroma_client = PersistentClient(path="vectordb")
model_name = "all-mpnet-base-v2"
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name
)

# Creating ChromaDB Collection
collection_name = "eng_subtitles_collection"
try:
    collection = chroma_client.get_collection(name=collection_name, embedding_function=sentence_transformer_ef)
except ValueError:
    print(f"Collection '{collection_name}' does not exist. Creating a new collection.")
    collection = chroma_client.create_collection(name=collection_name, embedding_function=sentence_transformer_ef)


# Adding Documents, Metadata, and IDs to ChromaDB Collection
print("Adding documents to the collection...")
# Add documents, metadata, IDs, and embeddings in a single operation
collection.add(
    documents=subtitle_content,
    embeddings=bert_embeddings,
    metadatas=subtitle_name,  # Pass the metadata dictionary
    ids=subtitle_id
)

print("Finished adding documents to the collection.")

Collection 'eng_subtitles_collection' does not exist. Creating a new collection.
Adding documents to the collection...
Finished adding documents to the collection.


#### Part 2: Retrieving Documents

1. Take the user's search query.
2. Preprocess the query (if required).
3. Create query embedding.
4. Using cosine distance, calculate the similarity score between embeddings of documents and user search query embedding.
5. These cosine similarity scores will help in returning the most relevant candidate documents as per the user’s search query.


In [10]:
# 1. Take the user's search query.
query_text = input("Please enter your search query: ")

# 2. Preprocess the query (if required).
def preprocess_query(query):
    # Perform necessary preprocessing steps
    # Remove timestamps, HTML tags, special characters, etc.
    # Convert text to lowercase
    clean_text = query.lower()
    return clean_text

preprocessed_query = preprocess_query(query_text)

# 3. Create query embedding
query_embedding = sentence_model.encode([preprocessed_query])

# 4. Using cosine distance, calculate the similarity score between embeddings of documents and user search query embedding
results = collection.query(
    query_embeddings=query_embedding,
    n_results=10,  # Specify the number of results you want to retrieve
    include=['documents', 'distances', 'metadatas']  # Remove 'ids'
)

# 6. Display the results
print("Top Ten Movies and TV Series Subtitles:")
for j in range(len(results['ids'][0])):
    movie_id = results["ids"][0][j]
    distance = results['distances'][0][j]
    metadata = results['metadatas'][0][j]
        
    print(f"Movie ID (Subtitle_Id): {movie_id}")
    print(f"Similarity Score (Distance): {distance:.3f}")
    print(f"Movie Name (Subtitle_Name): {metadata['Subtitle_Name']}")


Top Ten Movies and TV Series Subtitles:
Movie ID (Subtitle_Id): 9392463
Similarity Score (Distance): 333.100
Movie Name (Subtitle_Name): the makanai cooking for the maiko house s01 e01 change (2023)
Movie ID (Subtitle_Id): 9276852
Similarity Score (Distance): 345.396
Movie Name (Subtitle_Name): welcome to demonschool irumakun s03 e02 master bachiko ()
Movie ID (Subtitle_Id): 9322103
Similarity Score (Distance): 345.396
Movie Name (Subtitle_Name): welcome to demonschool irumakun s03 e07 claras toy boxa night filled with screams (2022)
Movie ID (Subtitle_Id): 9231906
Similarity Score (Distance): 345.453
Movie Name (Subtitle_Name): overlord s04 e10 last king ()
Movie ID (Subtitle_Id): 9183718
Similarity Score (Distance): 347.394
Movie Name (Subtitle_Name): orient s02 e03 equals (2022)
Movie ID (Subtitle_Id): 9313008
Similarity Score (Distance): 348.177
Movie Name (Subtitle_Name): free s01 e06 shogeki no noburijingu (2013)
Movie ID (Subtitle_Id): 9508089
Similarity Score (Distance): 348.21