<a href="https://colab.research.google.com/github/Fahad-Blog/Data-Science-Portfolio/blob/main/Semantic_Search_(MongoDB).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the raw sentiment dataset. The file uses ", " (comma + space) as its
# delimiter; a multi-character separator is only supported by the slower
# pure-Python parser, hence engine='python'.
# NOTE(review): the head() output below shows a leading quote on the first
# column name and a trailing quote on the last one, so quote characters are
# not being stripped by this parse - confirm against the raw file.
df = pd.read_csv(
    '/sentiment-analysis.csv',
    sep=', ',
    engine='python',
)

In [None]:
# Preview the first five rows exactly as parsed (column names still raw).
df.head(n=5)

Unnamed: 0,"""Text",Sentiment,Source,Date/Time,User ID,Location,"Confidence Score"""
0,"""""""I love this product!""""",Positive,Twitter,2023-06-15 09:23:14,@user123,New York,"0.85"""
1,"""""""The service was terrible.""""",Negative,Yelp Reviews,2023-06-15 11:45:32,user456,Los Angeles,"0.65"""
2,"""""""This movie is amazing!""""",Positive,IMDb,2023-06-15 14:10:22,moviefan789,London,"0.92"""
3,"""""""I'm so disappointed with their customer sup...",Negative,Online Forum,2023-06-15 17:35:11,forumuser1,Toronto,"0.78"""
4,"""""""Just had the best meal of my life!""""",Positive,TripAdvisor,2023-06-16 08:50:59,foodie22,Paris,"0.88"""


In [None]:
# Give the two columns whose names picked up a stray quote character during
# parsing clean, snake_case names. All other columns are left untouched.
column_renames = {
    '"Text': 'text',
    'Confidence Score"': 'confidence_score',
}
df = df.rename(columns=column_renames)

In [None]:
# Preview the first five rows again to confirm the renamed columns.
df.head(n=5)

Unnamed: 0,text,Sentiment,Source,Date/Time,User ID,Location,confidence_score
0,"""""""I love this product!""""",Positive,Twitter,2023-06-15 09:23:14,@user123,New York,"0.85"""
1,"""""""The service was terrible.""""",Negative,Yelp Reviews,2023-06-15 11:45:32,user456,Los Angeles,"0.65"""
2,"""""""This movie is amazing!""""",Positive,IMDb,2023-06-15 14:10:22,moviefan789,London,"0.92"""
3,"""""""I'm so disappointed with their customer sup...",Negative,Online Forum,2023-06-15 17:35:11,forumuser1,Toronto,"0.78"""
4,"""""""Just had the best meal of my life!""""",Positive,TripAdvisor,2023-06-16 08:50:59,foodie22,Paris,"0.88"""


In [None]:
import os
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
# --- CONFIGURATION ---
# The Atlas connection string is read from the MONGO_URI environment variable
# so the credential never has to be saved inside the notebook. When the
# variable is unset or empty, the original placeholder text is used instead;
# it is not a valid URI and only serves as a reminder to create a cluster and
# export its connection string.
MONGO_URI = (
    os.environ.get("MONGO_URI")
    or "Instruction : Get your MongoDB URI by creating a new cluster"
)
DB_NAME = "feedback_db"
COLLECTION_NAME = "reviews"

# 1. Connect to MongoDB
# Constructing MongoClient does not contact the server, so a bad host or bad
# credentials would otherwise go unnoticed until the first real query. The
# "ping" command forces one round trip, which makes the success message below
# truthful.
try:
    client = MongoClient(MONGO_URI)
    client.admin.command("ping")
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    print("‚úÖ Connected to MongoDB Atlas")
except Exception as e:
    print(f"‚ùå Connection failed: {e}")
    # Re-raise instead of calling exit(): the cell stops with a traceback and
    # nothing below runs against an undefined `collection`.
    raise


# 2. Embedding model
# 'all-MiniLM-L6-v2' is a small, free HuggingFace model that
# sentence-transformers downloads and then runs locally.
print("‚è≥ Loading AI Model (this happens once)...")
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# 3. Review records
# Turn the DataFrame loaded earlier into one plain dict per row, which is the
# shape needed for attaching an embedding and storing each row as a document.
sample_reviews_for_embedding = df.to_dict('records')

# 4. Generate Embeddings & Insert Data
print("üöÄ Generating Embeddings and Indexing Data...")
documents_to_insert = []

# Look at the collection first: when it already holds documents nothing gets
# inserted, so encoding every review again on each re-run would be wasted
# work. In that case documents_to_insert simply stays empty.
if collection.count_documents({}) == 0:
    review_texts = [doc['text'] for doc in sample_reviews_for_embedding]

    # One batched encode() call instead of one call per row. .tolist() turns
    # the result into plain Python lists of floats, which MongoDB can store.
    embeddings = model.encode(review_texts).tolist() if review_texts else []

    for doc, vector_embedding in zip(sample_reviews_for_embedding, embeddings):
        # Add the vector to the document
        doc['embedding'] = vector_embedding
        documents_to_insert.append(doc)

    # insert_many() rejects an empty list, so only call it when there is data.
    if documents_to_insert:
        collection.insert_many(documents_to_insert)
        print("‚úÖ Data inserted into MongoDB!")
else:
    print("‚ÑπÔ∏è Data already exists, skipping insertion.")


# --- THE SEARCH PHASE ---

def semantic_search(query, limit=2, num_candidates=200):
    """Run an Atlas ``$vectorSearch`` for ``query`` and print the best matches.

    The query text is embedded with the module-level ``model`` and compared
    against the ``embedding`` field of the documents in the module-level
    ``collection``. Each match is printed as ``[Score: x.xxxx] <text>``.

    Args:
        query: Free-text search phrase.
        limit: Maximum number of matches to print.
        num_candidates: Number of nearest neighbours Atlas examines before
            selecting the top ``limit``. It is raised to ``limit`` when
            smaller, because Atlas does not accept ``numCandidates`` below
            ``limit``.

    Returns:
        None. Results are printed only.
    """
    print(f"\nüîé Searching for: '{query}'")

    # 1. Convert the user's query into a vector (plain list of floats).
    query_vector = model.encode(query).tolist()

    # A fixed candidate count would make any call with limit > num_candidates
    # fail on the server, so never send fewer candidates than requested rows.
    effective_candidates = max(num_candidates, limit)

    # 2. The Vector Search Pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",  # Must match the name you gave in Atlas UI
                "path": "embedding",      # The field containing the vectors
                "queryVector": query_vector,
                "numCandidates": effective_candidates,
                "limit": limit            # Number of results to return
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                # NOTE(review): the DataFrame shown in this notebook has
                # `Sentiment` / `Source` columns but no `category`; confirm
                # which field is meant to be projected here.
                "category": 1,
                "score": {"$meta": "vectorSearchScore"}  # Similarity score of the match
            }
        }
    ]

    # 3. Execute
    results = collection.aggregate(pipeline)

    # 4. Print Results
    for result in results:
        print(f"   [Score: {result['score']:.4f}] {result['text']}")

# Demo queries
# Matching is done on embedding similarity, so a query does not need to share
# exact wording with a stored review to retrieve it.
for demo_query in ("Why are customers angry?", "Negative feedback"):
    semantic_search(demo_query)

‚úÖ Connected to MongoDB Atlas
‚è≥ Loading AI Model (this happens once)...
üöÄ Generating Embeddings and Indexing Data...
‚ÑπÔ∏è Data already exists, skipping insertion.

üîé Searching for: 'Why are customers angry?'
   [Score: 0.7763] """I'm so disappointed with their customer support.""
   [Score: 0.7763] """I'm so disappointed with their customer support.""

üîé Searching for: 'Negative feedback'
   [Score: 0.6295] """This song always puts me in a good mood. It's my go-to feel-good track!""
   [Score: 0.6210] """I'm extremely disappointed with their product quality.""
