# Semantic (Similarity) Search

In [38]:
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import os
from pinecone import Pinecone, ServerlessSpec
import pinecone
from sentence_transformers import SentenceTransformer

load_dotenv(find_dotenv(), override=True)

True

### Process the data, create Vectors, upload them to Pinecone

In [39]:
files = pd.read_csv('course_descriptions.csv', encoding='cp1252')
files.head()


Unnamed: 0,course_name,course_slug,course_technology,course_description,course_topic,course_description_short
0,Introduction to Tableau,tableau,tableau,Tableau is now one of the most popular busines...,data visualization,Teaching you how to tell compelling stories wi...
1,The Complete Data Visualization Course with Py...,data-visualization,python,The Data Visualization course is designed for ...,data visualization,Teaching you how to master the art of creating...
2,Introduction to R Programming,introduction-to-r-programming,r,R is one of the best programming languages spe...,programming,"Providing you with the skills to manipulate, a..."
3,Data Preprocessing with NumPy,data-preprocessing-numpy,python,This course is designed to show you how to wor...,data processing,This course will guide you through one of Pyth...
4,Introduction to Data and Data Science,intro-to-data-and-data-science,theory,Working with data is an essential part of main...,machine learning,Introducing you to the field of data science a...


In [40]:
def create_course_description(row):
    return f"""The course name is {row['course_name']}, the slug is {row['course_slug']}, 
    the technology is {row['course_technology']}, and the course topics is {row['course_topic']}.
    """



In [41]:
pd.set_option('display.max_rows', 106)
files['course_description_new'] = files.apply(create_course_description, axis=1)
print(files['course_description_new'])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql, \n   ...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [42]:
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'), environment=os.getenv('PINECONE_ENV'))

In [43]:
index_name = 'udemy-course-semantic-search'
dimension = 768
metric = 'cosine'

In [65]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"Deleted index: {index_name}")
else:
    print(f"Index {index_name} does not exist")



Deleted index: udemy-course-semantic-search


In [66]:
pc.create_index(
    name=index_name, 
    dimension=dimension, 
    metric=metric,
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

{
    "name": "udemy-course-semantic-search",
    "metric": "cosine",
    "host": "udemy-course-semantic-search-4z04feo.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
         

In [46]:
index = pc.Index(index_name)

In [28]:
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1731.44it/s, Materializing param=pooler.dense.weight]                        
[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/multi-qa-mpnet-base-dot-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [47]:
files.head()

Unnamed: 0,course_name,course_slug,course_technology,course_description,course_topic,course_description_short,course_description_new
0,Introduction to Tableau,tableau,tableau,Tableau is now one of the most popular busines...,data visualization,Teaching you how to tell compelling stories wi...,"The course name is Introduction to Tableau, th..."
1,The Complete Data Visualization Course with Py...,data-visualization,python,The Data Visualization course is designed for ...,data visualization,Teaching you how to master the art of creating...,The course name is The Complete Data Visualiza...
2,Introduction to R Programming,introduction-to-r-programming,r,R is one of the best programming languages spe...,programming,"Providing you with the skills to manipulate, a...",The course name is Introduction to R Programmi...
3,Data Preprocessing with NumPy,data-preprocessing-numpy,python,This course is designed to show you how to wor...,data processing,This course will guide you through one of Pyth...,The course name is Data Preprocessing with Num...
4,Introduction to Data and Data Science,intro-to-data-and-data-science,theory,Working with data is an essential part of main...,machine learning,Introducing you to the field of data science a...,The course name is Introduction to Data and Da...


In [48]:
def create_embeddings(row):
    combined_text = ' '.join(str(row[field]) for field in ['course_description', 'course_description_new', 'course_description_short'])
    embeddings = model.encode(combined_text, show_progress_bar=False)
    return embeddings

In [49]:
files['embeddings'] = files.apply(create_embeddings, axis=1)

In [50]:
vectors_to_upsert = [(str(row['course_name']), row['embeddings'].tolist()) for _, row in files.iterrows()]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted successfully")

Data upserted successfully


### Implementing Semantic Search 

In [54]:
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [55]:
query_results = index.query(
    vector=[query_embedding], # Formatted as a list as Pince ususally expects multiple vectors
    top_k=12,
    include_values=True
)
print(query_results)


QueryResponse(matches=[{'id': 'Machine Learning with K-Nearest Neighbors',
 'score': 0.393516034,
 'values': [0.300785333,
            -0.476736039,
            -0.136514843,
            -0.132084578,
            -0.22894679,
            -0.0632804856,
            -0.0271141417,
            -0.0131547228,
            0.389746338,
            0.36633575,
            0.100689754,
            0.572812259,
            0.0266329125,
            -0.0461975448,
            0.286393464,
            -0.535117269,
            0.286347717,
            0.206176892,
            0.264666975,
            -0.112505108,
            -0.287684262,
            -0.12583445,
            -0.176172093,
            0.0590221211,
            -0.0370017588,
            -0.183444545,
            -0.242686868,
            -0.153245836,
            -0.169617191,
            -0.367568523,
            0.23119016,
            -0.0813634247,
            0.351073921,
            0.29136306,
            -0.000114422779,


In [57]:
for match in query_results['matches']:
    print(f"Course Name: {match['id']} ,Score: {match['score']}")

Course Name: Machine Learning with K-Nearest Neighbors ,Score: 0.393516034
Course Name: Machine Learning in Excel ,Score: 0.379886866
Course Name: Growth Analysis with SQL, Python, and Tableau   ,Score: 0.329270869
Course Name: Mastering Key Performance Indicators (KPIs) ,Score: 0.326093465
Course Name: Machine Learning in Python ,Score: 0.31569311
Course Name: Machine Learning with Decision Trees and Random Forests ,Score: 0.311347157
Course Name: The Complete Data Visualization Course with Python, R, Tableau, and Excel ,Score: 0.301075161
Course Name: Excel for Project Management ,Score: 0.30100733
Course Name: Machine Learning with Support Vector Machines ,Score: 0.300667554
Course Name: Building Business Reports Using Power BI ,Score: 0.30042851
Course Name: Portfolio Management ,Score: 0.296011686
Course Name: Introduction to Tableau ,Score: 0.286260128


### Improving search results : Improving data 

In [58]:
files = pd.read_csv('course_section_descriptions.csv', encoding='cp1252')
files.head()    

Unnamed: 0,course_id,course_name,course_slug,course_description,course_description_short,course_technology,course_topic,course_instructor_quote,section_id,section_name,section_description
0,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,9,Introduction to Tableau,While Tableau is an indispensable tool in the ...
1,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,10,Tableau Functionalities,"In this section, you will create your first Ta..."
2,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,11,The Tableau Exercise,This section is a practical example that will ...
3,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,12,Introduction,"In this section, you will learn about the impo..."
4,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,13,Setting Up the Environments,"Here, we set up different environments for the..."


In [59]:
files['unique_id'] = files['course_id'].astype(str) + '_' + files['section_id'].astype(str)

In [60]:
files['metadata'] = files.apply(lambda row: {
    'course_name': row['course_name'],
    'section_name': row['section_name'],
    'section_description': row['section_description'],
}, axis=1)

In [61]:
def create_course_section_description(row):
    combined_text = f"""The course name is {row['course_name']}, 
    the course technology is {row['course_technology']}, 
    the course description is {row['course_description']}, 
    the section name is {row['section_name']}, 
    and the section description is {row['section_description']}."""
    return model.encode(combined_text, show_progress_bar=False)


In [62]:
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 2376.15it/s, Materializing param=pooler.dense.weight]                        
[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/multi-qa-mpnet-base-dot-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [63]:
files['embeddings'] = files.apply(create_course_section_description, axis=1)

In [67]:
vectors_to_upsert = [(row['unique_id'], row['embeddings'].tolist(), row['metadata']) for index, row in files.iterrows()]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted successfully")

Data upserted successfully


In [72]:
query_results = index.query(
    vector=[query_embedding],
    top_k=12,
    include_values=True,
    include_metadata=True
)
print(query_results)

QueryResponse(matches=[{'id': '37_373',
 'metadata': {'course_name': 'Machine Learning in Python',
              'section_description': 'In this section, you will learn how to '
                                     'do Cluster analysis. Cluster analysis '
                                     'consists in dividing your data into '
                                     'separate groups based on an algorithm. '
                                     'Clustering is an amazing technique often '
                                     'employed in data science. But what’s '
                                     'more, it makes much more sense to study '
                                     'patterns observed in a particular group '
                                     'rather than trying to find patterns in '
                                     'the entire dataset. We will provide '
                                     'several practical examples that will '
                                     'h

In [69]:
score_threshold = 0.3

In [74]:
for match in query_results['matches']:
    if match['score'] > score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/a')
        section_name = course_details.get('section_name', 'N/a')
        section_description = course_details.get('section_description', 'N/a')
    
    print( f"Matched item ID: {match['id']}, Score: {match['score']}")
    print ( f"""Course Name: {course_name} 
    \nSection Name: {section_name} 
    \nSection Description: {section_description}""" + '\n' + '='*100)

Matched item ID: 37_373, Score: 0.474866033
Course Name: Machine Learning in Python 
    
Section Name: K-Means Clustering 
    
Section Description: In this section, you will learn how to do Cluster analysis. Cluster analysis consists in dividing your data into separate groups based on an algorithm. Clustering is an amazing technique often employed in data science. But what’s more, it makes much more sense to study patterns observed in a particular group rather than trying to find patterns in the entire dataset. We will provide several practical examples that will help you understand how to carry out cluster analysis and the difference between classification and clustering.
Matched item ID: 101_714, Score: 0.470567584
Course Name: The Machine Learning Algorithms A-Z 
    
Section Name: Hierarchical Clustering 
    
Section Description: Hierarchical clustering is similar to how you organize files into folders on your computer. Whenever we organize our files into their folders, we perfo