## Courses and Sections Semantic Search BERT

In [1]:
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv(find_dotenv(), override=True)

True

In [4]:
files = pd.read_csv("course_section_descriptions.csv", encoding="cp1252")

In [5]:
files['unique_id'] = files['course_id'].astype(str) + '-' + files['section_id'].astype(str)

In [6]:
files['metadata'] = files.apply(
    lambda row: {
        'course_name': row['course_name'],
        'section_name': row['section_name'],
        'section_description': row['section_description'],
    }, axis=1
)

In [7]:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('multi-qa-distilbert-cos-v1')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [8]:
def create_embeddings(row):
    combined_text = f'''
            {row['course_name']} {row['course_technology']},
            {row['course_description']} {row['section_name']}{row['section_description']} ,
            '''
    return model.encode(combined_text, show_progress_bar=False)

In [9]:
files['embedding'] = files.apply(create_embeddings, axis=1)

In [10]:
files.head()

Unnamed: 0,course_id,course_name,course_slug,course_description,course_description_short,course_technology,course_topic,course_instructor_quote,section_id,section_name,section_description,unique_id,metadata,embedding
0,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,9,Introduction to Tableau,While Tableau is an indispensable tool in the ...,2-9,"{'course_name': 'Introduction to Tableau', 'se...","[0.012094061, 0.048676707, 0.025640793, 0.0021..."
1,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,10,Tableau Functionalities,"In this section, you will create your first Ta...",2-10,"{'course_name': 'Introduction to Tableau', 'se...","[0.034901984, 0.040251367, 0.019586952, 0.0168..."
2,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,11,The Tableau Exercise,This section is a practical example that will ...,2-11,"{'course_name': 'Introduction to Tableau', 'se...","[0.03602686, 0.024582712, 0.017501818, 0.02485..."
3,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,12,Introduction,"In this section, you will learn about the impo...",3-12,{'course_name': 'The Complete Data Visualizati...,"[0.030950926, 0.03750276, 0.025479242, 0.07094..."
4,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,13,Setting Up the Environments,"Here, we set up different environments for the...",3-13,{'course_name': 'The Complete Data Visualizati...,"[0.013764884, 0.048881054, 0.021354705, 0.0487..."


In [11]:
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"), environment= os.environ.get("PINECONE_ENV"))

In [21]:
index_name = "benji-bert-index"
dimension = 768
metric = "cosine"

In [19]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} successfully delete.")

else:
    print(f"{index_name} not in index list.")


benji_bert_index not in index list.


In [22]:
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric=metric,
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

{
    "name": "benji-bert-index",
    "metric": "cosine",
    "host": "benji-bert-index-i2oc4nb.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [23]:
index = pc.Index(index_name)

In [24]:
vectors_to_upsert = [(row['unique_id'], row['embedding'].tolist(), row['metadata']) for index, row in files.iterrows()]

In [25]:
index.upsert(vectors=vectors_to_upsert)

print('Data upserted to Pinecone Index ', index_name)

Data upserted to Pinecone Index  benji-bert-index


## Semantic Search

In [26]:
query = 'regression'

query_embedding= model.encode(query, show_progress_bar=False).tolist()

In [27]:
query_results = index.query(
    vector=[query_embedding], top_k=6, include_metadata=True
)

In [28]:
query_results

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [29]:
score_threshold= 0.3

In [30]:
for match in query_results['matches']:

    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')

        print(f"Matched item ID: {match['id']}, score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")