In [2]:
# Load the dotenv IPython extension, then invoke its %dotenv line magic.
# NOTE(review): presumably this reads variables (e.g. the Pinecone API key) from a local .env file — confirm.
%load_ext dotenv
%dotenv

In [31]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
import pinecone
from sentence_transformers import SentenceTransformer

In [7]:
# Read the course catalogue CSV (decoded as latin1) into the `files` DataFrame.
files = pd.read_csv("./course_descriptions.csv", encoding = "latin1")

In [9]:
def create_course_description(row):
    """Build a short textual summary of one course record.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``course_name``, ``course_slug``,
        ``course_technology`` and ``course_topic`` (for example a DataFrame
        row or a plain dict).

    Returns
    -------
    str
        Two lines of text separated by a newline; the second line begins
        with four spaces.
    """
    opening = "The course name is {}, the slug is {},".format(
        row["course_name"], row["course_slug"]
    )
    closing = "    the technology is {} and the course topic is {}".format(
        row["course_technology"], row["course_topic"]
    )
    return "\n".join((opening, closing))

In [11]:
# Generate the summary sentence for every row, store it as a new column on
# `files`, and preview that column as plain text.
files["course_description_new"] = files.apply(
    create_course_description,
    axis="columns",
)
print(files["course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
                             ...                        
101    The course name is Intro to NLP for AI, the sl...
102    The course name is Data Analysis with ChatGPT,...
103    The course name is ChatGPT for Data Science, t...
104    The course name is Intro to LLMs, the slug is ...
105    The course name is Growth Analysis with SQL, P...
Name: course_description_new, Length: 106, dtype: object


In [15]:
# Create the Pinecone client without passing any explicit arguments.
# NOTE(review): presumably the API key comes from the environment populated by the %dotenv cell — confirm.
pc = Pinecone()

In [21]:
# Settings for the Pinecone index that is deleted and re-created in the following cells.
index_name = "intro-index"
# NOTE(review): must equal the embedding length of the SentenceTransformer model loaded later — confirm 384 for all-MiniLM-L6-v2.
dimension = 384
# Similarity metric handed to pc.create_index.
metric = "cosine"

In [23]:
# Start from a clean slate: drop any index already registered under
# `index_name`, and report which branch was taken.
existing_names = [existing.name for existing in pc.list_indexes()]
if index_name not in existing_names:
    print(f"{index_name} not in index list.")
else:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")

intro-index succesfully deleted.


In [27]:
# Create a serverless index (AWS, us-east-1) from the settings defined earlier.
# The create call stays the last expression so its return value is displayed.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric=metric,
    spec=serverless_spec,
)

{
    "name": "intro-index",
    "metric": "cosine",
    "host": "intro-index-ke2mvmk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [29]:
# Obtain a handle to the index by name; it is used later for upsert and query.
index = pc.Index(index_name)

In [33]:
# Load the sentence-embedding model by name; create_embeddings and the query cell both call model.encode.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [35]:
def create_embeddings(row):
    """Embed the textual description fields of one course record.

    The ``course_description``, ``course_description_new`` and
    ``course_description_short`` fields are joined with single spaces and
    encoded with the notebook-level ``model``.

    Missing values (``None`` / ``NaN`` / ``pd.NA``) are skipped instead of
    being stringified, so the literal text "nan" never ends up in the
    embedded text.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the three field names above (e.g. a DataFrame
        row); each field is expected to hold a scalar.

    Returns
    -------
    The value returned by ``model.encode`` for the combined text.
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    # Keep only the fields that actually carry a value for this row.
    present_parts = [str(row[field]) for field in text_fields if not pd.isna(row[field])]
    combined_text = ' '.join(present_parts)
    return model.encode(combined_text, show_progress_bar = False)

In [37]:
# Compute one embedding per row (row-wise apply) and store it in a new "embedding" column.
files["embedding"] = files.apply(create_embeddings, axis = 1)

In [39]:
# Pair each course name (used as the vector ID) with its embedding converted
# to a plain list, send all pairs to the index in one upsert call, then log.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(files["course_name"], files["embedding"])
]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted to Pinecone index")

Data upserted to Pinecone index


In [41]:
# Free-text search term, encoded with the same model used for the course rows.
query = "clustering"
# .tolist() converts the encoder output into a plain Python list for the query call.
query_embedding = model.encode(query, show_progress_bar = False).tolist()

In [43]:
# Retrieve the 12 nearest neighbours of the query embedding.
# `query_embedding` is already a flat list (it was built with .tolist()), so it
# is passed as-is; wrapping it in another list would send a nested list
# instead of a single vector.
# include_values = True makes each match carry its stored vector as well.
query_results = index.query(
    vector = query_embedding,
    top_k = 12, 
    include_values = True
)

In [45]:
# Display the raw query response (includes the vector values requested in the query call).
query_results

{'matches': [{'id': 'Machine Learning in Excel',
              'score': 0.354455769,
              'values': [-0.0183002669,
                         -0.0279485248,
                         -0.0253203679,
                         -0.0126938932,
                         -0.0240365937,
                         -0.0219840705,
                         -0.0511236973,
                         -0.0535800196,
                         0.00997656118,
                         0.0282286555,
                         -0.0408324711,
                         -0.0362686664,
                         0.0683277175,
                         -0.0348471291,
                         -0.00728514278,
                         0.0366663039,
                         -0.00331018679,
                         -0.00411816547,
                         -4.75787783e-05,
                         -0.0627969131,
                         0.08469598,
                         0.0300105,
                         -0.0528304428,


In [49]:
# Report only the matches whose similarity score reaches the cut-off.
score_threshold = 0.3
qualifying_matches = [
    hit for hit in query_results["matches"] if hit["score"] >= score_threshold
]
for hit in qualifying_matches:
    print(f"Matched item ID: {hit['id']}, score: {hit['score']}")

Matched item ID: Machine Learning in Excel, score: 0.354455769
Matched item ID: Machine Learning with K-Nearest Neighbors, score: 0.319466621
