## COURSES VECTOR STORE

In [19]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer


In [4]:
# Read the course catalogue; the file is decoded as Windows-1252 (cp1252).
files = pd.read_csv(
    filepath_or_buffer="course_descriptions.csv",
    encoding="cp1252",
)

In [5]:
def create_course_description(row):
    """Build a one-sentence textual summary of a course.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            "course_name", "course_slug", "course_technology" and
            "course_topic".

    Returns:
        A string naming the course, its slug, its technology and its topic.
        The text contains a line break followed by four spaces after the
        slug clause and ends with a newline.
    """
    name = row["course_name"]
    slug = row["course_slug"]
    technology = row['course_technology']
    topic = row['course_topic']
    # The "\n    " and the trailing "\n" are part of the generated text.
    template = (
        "The course name is {}, the slug is {},\n"
        "    the technology is {} and the course topic is {}\n"
    )
    return template.format(name, slug, technology, topic)

In [7]:
# Generate the summary sentence for every course (one call per row) and
# store it on the frame, then print the resulting column.
course_description_new = files.apply(create_course_description, axis='columns')
files['course_description_new'] = course_description_new
print(files['course_description_new'])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
                             ...                        
101    The course name is Intro to NLP for AI, the sl...
102    The course name is Data Analysis with ChatGPT,...
103    The course name is ChatGPT for Data Science, t...
104    The course name is Intro to LLMs, the slug is ...
105    The course name is Growth Analysis with SQL, P...
Name: course_description_new, Length: 106, dtype: object


In [11]:
# Load the IPython `dotenv` extension and run its %dotenv line magic.
# NOTE(review): presumably this reads the local .env file into the process
# environment — confirm. The recorded output shows the extension was already
# loaded in this kernel when the cell ran.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [12]:
# Locate the .env file first, then load it; override=True lets values from
# the file replace variables that are already set in the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [14]:
# Pinecone client; credentials come from the environment (never hard-coded).
# Either value is None when its variable is not set.
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

In [15]:
# Settings for the Pinecone index used throughout this notebook.
index_name = "ben-start-index"
dimension = 384  # vector length of the index; presumably the embedding model's output size — TODO confirm
metric = "cosine"  # similarity metric handed to pc.create_index

In [16]:
# Start from a clean slate: drop the index if one with this name already
# exists, otherwise just report that there was nothing to delete.
existing_index_names = {idx.name for idx in pc.list_indexes()}

if index_name not in existing_index_names:
    print(f"{index_name} not in index list.")
else:
    pc.delete_index(index_name)
    print(f"{index_name} successfully delete.")


ben-start-index successfully delete.


In [17]:
# Serverless deployment target for the new index.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')

# Create the index with the configured name, dimension and metric. The call
# is the cell's last expression, so its return value is displayed.
pc.create_index(name=index_name, dimension=dimension, metric=metric, spec=serverless_spec)

{
    "name": "ben-start-index",
    "metric": "cosine",
    "host": "ben-start-index-i2oc4nb.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [18]:
# Handle to the index named by `index_name`; used below for upsert and query.
index = pc.Index(index_name)

## Embedding the data


In [20]:
# Sentence-embedding model shared by create_embeddings and the query cell.
# NOTE(review): 'all-MiniLM-L6-v2' is expected to emit 384-dimensional
# vectors, matching the index `dimension` — confirm against the model card.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
def create_embeddings(row):
    """Embed the textual description fields of one course.

    The values of 'course_description', 'course_description_new' and
    'course_description_short' are converted with str(), joined with single
    spaces, and encoded with the module-level `model`.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) containing the three
            fields above.

    Returns:
        Whatever `model.encode` returns for the combined text.

    Note:
        Because str() is applied to each field, a missing value (NaN) is
        embedded as the literal text "nan".
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    pieces = []
    for field in text_fields:
        pieces.append(str(row[field]))
    return model.encode(' '.join(pieces), show_progress_bar=False)

In [23]:
# Encode every course (one model call per row) and keep the vectors on the frame.
files['embedding'] = files.apply(create_embeddings, axis='columns')

In [24]:
# Preview the first rows to confirm the new 'embedding' column is present.
# NOTE(review): the recorded output also lists 'Unnamed: 0' and
# 'course_description_new.1' columns, which suggests the CSV was written by
# an earlier run of this notebook — verify the input file.
files.head()

Unnamed: 0,course_name,course_slug,course_technology,course_description,course_topic,course_description_short,course_description_new,course_description_new.1,embedding
0,Introduction to Tableau,tableau,tableau,Tableau is now one of the most popular busines...,data visualization,Teaching you how to tell compelling stories wi...,"The course name is Introduction to Tableau, th...","The course name is Introduction to Tableau, th...","[0.03636874, -0.027420036, -0.081906565, -0.00..."
1,The Complete Data Visualization Course with Py...,data-visualization,python,The Data Visualization course is designed for ...,data visualization,Teaching you how to master the art of creating...,The course name is The Complete Data Visualiza...,The course name is The Complete Data Visualiza...,"[0.046510335, -0.020395765, -0.018906748, -0.0..."
2,Introduction to R Programming,introduction-to-r-programming,r,R is one of the best programming languages spe...,programming,"Providing you with the skills to manipulate, a...",The course name is Introduction to R Programmi...,The course name is Introduction to R Programmi...,"[-0.043807026, 0.0059339027, -0.088446684, 0.0..."
3,Data Preprocessing with NumPy,data-preprocessing-numpy,python,This course is designed to show you how to wor...,data processing,This course will guide you through one of Pyth...,The course name is Data Preprocessing with Num...,The course name is Data Preprocessing with Num...,"[-0.056979477, -0.008179314, -0.039837625, -0...."
4,Introduction to Data and Data Science,intro-to-data-and-data-science,theory,Working with data is an essential part of main...,machine learning,Introducing you to the field of data science a...,The course name is Introduction to Data and Da...,The course name is Introduction to Data and Da...,"[-0.021475827, 0.020705566, -0.0025892651, 0.0..."


In [25]:
# Build the (id, values) tuples Pinecone expects, one per course.
# The course name is the vector ID. It is stripped first because the recorded
# query output shows an ID with trailing spaces, and an ID with invisible
# whitespace is brittle for any later exact-ID lookup.
# NOTE(review): IDs must be unique; two courses with the same name would
# overwrite each other — confirm course_name is unique in the CSV.
vectors_to_upsert = [
    (str(course_name).strip(), embedding.tolist())
    for course_name, embedding in zip(files['course_name'], files['embedding'])
]

# Send all vectors to the index in a single request.
index.upsert(vectors=vectors_to_upsert)

print('Data upserted to Pinecone Index ', index_name)

Data upserted to Pinecone Index  ben-start-index


## Semantic Search

In [27]:
# Free-text search term to look up among the course embeddings.
query = 'clustering'

# Encode the query with the same model used for the courses, then convert
# the result to a plain Python list.
query_vector_array = model.encode(query, show_progress_bar=False)
query_embedding = query_vector_array.tolist()

In [28]:
# Ask Pinecone for the 12 stored vectors most similar to the query embedding.
# `vector` receives the embedding itself (a flat list of floats), which is the
# shape the Pinecone query API documents; wrapping it in another list relied
# on the nested form being tolerated.
# include_values=True returns each match's stored vector along with its id
# and score.
query_result = index.query(
    vector=query_embedding,
    top_k=12,
    include_values=True
)

In [29]:
# Display the raw response. Because the query used include_values=True, every
# match carries its full vector, so this output is long.
query_result

{'matches': [{'id': 'Machine Learning in Excel',
              'score': 0.354685158,
              'values': [-0.0183002409,
                         -0.0279485714,
                         -0.0253203325,
                         -0.0126938904,
                         -0.0240366142,
                         -0.0219840948,
                         -0.0511236973,
                         -0.0535800196,
                         0.00997652,
                         0.0282286443,
                         -0.040832486,
                         -0.0362686813,
                         0.0683277473,
                         -0.0348471776,
                         -0.00728513,
                         0.0366663,
                         -0.00331014162,
                         -0.00411821203,
                         -4.75362467e-05,
                         -0.0627968907,
                         0.0846960247,
                         0.0300105233,
                         -0.05283048,
       

In [30]:
# Compact summary of the search: one line per match with its ID and score.
for hit in query_result['matches']:
    hit_id, hit_score = hit['id'], hit['score']
    print(f"Matched item ID: {hit_id}, score: {hit_score}")

Matched item ID: Machine Learning in Excel, score: 0.354685158
Matched item ID: Machine Learning with K-Nearest Neighbors, score: 0.313649148
Matched item ID: Machine Learning in Python, score: 0.282944202
Matched item ID: Customer Churn Analysis with SQL and Tableau, score: 0.281879693
Matched item ID: Growth Analysis with SQL, Python, and Tableau  , score: 0.26017347
Matched item ID: Linear Algebra and Feature Selection, score: 0.258531868
Matched item ID: Customer Engagement Analysis with SQL and Tableau, score: 0.234880671
Matched item ID: Fashion Analytics with Tableau, score: 0.233549252
Matched item ID: Machine Learning with Naive Bayes, score: 0.22770682
Matched item ID: Machine Learning with Support Vector Machines, score: 0.225507736
Matched item ID: Data Preprocessing with NumPy, score: 0.219204426
Matched item ID: Data Analysis with Excel Pivot Tables, score: 0.21714893
