## Setting up Open AI Connection

In [1]:
!pip install -U openai pinecone-client datasets pandas numpy tiktoken sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-1.24.2-cp39-cp39-many

In [19]:
## Constants

OPENAI_API_KEY = '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'

In [20]:
import openai
openai.api_key = OPENAI_API_KEY
# get API key from top-right dropdown on OpenAI website

# openai.Engine.list()  # check we have authenticated

In [21]:
MODEL = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=MODEL
)

In [22]:
# extract embeddings to a list
embeds = [record['embedding'] for record in res['data']]

## Converting CSV data into embeddings

In [6]:
import pandas as pd

csv_url = 'https://docs.google.com/spreadsheets/d/10xvFQVZdRxfxCS1of71mN0akur-9e7lsM8yEogZCrMs/pub?gid=0&single=true&output=csv'

# Load the CSV data
# data = pd.read_csv(csv_url)
data = pd.read_csv('/content/course-ner-data.csv')
data = data.fillna('')
data.head(5)

# Combine important text info into 1 column

data['text'] = data['name'] + data['tags']
# Set the maximum allowed length for a given column
max_length = 16384
column_name = 'text'

# Truncate the values in the specified column
data[column_name] = data[column_name].apply(lambda x: x[:max_length] if len(x) > max_length else x)

In [7]:
# def get_embeddings(texts, model="text-davinci-002"):
#     embeddings = []
#     batch_size = 128

#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i + batch_size]
#         prompt = "\n".join(batch_texts)
#         completions = openai.Completion.create(
#             engine=model,
#             prompt=prompt,
#             max_tokens=22,
#             n=len(batch_texts),
#             stop=None,
#             temperature=0,
#             logprobs=0,
#             echo=True
#         )
#         # batch_embeddings = [completion['choices'][0]['embeddings']['mean'] for completion in completions['choices']]
#         # embeddings.extend(batch_embeddings)
#         print(completions)

#     return embeddings

# # Generate embeddings for the courses
# embeddings = get_embeddings(data['text'])

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

data['ada_embedding'] = data['text'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

In [8]:
data['ada_embedding'].head()

0    [0.01963328756392002, 0.010710296221077442, 0....
1    [0.009129764512181282, 0.0003475684206932783, ...
2    [-0.009399139322340488, 0.007830390706658363, ...
3    [-0.0007247348548844457, 0.00911095179617405, ...
4    [-0.028600748628377914, -0.019169151782989502,...
Name: ada_embedding, dtype: object

In [9]:
data.to_csv('embedded_data.csv', index=False)

## Calculate similarity between courses

In [30]:
import tiktoken

def count_tokens(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

count_tokens("tiktoken is great!", "cl100k_base")

6

In [10]:
embeddings = data['ada_embedding']

In [11]:
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [12]:
def find_similar_courses(course_id, embeddings, threshold=0.5):
    target_embedding = embeddings[course_id]
    similarities = []

    for idx, embedding in enumerate(embeddings):
        if idx == course_id:
            continue
        
        similarity = cosine_similarity(target_embedding, embedding)
        
        if similarity > threshold:
            similarities.append((idx, similarity))

    # Sort by similarity score
    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities

In [18]:
course_id = 3  # Replace with the desired course ID
similar_courses = find_similar_courses(course_id, embeddings)

# print(f"Similar courses for course ID {course_id}:")
# for idx, similarity in similar_courses:
#     print(f"Course ID: {idx}, Similarity: {similarity:.3f}")

Similar courses for course ID 3:


In [17]:
top_matches = [t[1] for t in similar_courses[:5]]
top_matches_idxs = [t[0] for t in similar_courses[:5]]
selected_rows_df = data.iloc[top_matches_idxs]
selected_rows_df

Unnamed: 0,id,courseContent,prerequisitesOfCurrentCourse,referenceBooks,textBooks,extractedPrereqCourseCodes,slots,courseCode,courseType,credits,deptCode,description,name,tags,text,ada_embedding
985,6265b98c2dec4c36901ee8a9,Principles of power flow control and voltage c...,,1. \tUnderstanding FACTS – Concepts and Techno...,1.\tFACTS Controllers in Power Transmission an...,,,EE5261,Theory,9,EE,1.\tTo expose the students to an emerging tech...,Flexible AC Transmission Systems-EE5261,Flexible AC Transmission Systems-EE5261; Princ...,Flexible AC Transmission Systems-EE5261Flexibl...,"[-0.010727227665483952, -0.008826367557048798,..."
265,6265b9922dec4c36901ee939,"Introduction to AC and DC microgrid systems, d...",B. Tech./DD with power electronics (EE3203),"1 Sudipta Chakraborty, Marcelo G. Simões, and ...","S.P. Chowdhury, P. Crossley, S. Chowdhury \Mic...",EE3203,,EE5262,Theory,9,EE,This course is a graduate level course for Ele...,Distributed Generation and Microgrid Systems-E...,Distributed Generation; Microgrid Systems; EE5...,Distributed Generation and Microgrid Systems-E...,"[-0.00839999970048666, -0.011146278120577335, ..."
1967,6265b9952dec4c36901ee993,") Power Converters, • Introduction, DC-DC Conv...",,"1) Issa Batarseh, Power Electronic Circuits, J...","1) R. W. Erickson and D. Maksimovic, “Fundamen...",,G,EE5203,Theory,9,EE,To familiarise students with the techniques of...,Switched Mode Power Conversion-EE5203,Power Converters; Introduction; DC-DC Converte...,Switched Mode Power Conversion-EE5203Power Con...,"[-0.0075479368679225445, -0.003704992355778813..."
1758,6265b98d2dec4c36901ee8bf,"Principles of Drives, Drive train methods – Ge...",POWER ELECTRONICS for BTech/DD students,"3.\tR. Krishnan, “Electric Motor Drives: Model...","1.\tW. Leonhard, Control of Electrical Drives,...",,,EE6200,Theory,9,EE,This course is a graduate level course for Ele...,Power Electronic Control of Electric Machines-...,Principles of Drives; Drive train methods – Ge...,Power Electronic Control of Electric Machines-...,"[-0.020906182006001472, -0.011193601414561272,..."
23,6265b9922dec4c36901ee943,Introduction to Power Systems: Historical deve...,EE2005,1.\tElectrical energy systems theory an introd...,1.\tPower system Analysis: John J. Grainger an...,EE2005,,EE3003,Theory,10,EE,This course is a first course on power systems...,Power Systems-EE3003,Power Systems; EE3003; Historical developments...,Power Systems-EE3003Power Systems; EE3003; His...,"[-0.0017959055257961154, -0.013223945163190365..."


In [36]:
selected_rows_df = data.iloc[[3,985, 397, 1254]]
selected_rows_df

Unnamed: 0,id,courseContent,prerequisitesOfCurrentCourse,referenceBooks,textBooks,extractedPrereqCourseCodes,slots,courseCode,courseType,credits,deptCode,description,name,tags,text,ada_embedding
3,6265b9972dec4c36901ee9cb,"Historical Developments, Applications of DC Tr...",,"1. C. Adamson and N.G. Hingorani, High Voltage...","1. E.W. Kimbark, Direct Current Transmission, ...",,B,EE6258,Theory,9,EE,1. To present a power transmission technology\...,DC Power Transmission Systems-EE6258,Historical Developments; Applications of DC Tr...,DC Power Transmission Systems-EE6258,"[-0.02118978649377823, -0.0140390545129776, -0..."
985,6265b98c2dec4c36901ee8a9,Principles of power flow control and voltage c...,,1. \tUnderstanding FACTS – Concepts and Techno...,1.\tFACTS Controllers in Power Transmission an...,,,EE5261,Theory,9,EE,1.\tTo expose the students to an emerging tech...,Flexible AC Transmission Systems-EE5261,Flexible AC Transmission Systems-EE5261; Princ...,Flexible AC Transmission Systems-EE5261,"[-0.005114557221531868, -0.004863351117819548,..."
397,6265b99a2dec4c36901eea17,Introduction to Protective Relays: Basics and...,EE3003 Electrical Power Systems,"1.\tLeslie Hewirson, Marl Brown, Ramesn Balakr...",1.\tPaul M. Anderson “Power System Protection”...,EE3003,K,EE6255,Theory,9,EE,1. To understand the fundamentals and basics o...,Power System Protection-EE6255,Power System Protection-EE6255;Protective Rela...,Power System Protection-EE6255,"[-0.011694437824189663, -0.01351387333124876, ..."
1254,6265b9952dec4c36901ee989,Introduction to power systems stability proble...,COT,"1. K. R. Padiyar, \Power systems dynamics: sta...","1. Peter W. Sauer and M. A. Pai, “Power system...",,,EE6253,Theory,9,EE,This course is an advanced level graduate cour...,Power System Control and Stability-EE6253,Power System Control and Stability-EE6253; Int...,Power System Control and Stability-EE6253,"[0.0002539408451411873, -0.016346782445907593,..."


In [29]:
import pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key="c2f3c69c-4276-4df4-9a91-95409d65b127",
    environment="northamerica-northeast1-gcp"  # find next to API key in console
)

# check if 'openai' index already exists (only create index if not)
if 'openai' not in pinecone.list_indexes():
    pinecone.create_index('openai', dimension=len(embeds[0]))
# connect to index
index = pinecone.Index('openai')

  from tqdm.autonotebook import tqdm


In [10]:
from datasets import load_dataset

# load the first 1K rows of the TREC dataset
trec = load_dataset('trec', split='train[:1000]')

Downloading builder script:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading and preparing dataset trec/default to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset trec downloaded and prepared to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2. Subsequent calls will reuse this data.


In [None]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(data['text']), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(data['text']))
    # get batch of lines and IDs
    lines_batch = data['text'][i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    res = openai.Embedding.create(input=lines_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))