In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from lib.preprocess import *
from openai import OpenAI
from dotenv import load_dotenv
from transformers import BertModel, BertTokenizer
import torch
import pinecone
import os
import numpy as np

# Load Data

In [23]:
df = load_data('data/all_minus_med.csv')
df.head()

Unnamed: 0,Course ID,Class Nbr,Subject,Catalog,Descr,PI Name,Course Long Descr,Section,Component,Mode,...,Mtg Start,Mtg End,Pat,Start Date,End Date,Descr 1,Attribute Formal Desc,Term Descr,Career,Location
0,19,8275,AMES,165S,THE WORLD OF JAPANESE POP CULT,"Maude,Daryl J",An examination of modern Japanese culture thro...,1,SEM,In Person,...,11:45:00.000000AM,1:00:00.000000PM,TTH,8/28/23,12/8/23,Languages 207,"(ALP) Arts, Literature & Performance",2023 Fall Term,UGRD,DURHAM
1,19,8275,AMES,165S,THE WORLD OF JAPANESE POP CULT,"Maude,Daryl J",An examination of modern Japanese culture thro...,1,SEM,In Person,...,11:45:00.000000AM,1:00:00.000000PM,TTH,8/28/23,12/8/23,Languages 207,(CCI) Cross Cultural Inquiry,2023 Fall Term,UGRD,DURHAM
2,19,8275,AMES,165S,THE WORLD OF JAPANESE POP CULT,"Maude,Daryl J",An examination of modern Japanese culture thro...,1,SEM,In Person,...,11:45:00.000000AM,1:00:00.000000PM,TTH,8/28/23,12/8/23,Languages 207,(CZ) Civilizations,2023 Fall Term,UGRD,DURHAM
3,19,8275,AMES,165S,THE WORLD OF JAPANESE POP CULT,"Maude,Daryl J",An examination of modern Japanese culture thro...,1,SEM,In Person,...,11:45:00.000000AM,1:00:00.000000PM,TTH,8/28/23,12/8/23,Languages 207,Crosslisted in another department,2023 Fall Term,UGRD,DURHAM
4,19,8275,AMES,165S,THE WORLD OF JAPANESE POP CULT,"Maude,Daryl J",An examination of modern Japanese culture thro...,1,SEM,In Person,...,11:45:00.000000AM,1:00:00.000000PM,TTH,8/28/23,12/8/23,Languages 207,Seminar,2023 Fall Term,UGRD,DURHAM


# Transform

## Clean data

In [24]:
# Remove all rows with duplicate 'Course ID' values
df = df.drop_duplicates(subset=['Course ID'], keep='first')

# Combine Descr and Course Long Descr into the same column
df['Description'] = df['Descr'] + ' ' + df['Course Long Descr']

# Drop all columns except 'Course ID' and 'Description'
columns = df.columns.tolist()
columns.remove('Course ID')
columns.remove('Description')
df = df.drop(columns, axis=1)

# Drop all rows with empty 'Description' values
df = df.dropna(subset=['Description'])

print(df.shape)
df.head()

(3751, 2)


Unnamed: 0,Course ID,Description
0,19,THE WORLD OF JAPANESE POP CULT An examination ...
12,22,SP TOP INT'L LIT & CULTURE Topics in internati...
16,40,"MASTERS OF CHINESE CINEMA Films, documentaries..."
31,50,SENIOR HONORS THESIS SEMINAR Required for AMES...
35,64,FIRST-YEAR SEMINAR (TOP) Topics vary each seme...


### Embed data
We will use OpenAI's embeddings API to embed our text.

In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertModel.from_pretrained("bert-base-uncased")

def tokenize_course_desc(df, tokenizer, model):
    descriptions = df['Description'].tolist()
    tokenized_desc = tokenizer(descriptions, padding=True, truncation=True, return_tensors="pt")

    return tokenized_desc

tokenized_desc = tokenize_course_desc(df, tokenizer, model)

In [27]:
# Disable gradient calculations for efficiency
with torch.no_grad():
    # Get model outputs
    outputs = model(**tokenized_desc)

# The 'outputs' is a tuple where the first item contains the last hidden states
last_hidden_states = outputs.last_hidden_state

In [28]:
print(last_hidden_states.shape)

torch.Size([3751, 318, 768])


# Load

In [32]:
# Compute the mean of embeddings across the token dimension, excluding padding (token ID 0)
mean_embeddings = torch.mean(last_hidden_states, dim=1)

### Connect to the Pinecone Index

In [33]:
# Load into pinecone vector database
load_dotenv()
pinecone_api_key = os.getenv('PINECONE_API_KEY')
environment = 'gcp-starter'

pinecone.init(api_key=pinecone_api_key, environment=environment)

# Check if the index exists, and if not, create one
index_name = "duke-course-desc"
if index_name not in pinecone.list_indexes():
    print("Index not found in Pinecone. Creating the index:", index_name)
    pinecone.create_index(index_name, dimension=768, metric="cosine")

# Connect to the index
index = pinecone.Index(index_name)

Index not found in Pinecone. Creating the index: duke-course-desc


In [35]:
# Convert mean embeddings to list of lists (if not already in that format)
vectors = mean_embeddings.tolist()

# Create unique IDs for each embedding
ids = [str(id) for id in df['Course ID'].tolist()]

# Upload to Pinecone in batches
batch_size = 100
for i in range(0, len(vectors), batch_size):
    batch_vectors = vectors[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]

    # Pair each ID with its vector and convert to list
    data_to_upsert = list(zip(batch_ids, batch_vectors))

    # Upsert data to Pinecone
    index.upsert(vectors=data_to_upsert)

## Test

### Fetching a specific vector

In [38]:
# Fetch a small sample of IDs
sample_ids = ids[:10]

# Fetch vectors by IDs
fetched_vectors = index.fetch(ids=sample_ids)

# Check the fetched data
print(fetched_vectors)

{'namespace': '',
 'vectors': {'175': {'id': '175',
                     'values': [-0.156429932,
                                -0.192072436,
                                0.281575084,
                                0.0997870862,
                                0.38972497,
                                -0.274953336,
                                0.275553614,
                                0.301157355,
                                0.0374579765,
                                -0.112019189,
                                -0.268330246,
                                0.044727508,
                                0.433153689,
                                0.0685805529,
                                0.0566573665,
                                -0.0278876554,
                                0.215468064,
                                -0.134800971,
                                -0.277184695,
                                0.0213697962,
                                0.0

### Perform a test query

In [42]:
# Tokenize the query
query = "classes that teach about machine learning"
inputs = tokenizer(query, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Use mean pooling to get a single vector (assuming using mean pooling for your data)
query_embedding = outputs.last_hidden_state.mean(dim=1).numpy()

# Query the index
query_results = index.query(vector=[query_embedding.tolist()[0]], top_k=5)

print(query_results)

{'matches': [{'id': '26780', 'score': 0.708513, 'values': []},
             {'id': '27041', 'score': 0.701689899, 'values': []},
             {'id': '24542', 'score': 0.700408936, 'values': []},
             {'id': '27902', 'score': 0.700024068, 'values': []},
             {'id': '27456', 'score': 0.698934078, 'values': []}],
 'namespace': ''}


In [61]:
# Fetch the course IDs from the query results
course_ids = [result['id'] for result in query_results['matches']]
course_ids = np.array(course_ids)
course_ids = course_ids.astype(np.int64)

# find course_ids description in df
df[df['Course ID'].isin(course_ids)]

Unnamed: 0,Course ID,Description
31649,24542,DECISION ANALYTICS & MODELING Successful manag...
36562,26780,INTRO TO DEEP LEARNING Provides an introductio...
36915,27041,DEEP LEARNING APPS This course focuses on the ...
38291,27456,"DATA SCIENCE COMPETITION In this course, stude..."
39330,27902,HUMAN-CENTERED COMPUTING This course addresses...
