In [1]:
# KMEANS
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import json

# Load course embeddings from JSON
def load_embeddings(file_path):
    """Read the JSON document stored at ``file_path`` and return the parsed object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

# Load embeddings
embedding_file = "./data/with_sents/course_embed_pos_sent.json"
course_data = load_embeddings(embedding_file)

# Extract course codes and their embeddings.
# Row i of the embedding matrix belongs to course_codes[i]; every later step relies on that alignment.
course_codes = [course["course_code"] for course in course_data]
course_embeddings = np.array([course["embedding"] for course in course_data])

# Step 1: Standardize the course embeddings (zero mean / unit variance per dimension)
scaler = StandardScaler()
course_embeddings_scaled = scaler.fit_transform(course_embeddings)

# Step 2: Select one course as the initial centroid for each cluster
n_clusters = 5  # Number of clusters
initial_centroids = course_embeddings_scaled[:n_clusters]  # First N scaled embeddings seed the clusters

# Step 3: Perform K-means clustering with predefined initial centroids.
# A single run (n_init=1) is requested because the starting centroids are given explicitly.
kmeans = KMeans(n_clusters=n_clusters, init=initial_centroids, n_init=1, random_state=42)
kmeans.fit(course_embeddings_scaled)

# Step 4: Get the cluster labels and centroids
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Step 5: Pick a representative course for every cluster.
# The search is restricted to the cluster's own members: the globally nearest course to a
# centroid may be assigned to a different cluster, which would label the cluster with a course
# it does not contain and could give two clusters the same representative.
closest_courses = []
for cluster_id, centroid in enumerate(centroids):
    # Euclidean distance from this centroid to every course
    distances = np.linalg.norm(course_embeddings_scaled - centroid, axis=1)
    member_indices = np.flatnonzero(cluster_labels == cluster_id)
    if member_indices.size:
        closest_course_idx = member_indices[np.argmin(distances[member_indices])]
    else:
        # Empty cluster: fall back to the globally nearest course
        closest_course_idx = np.argmin(distances)
    closest_courses.append(course_codes[closest_course_idx])

# Step 6: Build the per-course table: cluster id plus that cluster's representative course code
course_clusters = pd.DataFrame({
    'Course Code': course_codes,
    'Cluster': cluster_labels
})
course_clusters['Centroid course code'] = [closest_courses[label] for label in course_clusters['Cluster']]

# Step 7: Persist the table as JSON Lines (one record per line)
course_clusters.to_json("./data/with_sents/course_clusters.json", orient='records', lines=True)


In [2]:
import numpy as np
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to embed user query
def embed_query(query, model):
    """Encode ``query`` with ``model`` and return whatever the model produces.

    ``model`` must expose ``encode(text, convert_to_numpy=...)``; the flag is
    always passed as ``True``.
    """
    query_vector = model.encode(query, convert_to_numpy=True)
    return query_vector

# Load the unique centroid course codes from the cluster file (JSON Lines: one JSON object per line)
def load_cluster_data(file_path):
    """Collect the distinct centroid course codes from a JSON Lines cluster file.

    Each non-blank line of ``file_path`` is parsed as one JSON object and its
    ``"Centroid course code"`` value is recorded.

    Returns:
        list: the unique centroid course codes in order of first appearance,
        so the result is identical on every run.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``"Centroid course code"`` key.
    """
    centroids = {}  # dict used as an insertion-ordered set
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank or trailing empty lines
            cluster_data = json.loads(line)  # Each line is a JSON object
            centroids[cluster_data["Centroid course code"]] = None
    return list(centroids)

# Load course embeddings from a JSON file
def load_embeddings(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its contents."""
    with open(file_path, "r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

# Extract the embeddings for the unique centroids
def get_centroid_embeddings(cluster_file_path, embedding_file_path):
    """Map each centroid course code to its embedding.

    The centroid codes come from ``load_cluster_data(cluster_file_path)`` and the
    course records from ``load_embeddings(embedding_file_path)``. Records whose
    ``"course_code"`` is not a centroid code are skipped; a centroid code with no
    matching record simply does not appear in the result.

    Returns:
        dict: ``{course_code: embedding}``, keyed in the order the matching
        records occur in the embeddings file.
    """
    wanted_codes = load_cluster_data(cluster_file_path)
    all_courses = load_embeddings(embedding_file_path)

    # Keep only the records that belong to a centroid course
    return {
        record["course_code"]: record["embedding"]
        for record in all_courses
        if record["course_code"] in wanted_codes
    }


# Get the embeddings of the cluster centroids
cluster_file = "data/with_sents/course_clusters.json"
embedding_file = "data/with_sents/course_embed_pos_sent.json"
centroid_embeddings = get_centroid_embeddings(cluster_file, embedding_file)

# Fail fast (before loading the model or prompting) if there is nothing to compare against
if not centroid_embeddings:
    raise ValueError(
        f"No centroid embeddings found using {cluster_file!r} and {embedding_file!r}"
    )

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Get the user query.
# NOTE: input() blocks until the user types something, so this cell cannot run unattended.
user_query = input("Enter your query: ")  # Example: "Machine Learning and AI"
query_embedding = embed_query(user_query, model)

# Step 1: Compute cosine similarity between the query embedding and all centroid embeddings.
# Keys and values are taken from the same dict, so index i refers to the same course in both.
# NOTE(review): assumes the stored course embeddings have the same dimensionality / vector
# space as this model's output — TODO confirm how the embeddings file was produced.
centroid_course_codes = list(centroid_embeddings.keys())  # Corresponding course codes of centroids
centroid_embeddings_array = np.asarray(list(centroid_embeddings.values()))  # One row per centroid

# cosine_similarity returns a (1, n_centroids) matrix; take its only row
similarity_scores = cosine_similarity([query_embedding], centroid_embeddings_array)[0]

# Step 2: Get the top k most similar centroids.
# k is clamped to the number of available centroids so the header below never overstates
# how many results are listed.
k = min(5, len(centroid_course_codes))
top_k_indices = np.argsort(similarity_scores)[::-1][:k]  # Indices sorted by descending similarity

# Step 3: Print the top k most similar centroids along with their similarity scores
print(f"Top {k} most similar centroids to your query:")
for idx in top_k_indices:
    course_code = centroid_course_codes[idx]
    score = similarity_scores[idx]
    print(f"Course Code: {course_code}, Similarity Score: {score:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Top 5 most similar centroids to your query:
Course Code: INFR11223, Similarity Score: 0.2442
Course Code: INFR10086, Similarity Score: 0.1974
Course Code: INFR08025, Similarity Score: 0.0637
Course Code: INFR11244, Similarity Score: 0.0545
Course Code: INFR11077, Similarity Score: 0.0082
