In [9]:
# KMEANS
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import json

# Load course embeddings from JSON
def load_embeddings(file_path):
    """Read the JSON document stored at ``file_path`` and return it parsed.

    The file is opened as UTF-8 text and the value produced by ``json.load``
    is returned unchanged.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

# Load the course embeddings (a list of records with "course_code" and "embedding")
embedding_file = "./data/with_sents/course_embed_pos_sent.json"
course_data = load_embeddings(embedding_file)

# Keep codes and vectors in the same order so row i of the matrix is course_codes[i]
course_codes = [course["course_code"] for course in course_data]
course_embeddings = np.array([course["embedding"] for course in course_data])

# Step 1: Standardize the course embeddings
scaler = StandardScaler()
course_embeddings_scaled = scaler.fit_transform(course_embeddings)

# Step 2: Use the first N (scaled) course embeddings as the initial centroids
n_clusters = 20  # Number of clusters
if len(course_codes) < n_clusters:
    # Fail early with a clear message instead of an opaque init-shape error from KMeans
    raise ValueError(
        f"Need at least {n_clusters} courses to form {n_clusters} clusters, got {len(course_codes)}"
    )
initial_centroids = course_embeddings_scaled[:n_clusters]

# Step 3: Perform K-means clustering with the predefined initial centroids
# (n_init=1 because the initialisation is fixed, so restarts would be identical)
kmeans = KMeans(n_clusters=n_clusters, init=initial_centroids, n_init=1, random_state=42)
kmeans.fit(course_embeddings_scaled)

# Step 4: Get the cluster labels and centroids
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Step 5: Represent every centroid by a real course.
# The representative is the nearest course *inside the cluster*: the globally
# nearest course may be assigned to a different cluster, which would give two
# clusters the same representative and mislabel this one.
closest_courses = []
for cluster_id, centroid in enumerate(centroids):
    # Euclidean distance from this centroid to every course (scaled space)
    distances = np.linalg.norm(course_embeddings_scaled - centroid, axis=1)
    member_mask = cluster_labels == cluster_id
    if member_mask.any():
        # Push non-members to +inf so argmin can only select a member
        distances = np.where(member_mask, distances, np.inf)
    # If the cluster has no members, fall back to the globally nearest course
    closest_course_idx = int(np.argmin(distances))
    closest_courses.append(course_codes[closest_course_idx])

# Step 6: One row per course with its cluster id
course_clusters = pd.DataFrame({
    'Course Code': course_codes,
    'Cluster': cluster_labels
})

# Attach the representative ("centroid") course code of each row's cluster
course_clusters['Centroid course code'] = [closest_courses[label] for label in course_clusters['Cluster']]

# Step 7: Persist as JSON Lines (one record per line) for the query cells
course_clusters.to_json("./data/with_sents/course_clusters.json", orient='records', lines=True)


In [11]:
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to embed user query
def embed_query(query, model):
    """Encode ``query`` with ``model`` and return the encoder's result.

    ``model`` must provide ``encode(text, convert_to_numpy=...)``; NumPy
    output is requested.
    """
    query_vector = model.encode(query, convert_to_numpy=True)
    return query_vector

# Load course cluster data from a JSON Lines file (one JSON object per line)
def load_cluster_data(file_path):
    """Group the rows of a JSON Lines cluster file by cluster id.

    Every non-blank line must be a JSON object with the keys ``"Cluster"``,
    ``"Course Code"`` and ``"Centroid course code"``. Blank lines are skipped.

    Args:
        file_path: Path of the JSON Lines file to read (UTF-8).

    Returns:
        dict mapping each cluster id to
        ``{"centroid": <centroid course code>, "courses": [<course code>, ...]}``.
        The centroid comes from the first row seen for the cluster and the
        course list keeps file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks one of the expected keys.
    """
    clusters = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # json.loads("") raises; tolerate blank or trailing empty lines
                continue
            row = json.loads(line)
            entry = clusters.setdefault(
                row["Cluster"],
                {"centroid": row["Centroid course code"], "courses": []},
            )
            entry["courses"].append(row["Course Code"])

    return clusters

# Load course embeddings from a JSON file
def load_embeddings(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content."""
    with open(file_path, "r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

# Extract the embeddings for the unique centroids
def get_centroid_embeddings(cluster_file_path, embedding_file_path):
    """Collect the embeddings of the courses that act as cluster centroids.

    Args:
        cluster_file_path: Path passed to ``load_cluster_data``.
        embedding_file_path: Path passed to ``load_embeddings``; the loaded
            value is iterated as records with ``"course_code"`` and
            ``"embedding"`` keys.

    Returns:
        A ``(centroids_embeddings, clusters)`` tuple where
        ``centroids_embeddings`` maps each centroid course code found in the
        embeddings file to its embedding (in embeddings-file order) and
        ``clusters`` is the mapping returned by ``load_cluster_data``.
    """
    clusters = load_cluster_data(cluster_file_path)
    embeddings_data = load_embeddings(embedding_file_path)

    # Build the set of centroid codes once; rebuilding a list for every course
    # made the lookup cost grow with (courses x clusters).
    centroid_codes = {cluster["centroid"] for cluster in clusters.values()}

    centroids_embeddings = {
        course["course_code"]: course["embedding"]
        for course in embeddings_data
        if course["course_code"] in centroid_codes
    }

    return centroids_embeddings, clusters


# Input files: cluster assignments (JSON Lines) and per-course embeddings (JSON)
cluster_file = "data/with_sents/course_clusters.json"
embedding_file = "data/with_sents/course_embed_pos_sent.json"
centroid_embeddings, clusters = get_centroid_embeddings(cluster_file, embedding_file)

# Sentence encoder used to embed the free-text query
model = SentenceTransformer("all-MiniLM-L6-v2")

# Ask for the query text and embed it
user_query = input("Enter your query: ")  # Example: "Machine Learning and AI"
query_embedding = embed_query(user_query, model)

# Step 1: Score the query against every centroid course.
# Keys and values of the same dict iterate in the same order, so position i
# in the codes list matches position i in the embeddings list.
centroid_course_codes = list(centroid_embeddings.keys())
centroid_embeddings_array = list(centroid_embeddings.values())
similarity_scores = cosine_similarity([query_embedding], centroid_embeddings_array)[0]

# Step 2: Indices of the k best-scoring centroids, best first
k = 2
top_k_indices = np.argsort(similarity_scores)[-k:][::-1]

# Step 3: Report each selected centroid followed by the courses of its cluster(s)
print(f"Top {k} most similar centroids to your query:")

for idx in top_k_indices:
    course_code, score = centroid_course_codes[idx], similarity_scores[idx]
    print(f"\nCentroid Course Code: {course_code}, Similarity Score: {score:.4f}")

    for cluster_code, cluster_data in clusters.items():
        if cluster_data["centroid"] != course_code:
            continue
        print(f"  Cluster Courses: {', '.join(cluster_data['courses'])}")


Top 2 most similar centroids to your query:

Centroid Course Code: INFR10061, Similarity Score: 0.4037
  Cluster Courses: INFR08025, INFR10065, INFR10061, INFR11023, INFR11114

Centroid Course Code: INFR08020, Similarity Score: 0.3950
  Cluster Courses: INFR08020, INFR10078, INFR11125, INFR11033, INFR11209, INFR11212, INFR11194, INFR11140, INFR11157, INFR11210


In [14]:
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to embed user query
def embed_query(query, model):
    """Return ``model``'s encoding of ``query``, requested as NumPy output.

    ``model`` is any object exposing ``encode(text, convert_to_numpy=...)``.
    """
    encoded = model.encode(query, convert_to_numpy=True)
    return encoded

# Load course cluster data from a JSON Lines file (one JSON object per line)
def load_cluster_data(file_path):
    """Read a JSON Lines cluster file and group its courses per cluster.

    Each non-blank line is parsed as a JSON object carrying the keys
    ``"Cluster"``, ``"Course Code"`` and ``"Centroid course code"``.
    Blank lines are ignored.

    Args:
        file_path: Path of the JSON Lines file to read (UTF-8).

    Returns:
        dict keyed by cluster id; each value is
        ``{"centroid": <centroid course code>, "courses": [<course code>, ...]}``
        with the centroid taken from the cluster's first row and courses in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks one of the expected keys.
    """
    clusters = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            text = raw_line.strip()
            if not text:
                # An empty string is not valid JSON; skip blank lines instead of crashing
                continue

            cluster_data = json.loads(text)
            cluster_code = cluster_data["Cluster"]
            course_code = cluster_data["Course Code"]
            centroid_code = cluster_data["Centroid course code"]

            # First row of a cluster creates its entry
            if cluster_code not in clusters:
                clusters[cluster_code] = {"centroid": centroid_code, "courses": []}
            clusters[cluster_code]["courses"].append(course_code)

    return clusters

# Load course embeddings from a JSON file
def load_embeddings(file_path):
    """Return the parsed content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, "r", encoding="utf-8") as stream:
        content = json.load(stream)
    return content

# Extract the embeddings for the unique centroids
def get_centroid_embeddings(cluster_file_path, embedding_file_path):
    """Return the embeddings of all centroid courses plus the cluster mapping.

    Args:
        cluster_file_path: Path handed to ``load_cluster_data``.
        embedding_file_path: Path handed to ``load_embeddings``; its result is
            iterated as records with ``"course_code"`` and ``"embedding"``.

    Returns:
        ``(centroids_embeddings, clusters)``: a dict from centroid course code
        to embedding (only codes present in the embeddings file, in that
        file's order) and the dict produced by ``load_cluster_data``.
    """
    clusters = load_cluster_data(cluster_file_path)
    embeddings_data = load_embeddings(embedding_file_path)

    # Compute the centroid codes once so each course needs a single set lookup
    # rather than a freshly built list scan.
    centroid_codes = {cluster["centroid"] for cluster in clusters.values()}

    centroids_embeddings = {}
    for course in embeddings_data:
        code = course["course_code"]
        if code in centroid_codes:
            centroids_embeddings[code] = course["embedding"]

    return centroids_embeddings, clusters


# Get the embeddings of the cluster centroids and cluster info
cluster_file = "data/with_sents/course_clusters.json"
embedding_file = "data/with_sents/course_embed_pos_sent.json"
centroid_embeddings, clusters = get_centroid_embeddings(cluster_file, embedding_file)

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Build the query as "liked statement" minus "disliked statement".
# No interactive prompt is used: the query is fully defined by the two
# statements below, so the cell runs unattended and reproducibly.
liked_statement = 'I love Natural Language Processing'
disliked_statement = 'I Hate Machine Learning Practica'
query_embedding1 = embed_query(liked_statement, model)
query_embedding2 = embed_query(disliked_statement, model)

query_embedding = query_embedding1 - query_embedding2

# Step 1: Compute cosine similarity between the query embedding and all centroid embeddings
centroid_embeddings_array = list(centroid_embeddings.values())  # Centroid embeddings
centroid_course_codes = list(centroid_embeddings.keys())  # Matching centroid course codes (same order)

similarity_scores = cosine_similarity([query_embedding], centroid_embeddings_array)[0]

# Step 2: Get the top 2 most similar centroids (best first)
k = 2
top_k_indices = np.argsort(similarity_scores)[-k:][::-1]

# Step 3: Collect the courses in the top-2 clusters
courses_in_top_clusters = []
for idx in top_k_indices:
    course_code = centroid_course_codes[idx]

    # Every cluster represented by this centroid contributes its courses
    for cluster_code, cluster_data in clusters.items():
        if cluster_data["centroid"] == course_code:
            courses_in_top_clusters.extend(cluster_data["courses"])

# Step 4: Get embeddings for courses in the top-2 clusters
# (set membership avoids scanning the course list once per embedding record)
top_cluster_course_set = set(courses_in_top_clusters)
courses_embeddings = {
    course["course_code"]: course["embedding"]
    for course in load_embeddings(embedding_file)
    if course["course_code"] in top_cluster_course_set
}

# Step 5: Compute cosine similarity between the query and every candidate course
course_embeddings_array = list(courses_embeddings.values())  # Candidate course embeddings
course_codes = list(courses_embeddings.keys())  # Matching course codes (same order)

course_similarity_scores = cosine_similarity([query_embedding], course_embeddings_array)[0]

# Step 6: Get the top 10 most similar courses (fewer if there are fewer candidates)
top_10_indices = np.argsort(course_similarity_scores)[-10:][::-1]

# Step 7: Print the top 10 most similar courses along with their similarity scores
print("\nTop 10 most similar courses to your query:")
for idx in top_10_indices:
    course_code = course_codes[idx]
    score = course_similarity_scores[idx]
    print(f"Course Code: {course_code}, Similarity Score: {score:.4f}")



Top 10 most similar courses to your query:
Course Code: INFR10078, Similarity Score: 0.2877
Course Code: INFR11157, Similarity Score: 0.2810
Course Code: INFR11125, Similarity Score: 0.2726
Course Code: INFR11194, Similarity Score: 0.2566
Course Code: INFR08020, Similarity Score: 0.1062
Course Code: INFR09032, Similarity Score: 0.0695
Course Code: INFR11017, Similarity Score: 0.0583
Course Code: INFR11033, Similarity Score: 0.0580
Course Code: INFR11210, Similarity Score: 0.0007
Course Code: INFR11209, Similarity Score: -0.0130
