# Install necessary packages

In [42]:
!pip install --upgrade pip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
!pip install torch numpy pandas scikit-learn plotly nltk transformers sentence-transformers einops datasets gradio networkx umap-learn ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Import Libraries

In [44]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from IPython.display import display
import umap

# Download NLTK resources

In [45]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Define stopwords
stop_words = set(stopwords.words('french'))

def remove_stopwords(text):
    # Tokenize into words
    words = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic tokens
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

In [47]:
def tokenize_sentences(text):
    sentences = sent_tokenize(text, language='french')
    return ' '.join(sentences)

In [48]:
def preprocess_text(text):
    # Remove stopwords
    no_stopwords = remove_stopwords(text)
    return no_stopwords

# Load MiniLM model and tokenizer for generating embeddings

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
# tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
# model = AutoModel.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
# # Move model to GPU if available for faster computation
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# Load the CSV files to inspect their contents

In [50]:
current_dir = os.getcwd() + '/notebook/data'

In [51]:
program_df = pd.read_csv(os.path.join(current_dir, 'Program.csv'))
program_course_df = pd.read_csv(os.path.join(current_dir, 'ProgramCourse.csv'))
program_type_df = pd.read_csv(os.path.join(current_dir, 'ProgramType.csv'))
course_df = pd.read_csv(os.path.join(current_dir, 'Course.csv'))

# Inspect the DataFrames

In [52]:
print("Program DataFrame Head:")
display(program_df.head())

Program DataFrame Head:


Unnamed: 0,code,credits,horaireCoursPdfJson,planificationPdfJson,createdAt,updatedAt,title,url,cycle,id
0,648,15 crédits,,,2024-10-17 04:32:25.190,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en géni...,https://www.etsmtl.ca/programmes-formations/pr...,2,183146
1,569,15 crédits,,,2024-10-17 04:32:25.191,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en géni...,https://www.etsmtl.ca/programmes-formations/pr...,2,183156
2,514,15 crédits,,,2024-10-17 04:32:25.191,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en gest...,https://www.etsmtl.ca/programmes-formations/pr...,2,183236
3,3294,30 crédits,,,2024-10-17 04:32:25.184,2024-11-02 01:50:20.408,DESS en projets internationaux et ingénierie g...,https://www.etsmtl.ca/programmes-formations/de...,2,183498
4,6646,"90 crédits, incluant 9 crédits de stage",,,2024-10-17 04:32:25.184,2024-11-02 01:50:20.408,Baccalauréat en informatique distribuée,https://www.etsmtl.ca/programmes-formations/ba...,1,182928


In [53]:
print("\nProgramCourse DataFrame Head:")
display(program_course_df.head())


ProgramCourse DataFrame Head:


Unnamed: 0,createdAt,updatedAt,typicalSessionIndex,courseId,programId,type
0,2024-11-02 01:50:21.887,2024-11-02 01:50:21.887,1,351029,183256,
1,2024-11-02 01:50:21.913,2024-11-02 01:50:21.913,3,353404,183040,TRONC
2,2024-11-02 01:50:21.958,2024-11-02 01:50:21.958,4,352245,182976,TRONC
3,2024-11-02 01:50:21.973,2024-11-02 01:50:21.973,4,351827,182976,TRONC
4,2024-11-02 01:50:21.981,2024-11-02 01:50:21.981,5,353458,182976,TRONC


In [54]:
print("\nProgramType DataFrame Head:")
display(program_type_df.head())


ProgramType DataFrame Head:


Unnamed: 0,id,title
0,697435,Maîtrise avec projet
1,738239,Microprogramme
2,697451,Maîtrise avec mémoire
3,915770,Concentration en technologies de la santé
4,697388,Doctorat


In [55]:
print("\nCourse DataFrame Head:")
display(course_df.head())


Course DataFrame Head:


Unnamed: 0,code,title,description,credits,createdAt,updatedAt,id,cycle
0,ATE800E,Academic Integrity : Concepts and Techniques,The ATE800 workshop must be passed in the firs...,0,2024-11-02 01:50:21.499,2024-11-02 01:50:21.569,407641,2
1,ELE735,Analyse numérique,"Au terme de ce cours, l'étudiante ou l'étudian...",3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,350543,1
2,MTI850,Analytiques des données massives,Ce cours présente les concepts pour effectuer ...,3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,353344,2
3,CHM015,Chimie préparatoire pour le génie (hors progra...,Ce cours vise à initier l’étudiante ou l'étudi...,3,2024-11-02 01:50:21.503,2024-11-02 01:50:21.773,349708,1
4,ELE752,Appareillage électrique,"Au terme de ce cours, l'étudiante ou l'étudian...",3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,350599,1


In [56]:
print("\nDataFrame Columns:")
print("Program:", program_df.columns.tolist())
print("ProgramCourse:", program_course_df.columns.tolist())
print("ProgramType:", program_type_df.columns.tolist())
print("Course:", course_df.columns.tolist())


DataFrame Columns:
Program: ['code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt', 'updatedAt', 'title', 'url', 'cycle', 'id']
ProgramCourse: ['createdAt', 'updatedAt', 'typicalSessionIndex', 'courseId', 'programId', 'type']
ProgramType: ['id', 'title']
Course: ['code', 'title', 'description', 'credits', 'createdAt', 'updatedAt', 'id', 'cycle']


# Define a Function to Generate Embeddings

In [57]:
# Initialize device outside the embedding function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [58]:
def generate_embeddings(text_list, tokenizer, model, batch_size=32):
    model.eval()  # Set model to evaluation mode
    embeddings = []
    with torch.no_grad():  # Disable gradient calculation
        for i in range(0, len(text_list), batch_size):
            batch_texts = text_list[i:i+batch_size]
            # Tokenize the input texts
            tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            # Move tensors to the appropriate device (already set)
            tokens = {k: v.to(device) for k, v in tokens.items()}
            # Forward pass to get hidden states
            outputs = model(**tokens)
            # Simple mean pooling
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.append(batch_embeddings.cpu())
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings.numpy()

# Generate Embeddings for Programs

In [59]:
# List of columns to include
columns_to_include = [
    'title', 'code', 'cycle', 'url', 'id'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    program_df[col] = program_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
program_texts = program_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts
program_embeddings = generate_embeddings(program_texts.tolist(), tokenizer, model)

# Add embeddings to program_df
program_df['vector'] = list(program_embeddings)


# Visualize Program Embeddings using PCA

In [60]:
pca = PCA(n_components=2)
program_pca_result = pca.fit_transform(program_embeddings)
program_df['pca-one-program'] = program_pca_result[:, 0]
program_df['pca-two-program'] = program_pca_result[:, 1]

# Plot the program embeddings
fig = px.scatter(
    program_df,
    x='pca-one-program',
    y='pca-two-program',
    hover_data=['title'],
    title='Program Embeddings PCA'
)
fig.show()


# Visualize Program Embeddings using t-SNE

In [61]:
tsne = TSNE(n_components=2, perplexity=30, max_iter=1000, random_state=42)
program_tsne_result = tsne.fit_transform(program_embeddings)
program_df['tsne-one-program'] = program_tsne_result[:, 0]
program_df['tsne-two-program'] = program_tsne_result[:, 1]

# Plot the program embeddings using t-SNE results
fig = px.scatter(
    program_df,
    x='tsne-one-program',
    y='tsne-two-program',
    hover_data=['title'],
    title='Program Embeddings t-SNE'
)
fig.show()

# UMAP

In [62]:
# Initialize UMAP with desired parameters
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the program embeddings
program_umap_result = umap_projection.fit_transform(program_embeddings)

# Add UMAP results to your DataFrame
program_df['umap-one-program'] = program_umap_result[:, 0]
program_df['umap-two-program'] = program_umap_result[:, 1]

# Plot the UMAP projection results
fig = px.scatter(
    program_df,
    x='umap-one-program',
    y='umap-two-program',
    hover_data=['title'],
    title='Program Embeddings UMAP Projection'
)
fig.show()




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



#  Generate Embeddings for Courses

In [63]:
# List of columns to include
columns_to_include = [
    'code', 'title', 'description', 'cycle', 'credits'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    course_df[col] = course_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
course_texts = course_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts
course_embeddings = generate_embeddings(course_texts.tolist(), tokenizer, model)

# Add embeddings to program_df
course_df['vector'] = list(course_embeddings)

vectors = np.stack(course_df['vector'].values)

# Visualize Course Embeddings using PCA

In [64]:
pca = PCA(n_components=2)
course_pca_result = pca.fit_transform(course_embeddings)
course_df['pca-one-course'] = course_pca_result[:, 0]
course_df['pca-two-course'] = course_pca_result[:, 1]

# Plot the course embeddings using PCA
fig = px.scatter(
    course_df,
    x='pca-one-course',
    y='pca-two-course',
    hover_data=['title'],
    title='Course Embeddings PCA'
)
fig.show()

# Visualize Course Embeddings using t-SNE

In [65]:
tsne = TSNE(n_components=2, perplexity=30, max_iter=1000, random_state=42)
course_tsne_result = tsne.fit_transform(course_embeddings)
course_df['tsne-one-course'] = course_tsne_result[:, 0]
course_df['tsne-two-course'] = course_tsne_result[:, 1]

# Plot the course embeddings using t-SNE
fig = px.scatter(
    course_df,
    x='tsne-one-course',
    y='tsne-two-course',
    hover_data=['title'],
    title='Course Embeddings t-SNE'
)
fig.show()

# UMAP

In [66]:
# Initialize UMAP with desired parameters
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the course embeddings
course_umap_result = umap_projection.fit_transform(course_embeddings)

# Add UMAP results to your DataFrame
course_df['umap-one-course'] = course_umap_result[:, 0]
course_df['umap-two-course'] = course_umap_result[:, 1]

# Plot the course embeddings using UMAP results
fig = px.scatter(
    course_df,
    x='umap-one-course',
    y='umap-two-course',
    hover_data=['title'],
    title='Course Embeddings UMAP Projection'
)
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



# Connect the Programs and Courses using Embeddings

In [67]:
# Ensure consistent data types for merge keys
program_course_df['courseId'] = program_course_df['courseId'].astype(str)
course_df['id'] = course_df['id'].astype(str)

program_course_df['programId'] = program_course_df['programId'].astype(str)
program_df['id'] = program_df['id'].astype(str)


In [68]:

# Merge DataFrames with suffixes to differentiate columns
combined_df = pd.merge(
    program_course_df,
    program_df,
    left_on='programId',
    right_on='id',
    how='left',
    suffixes=('_program_course', '_program')
)

# Print columns after first merge
print("Columns after merging program_course_df and program_df:")
print(combined_df.columns.tolist())


Columns after merging program_course_df and program_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program']


In [69]:

# Merge with course_df
combined_df = pd.merge(
    combined_df,
    course_df,
    left_on='courseId',
    right_on='id',
    how='left',
    suffixes=('', '_course')
)

# Print columns after second merge
print("Columns after merging with course_df:")
print(combined_df.columns.tolist())


Columns after merging with course_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program', 'code_course', 'title_course', 'description', 'credits_course', 'createdAt', 'updatedAt', 'id_course', 'cycle_course', 'vector_course', 'pca-one-course', 'pca-two-course', 'tsne-one-course', 'tsne-two-course', 'umap-one-course', 'umap-two-course']


In [70]:

# Optional: Rename columns for clarity
combined_df.rename(columns={
    'title': 'title_program',
    'code': 'code_program',
    'cycle': 'cycle_program',
    'credits': 'credits_program',
    'horaireCoursPdfJson': 'horaireCoursPdfJson_program',
    'title_course': 'title_course',
    'code_course': 'code_course',
    'cycle_course': 'cycle_course',
    'credits_course': 'credits_course',
    'description': 'description_course'
}, inplace=True)

# Updated list of columns to include in the combined text
columns_to_include = [
    'programId', 'courseId', 'type',
    'title_program', 'code_program', 'cycle_program', 'credits_program', 'horaireCoursPdfJson_program',
    'title_course', 'code_course', 'cycle_course', 'credits_course', 'description_course'
]


In [None]:
# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    combined_df[col] = combined_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each record
combined_texts = combined_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts
combined_embeddings = generate_embeddings(combined_texts.tolist(), tokenizer, model)

# Add embeddings to combined_df
combined_df['vector'] = list(combined_embeddings)


In [None]:
pca = PCA(n_components=2)
combined_pca_result = pca.fit_transform(combined_embeddings)
combined_df['pca-one-combined'] = combined_pca_result[:, 0]
combined_df['pca-two-combined'] = combined_pca_result[:, 1]

# Plot the program embeddings
fig = px.scatter(
    combined_df,
    x='pca-one-combined',
    y='pca-two-combined',
    color='title_program',
    hover_data=['title_program', 'title_course'],
    title='Combined Embeddings PCA'
)
fig.show()

In [None]:
tsne = TSNE(n_components=2, perplexity=50, max_iter=1000, random_state=42)
combined_tsne_result = tsne.fit_transform(combined_embeddings)
combined_df['tsne-one-combined'] = combined_tsne_result[:, 0]
combined_df['tsne-two-combined'] = combined_tsne_result[:, 1]

# Plot the program embeddings using t-SNE results
fig = px.scatter(
    combined_df,
    x='tsne-one-combined',
    y='tsne-two-combined',
    color='title_program',
    hover_data=['title_program', 'title_course'],
    title='Combined Embeddings t-SNE'
)
fig.show()

In [None]:
# Check the number of unique programs
num_unique_programs = combined_df['title_program'].nunique()
print(f"Number of unique programs: {num_unique_programs}")

# Initialize UMAP with desired parameters
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the combined embeddings
combined_umap_result = umap_projection.fit_transform(combined_embeddings)

# Add UMAP results to the DataFrame
combined_df['umap-one-combined'] = combined_umap_result[:, 0]
combined_df['umap-two-combined'] = combined_umap_result[:, 1]

# Plot the combined embeddings using UMAP
fig = px.scatter(
    combined_df,
    x='umap-one-combined',
    y='umap-two-combined',
    color='title_program',
    hover_data=['title_program', 'title_course'],
    title='Combined Embeddings UMAP Colored by Program'
)
fig.show()


Number of unique programs: 44



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



# Implement KNN to Find Similar Courses

In [None]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(vectors)

def find_similar_courses(index, n_neighbors=5):
    course_embedding = vectors[index].reshape(1, -1)
    distances, indices = knn.kneighbors(course_embedding, n_neighbors=n_neighbors+1)
    similar_courses = []
    for i in range(1, len(indices[0])):
        idx = indices[0][i]
        similar_courses.append({
            'course_id': course_df.iloc[idx]['id'],
            'title': course_df.iloc[idx]['title'],
            'cycle': course_df.iloc[idx]['cycle'],
            'distance': distances[0][i]
        })
        
    return similar_courses


In [None]:
# Example usage
index_to_query = 2
similar_courses = find_similar_courses(index_to_query, n_neighbors=5)

queried_course = course_df.iloc[index_to_query]
print(f"Courses similar to '{queried_course['title']}' in cycle '{queried_course['cycle']} (ID: {queried_course['description']}):")
for course in similar_courses:
    print(f"- {course['title']} (Cycle: {course['cycle']}, Description: {course['course_id']}, Distance: {course['distance']:.4f})")

Courses similar to 'Analytiques des données massives' in cycle '2 (ID: Ce cours présente les concepts pour effectuer une analyse statistique de très grands ensembles de données qui ne tiennent pas sur un seul ordinateur. Ce cours vous permettra développer vos connaissances en analyse de données massives et améliorerez vos compétences en programmation et en mathématiques. Vous apprendrez à utiliser des outils analytiques essentiels pour l’analyse statistique des données massives. Plusieurs problèmes applicatifs seront étudiés et différentes...):
- Introduction à l’analyse des mégadonnées (Cycle: 1, Description: 352303, Distance: 0.2993)
- Apprentissage machine quantique (Cycle: 1, Description: 352077, Distance: 0.3420)
- Méthodes expérimentales en géotechnique (Cycle: 2, Description: 353210, Distance: 0.3552)
- Sujets spéciaux en entreprise numérique (Cycle: 2, Description: 350757, Distance: 0.3675)
- Base de données (Cycle: 1, Description: 351653, Distance: 0.3733)


# Recommandation

In [None]:
# Extract embeddings from combined_df
combined_vectors = np.stack(combined_df['vector'].values)

# Initialize KNN with cosine distance
knn_combined = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit KNN on combined embeddings
knn_combined.fit(combined_vectors)

def find_similar_combinations(index, n_neighbors=5):
    """
    Find similar program-course combinations based on embeddings.

    Parameters:
    - index (int): Index of the record in combined_df to query.
    - n_neighbors (int): Number of similar combinations to retrieve.

    Returns:
    - List of dictionaries containing details of similar combinations.
    """
    # Validate the index
    if index < 0 or index >= len(combined_df):
        raise IndexError("Index out of bounds for combined_df.")

    # Get the embedding for the specified index
    combination_embedding = combined_vectors[index].reshape(1, -1)

    # Find nearest neighbors (including the query itself)
    distances, indices = knn_combined.kneighbors(combination_embedding, n_neighbors=n_neighbors + 1)

    similar_combinations = []
    for i in range(1, len(indices[0])):  # Start from 1 to exclude the query itself
        idx = indices[0][i]
        similar_combinations.append({
            'program_id': combined_df.iloc[idx]['programId'],
            'program_title': combined_df.iloc[idx]['title_program'],
            'course_id': combined_df.iloc[idx]['courseId'],
            'course_title': combined_df.iloc[idx]['title_course'],
            'cycle': combined_df.iloc[idx]['cycle_course'],
            'distance': distances[0][i]
        })
    return similar_combinations


In [None]:
# Example usage
index_to_query = 2  # Change this index based on your data

try:
    # Find similar program-course combinations
    similar_combinations = find_similar_combinations(index_to_query, n_neighbors=5)

    # Get details of the queried combination
    queried_combination = combined_df.iloc[index_to_query]

    print(f"Program-Course combinations similar to '{queried_combination['title_program']}' "
          f"program and '{queried_combination['title_course']}' course "
          f"in cycle '{queried_combination['cycle_course']}' (Program ID: {queried_combination['programId']}, "
          f"Course ID: {queried_combination['courseId']}):\n")

    for combo in similar_combinations:
        print(f"- Program: {combo['program_title']} (ID: {combo['program_id']}), "
              f"Course: {combo['course_title']} (ID: {combo['course_id']}), "
              f"Cycle: {combo['cycle']}, Distance: {combo['distance']:.4f}")
except IndexError as e:
    print(f"Error: {e}")


Program-Course combinations similar to 'Certificat en gestion immobilière' program and 'Gestion des actifs immobiliers' course in cycle '1' (Program ID: 182976, Course ID: 352245):

- Program: Programme court en gestion immobilière (ID: 183016), Course: Gestion des actifs immobiliers (ID: 352245), Cycle: 1, Distance: 0.0255
- Program: Programme court en gestion industrielle (ID: 183024), Course: Gestion du personnel et relations industrielles (ID: 351827), Cycle: 1, Distance: 0.2179
- Program: Baccalauréat en génie des opérations et de la logistique (ID: 182880), Course: Conseils et spécificités sectorielles (ID: 351261), Cycle: 1, Distance: 0.2315
- Program: Certificat en gestion immobilière (ID: 182976), Course: Gestion du personnel et relations industrielles (ID: 351827), Cycle: 1, Distance: 0.2358
- Program: Programme court en gestion immobilière (ID: 183016), Course: Gestion du personnel et relations industrielles (ID: 351827), Cycle: 1, Distance: 0.2382


# One program recommandation of courses

In [None]:
def tokenize_sentences(text):
    """
    Tokenizes the input text into sentences using NLTK's sent_tokenize for French.
    """
    sentences = sent_tokenize(text, language='french')
    return ' '.join(sentences)

def preprocess_text_input(text):
    """
    Preprocesses the input text by removing stopwords and tokenizing.
    """
    sentences = tokenize_sentences(text)
    preprocessed = preprocess_text(sentences)
    return preprocessed

def embed_text(text):
    """
    Generates embedding for a single piece of text.
    """
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        tokens = {k: v.to(device) for k, v in tokens.items()}
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

def find_similar_combinations_within_program(program_id, text_input=None, n_neighbors=5):
    """
    Find similar program-course combinations within a specific program based on an optional text input.

    Parameters:
    - program_id (str): Identifier of the program to restrict the search.
    - text_input (str): Optional text input to base similarity on.
    - n_neighbors (int): Number of similar combinations to retrieve.

    Returns:
    - List of dictionaries containing details of similar combinations.
    """
    # Step 1: Filter the DataFrame to include only the specified program
    program_subset = combined_df[combined_df['programId'] == program_id]

    if program_subset.empty:
        raise ValueError(f"No data found for program ID: {program_id}")

    # Step 2: Extract embeddings for the subset
    subset_vectors = np.stack(program_subset['vector'].values)

    # Step 3: Initialize and fit KNN on the subset
    knn_program = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_program.fit(subset_vectors)

    # Step 4: Handle text input (if provided)
    if text_input:
        # Preprocess and embed the input text
        preprocessed_text = preprocess_text_input(text_input)
        input_embedding = embed_text(preprocessed_text)
    else:
        # If no text input, default to the first combination in the subset
        input_embedding = subset_vectors[0].reshape(1, -1)

    # Step 5: Perform KNN search
    distances, indices = knn_program.kneighbors(input_embedding, n_neighbors=n_neighbors + 1)

    # Step 6: Compile similar combinations (excluding the query itself)
    similar_combinations = []
    for i in range(1, len(indices[0])):  # Start from 1 to exclude the query itself
        idx = indices[0][i]
        combo = program_subset.iloc[idx]
        similar_combinations.append({
            'program_id': combo['programId'],
            'program_title': combo['title_program'],
            'course_id': combo['courseId'],
            'course_title': combo['title_course'],
            'cycle': combo['cycle_course'],
            'distance': distances[0][i]
        })

    return similar_combinations

In [None]:
def visualize_similar_combinations_within_program(program_id, text_input=None, n_neighbors=5):
    try:
        # Find similar combinations
        similar_combinations = find_similar_combinations_within_program(
            program_id=program_id,
            text_input=text_input,
            n_neighbors=n_neighbors
        )

        # Get the course IDs of similar combinations
        similar_course_ids = [combo['course_id'] for combo in similar_combinations]

        # Filter the DataFrame for the specified program
        program_subset = combined_df[combined_df['programId'] == program_id]

        # Create the base t-SNE scatter plot
        fig = px.scatter(
            program_subset,
            x='tsne-one-combined',
            y='tsne-two-combined',
            color='title_program',
            hover_data=['title_course'],
            title=f"'{program_id}' Embeddings with Similar Courses Highlighted",
            labels={
                'tsne-one-combined': 't-SNE Dimension 1',
                'tsne-two-combined': 't-SNE Dimension 2'
            }
        )

        # Highlight similar courses
        similar_df = program_subset[program_subset['courseId'].isin(similar_course_ids)]
        fig.add_trace(
            px.scatter(
                similar_df,
                x='tsne-one-combined',
                y='tsne-two-combined',
                hover_data=['title_course']
            ).data[0]
        )

        fig.show()
    except (IndexError, ValueError) as e:
        print(f"Error: {e}")


In [None]:
# Example usage
program_id_to_query = '182912'
text_input = "histoire de l'art"

try:
    similar_combinations = find_similar_combinations_within_program(
        program_id=program_id_to_query,
        text_input=text_input,
        n_neighbors=5
    )

    queried_program = combined_df[combined_df['programId'] == program_id_to_query].iloc[0]

    print(f"Program-Course combinations within program '{queried_program['title_program']}' similar to the input text:\n")

    for combo in similar_combinations:
        print(f"- Course: {combo['course_title']} (ID: {combo['course_id']}), "
              f"Cycle: {combo['cycle']}, Distance: {combo['distance']:.4f}")
    
    # Optional: Visualize the results
    visualize_similar_combinations_within_program(program_id_to_query, text_input, n_neighbors=5)
except (IndexError, ValueError) as e:
    print(f"Error: {e}")

Program-Course combinations within program 'Baccalauréat en génie des technologies de l'information' similar to the input text:

- Course: Encadrement de la profession et éthique professionnelle (ID: 353396), Cycle: 1, Distance: 0.6488
- Course: Rédaction technique et communication en génie des TI (ID: 349748), Cycle: 1, Distance: 0.6509
- Course: Bases de données multimédias (ID: 351997), Cycle: 1, Distance: 0.6636
- Course: Conception orientée objet (ID: 352405), Cycle: 1, Distance: 0.6680
- Course: Règles de base en santé et sécurité (ID: 787696), Cycle: 1, Distance: 0.6701
