# Install necessary packages

In [1]:
!pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Using cached pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Using cached pip-24.3.1-py3-none-any.whl (1.8 MB)



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: To modify pip, please run the following command:
C:\Program Files\Python312\python.exe -m pip install --upgrade pip


In [2]:
!pip install -U torch numpy==1.26 pandas scikit-learn plotly nltk transformers==4.46.3 sentence-transformers einops datasets gradio networkx umap-learn ipywidgets

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Libraries

In [3]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from IPython.display import display
import umap
from multiprocessing import Process
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Download NLTK resources

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the CSV files to inspect their contents

In [5]:
current_dir = os.path.join(os.getcwd(), 'data')

In [6]:
program_df = pd.read_csv(os.path.join(current_dir, 'Program.csv'))
program_df['title'] = program_df['title'].str.replace(r'<[^>]*>', '', regex=True)

In [7]:
program_course_df = pd.read_csv(os.path.join(current_dir, 'ProgramCourse.csv'))

In [8]:
program_type_df = pd.read_csv(os.path.join(current_dir, 'ProgramType.csv'))

In [9]:
course_df = pd.read_csv(os.path.join(current_dir, 'Course.csv'))

In [10]:
# Define stopwords
stop_words = set(stopwords.words('french'))

def remove_stopwords(text):
    # Tokenize into words
    words = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic tokens
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

In [11]:
def tokenize_sentences(text):
    sentences = sent_tokenize(text, language='french')
    return ' '.join(sentences)

In [12]:
def preprocess_text(text):
    # Remove stopwords
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    remove_stopwords(text)
    return text.strip()
    return no_stopwords

# Load MiniLM model and tokenizer for generating embeddings

In [13]:
models_to_test = [
    # Sentence-Transformers (general-purpose, multilingual)
    ("sentence-transformers/all-MiniLM-L6-v2", "MiniLM"),
    ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "Multilingual MiniLM-L12"),
    ("sentence-transformers/paraphrase-MiniLM-L6-v2", "MiniLM-L6"),
    ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "Multilingual MPNet"),
    ("sentence-transformers/all-distilroberta-v1", "DistilRoBERTa"),
    ("sentence-transformers/all-MiniLM-L12-v2", "MiniLM-L12"),
    
    # MPNet variants
    ("sentence-transformers/paraphrase-MiniLM-L3-v2", "MiniLM-L3"),

    # Large Transformer Models
    ("bert-base-nli-mean-tokens", "BERT Base NLI"),
    ("bert-large-nli-mean-tokens", "BERT Large NLI"),
    ("roberta-large-nli-stsb-mean-tokens", "RoBERTa Large STS-B"),
    ("roberta-base-nli-stsb-mean-tokens", "RoBERTa Base STS-B"),

    # Multilingual Models
    ("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens", "XLM-RoBERTa Base Multilingual"),
    ("sentence-transformers/stsb-xlm-r-multilingual", "STSB XLM-R Multilingual"),

    # Dense Retrieval and Domain-Specific Models
    ("sentence-transformers/msmarco-distilbert-base-tas-b", "MS MARCO DistilBERT"),
    ("sentence-transformers/msmarco-bert-base-dot-v5", "MS MARCO BERT"),
    ("sentence-transformers/msmarco-mpnet-base-dot-v5", "MS MARCO MPNet"),

    # T5 Variants
    ("t5-small", "T5 Small"),
    ("t5-base", "T5 Base"),
    ("t5-large", "T5 Large"),
    ("google/flan-t5-small", "FLAN-T5 Small"),
    ("google/flan-t5-base", "FLAN-T5 Base"),
    ("google/flan-t5-large", "FLAN-T5 Large"),

    # Newer Sentence-Transformers
    ("sentence-transformers/all-roberta-large-v1", "RoBERTa Large v1"),
    ("sentence-transformers/all-mpnet-base-v2", "MPNet Base v2"),
]

models_to_test_french = [
    # Multilingual Sentence-Transformers
    ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "Multilingual MiniLM-L12"),
    ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "Multilingual MPNet"),
    ("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens", "XLM-RoBERTa Base Multilingual"),
    ("sentence-transformers/stsb-xlm-r-multilingual", "STSB XLM-R Multilingual"),

    # T5 Variants (Multilingual Support)
    ("t5-small", "T5 Small"),
    ("t5-base", "T5 Base"),
    ("google/flan-t5-small", "FLAN-T5 Small"),
    ("google/flan-t5-base", "FLAN-T5 Base"),

    # Dense Retrieval Models
    ("sentence-transformers/msmarco-distilbert-base-tas-b", "MS MARCO DistilBERT"),
    ("sentence-transformers/msmarco-mpnet-base-dot-v5", "MS MARCO MPNet"),
]


In [None]:
# Loop through models and download
for model_name, model_label in models_to_test:
    try:
        print(f"Downloading {model_label} ({model_name})...")
        if "sentence-transformers" in model_name:
            model = SentenceTransformer(model_name)  # Sentence-Transformers models
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name)  # Hugging Face tokenizer
            model = AutoModel.from_pretrained(model_name)  # Hugging Face model
        print(f"Successfully downloaded {model_label} ({model_name})!")
    except Exception as e:
        print(f"Failed to download {model_label} ({model_name}). Error: {e}")

Downloading MiniLM (sentence-transformers/all-MiniLM-L6-v2)...
Failed to download MiniLM (sentence-transformers/all-MiniLM-L6-v2). Error: name 'SentenceTransformer' is not defined
Downloading Multilingual MiniLM-L12 (sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)...
Failed to download Multilingual MiniLM-L12 (sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2). Error: name 'SentenceTransformer' is not defined
Downloading MiniLM-L6 (sentence-transformers/paraphrase-MiniLM-L6-v2)...
Failed to download MiniLM-L6 (sentence-transformers/paraphrase-MiniLM-L6-v2). Error: name 'SentenceTransformer' is not defined
Downloading Multilingual MPNet (sentence-transformers/paraphrase-multilingual-mpnet-base-v2)...
Failed to download Multilingual MPNet (sentence-transformers/paraphrase-multilingual-mpnet-base-v2). Error: name 'SentenceTransformer' is not defined
Downloading DistilRoBERTa (sentence-transformers/all-distilroberta-v1)...
Failed to download DistilRoBERTa (sentence-t

model.safetensors:   3%|3         | 94.4M/2.95G [00:00<?, ?B/s]

Successfully downloaded T5 Large (t5-large)!
Downloading FLAN-T5 Small (google/flan-t5-small)...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Successfully downloaded FLAN-T5 Small (google/flan-t5-small)!
Downloading FLAN-T5 Base (google/flan-t5-base)...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [14]:
# Store embeddings and their visualizations
embedding_results = {}

# Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from IPython.display import display
import umap
from multiprocessing import Process
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Download NLTK resources

In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the CSV files to inspect their contents

In [None]:
current_dir = os.path.join(os.getcwd(), 'data')

In [None]:
program_df = pd.read_csv(os.path.join(current_dir, 'Program.csv'))
program_df['title'] = program_df['title'].str.replace(r'<[^>]*>', '', regex=True)

In [None]:
program_course_df = pd.read_csv(os.path.join(current_dir, 'ProgramCourse.csv'))

In [None]:
program_type_df = pd.read_csv(os.path.join(current_dir, 'ProgramType.csv'))

In [None]:
course_df = pd.read_csv(os.path.join(current_dir, 'Course.csv'))

In [None]:
# Define stopwords
stop_words = set(stopwords.words('french'))

def remove_stopwords(text):
    # Tokenize into words
    words = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic tokens
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

In [None]:
def tokenize_sentences(text):
    sentences = sent_tokenize(text, language='french')
    return ' '.join(sentences)

In [None]:
def preprocess_text(text):
    # Remove stopwords
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    remove_stopwords(text)
    return text.strip()
    return no_stopwords

# Load MiniLM model and tokenizer for generating embeddings

In [None]:
models_to_test = [
    # Sentence-Transformers (general-purpose, multilingual)
    ("sentence-transformers/all-MiniLM-L6-v2", "MiniLM"),
    ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "Multilingual MiniLM-L12"),
    ("sentence-transformers/paraphrase-MiniLM-L6-v2", "MiniLM-L6"),
    ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "Multilingual MPNet"),
    ("sentence-transformers/all-distilroberta-v1", "DistilRoBERTa"),
    ("sentence-transformers/all-MiniLM-L12-v2", "MiniLM-L12"),
    
    # MPNet variants
    ("sentence-transformers/paraphrase-MiniLM-L3-v2", "MiniLM-L3"),

    # Large Transformer Models
    ("bert-base-nli-mean-tokens", "BERT Base NLI"),
    ("bert-large-nli-mean-tokens", "BERT Large NLI"),
    ("roberta-large-nli-stsb-mean-tokens", "RoBERTa Large STS-B"),
    ("roberta-base-nli-stsb-mean-tokens", "RoBERTa Base STS-B"),

    # Multilingual Models
    ("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens", "XLM-RoBERTa Base Multilingual"),
    ("sentence-transformers/stsb-xlm-r-multilingual", "STSB XLM-R Multilingual"),

    # Dense Retrieval and Domain-Specific Models
    ("sentence-transformers/msmarco-distilbert-base-tas-b", "MS MARCO DistilBERT"),
    ("sentence-transformers/msmarco-bert-base-dot-v5", "MS MARCO BERT"),
    ("sentence-transformers/msmarco-mpnet-base-dot-v5", "MS MARCO MPNet"),

    # T5 Variants
    ("t5-small", "T5 Small"),
    ("t5-base", "T5 Base"),
    ("t5-large", "T5 Large"),
    ("google/flan-t5-small", "FLAN-T5 Small"),
    ("google/flan-t5-base", "FLAN-T5 Base"),
    ("google/flan-t5-large", "FLAN-T5 Large"),

    # Newer Sentence-Transformers
    ("sentence-transformers/all-roberta-large-v1", "RoBERTa Large v1"),
    ("sentence-transformers/all-mpnet-base-v2", "MPNet Base v2"),
]


In [None]:
# Store embeddings and their visualizations
embedding_results = {}

In [None]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a Function to Generate Embeddings

In [None]:
def generate_embeddings_with_model(text_list, model_name, model_label):
    """
    Generate embeddings using the specified model, adapted for T5.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    
    model.eval()  # Evaluation mode
    embeddings = []
    
    with torch.no_grad():  # No gradient calculations
        for i in range(0, len(text_list), 32):  # Batch processing
            batch_texts = text_list[i:i+32]
            tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            tokens = {k: v.to(device) for k, v in tokens.items()}
            
            if "t5" in model_name:  # Special handling for T5
                outputs = model.encoder(**tokens)  # Use only the encoder
                batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            else:  # For other models
                outputs = model(**tokens)
                batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            
            embeddings.append(batch_embeddings.cpu())
    
    embeddings = torch.cat(embeddings, dim=0).numpy()
    return embeddings

# Generate Embeddings for Programs

In [None]:
# List of columns to include
columns_to_include = [
    'title', 'code', 'cycle', 'url', 'id'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    program_df[col] = program_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
program_texts = program_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

In [None]:
for model_name, model_label in models_to_test:
    # Generate embeddings for combined texts
    combined_embeddings = generate_embeddings_with_model(program_texts.tolist(), model_name, model_label)
    
    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(combined_embeddings)
    
    # t-SNE
    tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    tsne_result = tsne.fit_transform(combined_embeddings)
    
    # UMAP
    umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    umap_result = umap_projection.fit_transform(combined_embeddings)
    
    # Store results
    embedding_results[model_label] = {
        "pca": pca_result,
        "tsne": tsne_result,
        "umap": umap_result,
        "dataframe": "program",
        "embeddings": combined_embeddings
    }
    
    # Visualize PCA
    fig_pca = px.scatter(
        x=pca_result[:, 0],
        y=pca_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - PCA"
    )
    fig_pca.show()
    
    # Visualize t-SNE
    fig_tsne = px.scatter(
        x=tsne_result[:, 0],
        y=tsne_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - t-SNE"
    )
    fig_tsne.show()
    
    # Visualize UMAP
    fig_umap = px.scatter(
        x=umap_result[:, 0],
        y=umap_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - UMAP"
    )
    fig_umap.show()

# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")

OSError: bert-base-nli-mean-tokens is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

#  Generate Embeddings for Courses

In [None]:
# List of columns to include
columns_to_include = [
    'code', 'title', 'description', 'cycle', 'credits'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    course_df[col] = course_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
course_texts = course_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

In [None]:
for model_name, model_label in models_to_test:
    # Generate embeddings for course texts
    course_embeddings = generate_embeddings_with_model(course_texts.tolist(), model_name, model_label)
    
    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(course_embeddings)
    
    # t-SNE
    tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    tsne_result = tsne.fit_transform(course_embeddings)
    
    # UMAP
    umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    umap_result = umap_projection.fit_transform(course_embeddings)
    
    # Store results
    embedding_results[model_label] = {
        "pca": pca_result,
        "tsne": tsne_result,
        "umap": umap_result,
        "dataframe": "course",
        "embeddings": course_embeddings
    }
    
    # Visualize PCA
    fig_pca = px.scatter(
        x=pca_result[:, 0],
        y=pca_result[:, 1],
        color=course_df['title'],  # Color by course title
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - PCA"
    )
    fig_pca.show()
    
    # Visualize t-SNE
    fig_tsne = px.scatter(
        x=tsne_result[:, 0],
        y=tsne_result[:, 1],
        color=course_df['title'],
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - t-SNE"
    )
    fig_tsne.show()
    
    # Visualize UMAP
    fig_umap = px.scatter(
        x=umap_result[:, 0],
        y=umap_result[:, 1],
        color=course_df['title'],
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - UMAP"
    )
    fig_umap.show()


# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")

# Connect the Programs and Courses using Embeddings

In [None]:
# Ensure consistent data types for merge keys
program_course_df['courseId'] = program_course_df['courseId'].astype(str)
course_df['id'] = course_df['id'].astype(str)

program_course_df['programId'] = program_course_df['programId'].astype(str)
program_df['id'] = program_df['id'].astype(str)


In [None]:

# Merge DataFrames with suffixes to differentiate columns
combined_df = pd.merge(
    program_course_df,
    program_df,
    left_on='programId',
    right_on='id',
    how='left',
    suffixes=('_program_course', '_program')
)

# Print columns after first merge
print("Columns after merging program_course_df and program_df:")
print(combined_df.columns.tolist())


Columns after merging program_course_df and program_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program']


In [None]:

# Merge with course_df
combined_df = pd.merge(
    combined_df,
    course_df,
    left_on='courseId',
    right_on='id',
    how='left',
    suffixes=('', '_course')
)

# Print columns after second merge
print("Columns after merging with course_df:")
print(combined_df.columns.tolist())


Columns after merging with course_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program', 'code_course', 'title_course', 'description', 'credits_course', 'createdAt', 'updatedAt', 'id_course', 'cycle_course', 'vector_course', 'pca-one-course', 'pca-two-course', 'tsne-one-course', 'tsne-two-course', 'umap-one-course', 'umap-two-course']


In [None]:

# Optional: Rename columns for clarity
combined_df.rename(columns={
    'title': 'title_program',
    'code': 'code_program',
    'cycle': 'cycle_program',
    'credits': 'credits_program',
    'horaireCoursPdfJson': 'horaireCoursPdfJson_program',
    'title_course': 'title_course',
    'code_course': 'code_course',
    'cycle_course': 'cycle_course',
    'credits_course': 'credits_course',
    'description': 'description_course'
}, inplace=True)

# Updated list of columns to include in the combined text
columns_to_include = [
    'programId', 'courseId', 'type',
    'title_program', 'code_program', 'cycle_program', 'credits_program', 'horaireCoursPdfJson_program',
    'title_course', 'code_course', 'cycle_course', 'credits_course', 'description_course'
]


In [None]:
# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    combined_df[col] = combined_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each record
combined_texts = combined_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)


In [None]:
for model_name, model_label in models_to_test:
    # Generate embeddings for combined texts
    combined_embeddings = generate_embeddings_with_model(combined_texts.tolist(), model_name, model_label)
    
    # PCA for combined embeddings
    combined_pca = PCA(n_components=2)
    combined_pca_result = combined_pca.fit_transform(combined_embeddings)
    
    # t-SNE for combined embeddings
    combined_tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    combined_tsne_result = combined_tsne.fit_transform(combined_embeddings)
    
    # UMAP for combined embeddings
    combined_umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    combined_umap_result = combined_umap_projection.fit_transform(combined_embeddings)
    
    # Store results in a structured format
    if model_label not in embedding_results:
        embedding_results[model_label] = {}
    
    embedding_results[model_label]['combined'] = {
        "pca": combined_pca_result,
        "tsne": combined_tsne_result,
        "umap": combined_umap_result,
        "dataframe": "combined",
        "embeddings": combined_embeddings
    }
    
    # Visualize PCA for combined embeddings
    fig_combined_pca = px.scatter(
        x=combined_pca_result[:, 0],
        y=combined_pca_result[:, 1],
        color=combined_df['title_program'],  # Color by program title
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - PCA"
    )
    fig_combined_pca.show()
    
    # Visualize t-SNE for combined embeddings
    fig_combined_tsne = px.scatter(
        x=combined_tsne_result[:, 0],
        y=combined_tsne_result[:, 1],
        color=combined_df['title_program'],
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - t-SNE"
    )
    fig_combined_tsne.show()
    
    # Visualize UMAP for combined embeddings
    fig_combined_umap = px.scatter(
        x=combined_umap_result[:, 0],
        y=combined_umap_result[:, 1],
        color=combined_df['title_program'],
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - UMAP"
    )
    fig_combined_umap.show()

# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")

In [15]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a Function to Generate Embeddings

In [16]:
def generate_embeddings_with_model(text_list, model_name, model_label):
    """
    Generate embeddings using the specified model, adapted for T5.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    
    model.eval()  # Evaluation mode
    embeddings = []
    
    with torch.no_grad():  # No gradient calculations
        for i in range(0, len(text_list), 32):  # Batch processing
            batch_texts = text_list[i:i+32]
            tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            tokens = {k: v.to(device) for k, v in tokens.items()}
            
            if "t5" in model_name:  # Special handling for T5
                outputs = model.encoder(**tokens)  # Use only the encoder
                batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            else:  # For other models
                outputs = model(**tokens)
                batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            
            embeddings.append(batch_embeddings.cpu())
    
    embeddings = torch.cat(embeddings, dim=0).numpy()
    return embeddings

# Generate Embeddings for Programs

In [17]:
# List of columns to include
columns_to_include = [
    'title', 'code', 'cycle', 'url', 'id'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    program_df[col] = program_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
program_texts = program_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

In [18]:
for model_name, model_label in models_to_test:
    # Generate embeddings for combined texts
    combined_embeddings = generate_embeddings_with_model(program_texts.tolist(), model_name, model_label)
    
    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(combined_embeddings)
    
    # t-SNE
    tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    tsne_result = tsne.fit_transform(combined_embeddings)
    
    # UMAP
    umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    umap_result = umap_projection.fit_transform(combined_embeddings)
    
    # Store results
    embedding_results[model_label] = {
        "pca": pca_result,
        "tsne": tsne_result,
        "umap": umap_result,
        "dataframe": "program",
        "embeddings": combined_embeddings
    }
    
    # Visualize PCA
    fig_pca = px.scatter(
        x=pca_result[:, 0],
        y=pca_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - PCA"
    )
    fig_pca.show()
    
    # Visualize t-SNE
    fig_tsne = px.scatter(
        x=tsne_result[:, 0],
        y=tsne_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - t-SNE"
    )
    fig_tsne.show()
    
    # Visualize UMAP
    fig_umap = px.scatter(
        x=umap_result[:, 0],
        y=umap_result[:, 1],
        color=program_df['title'],
        hover_data={'code': program_df['code'], 'cycle': program_df['cycle']},
        title=f"{model_label} Embeddings - UMAP"
    )
    fig_umap.show()

# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")

OSError: bert-base-nli-mean-tokens is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

#  Generate Embeddings for Courses

In [20]:
# List of columns to include
columns_to_include = [
    'code', 'title', 'description', 'cycle', 'credits'
]

# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    course_df[col] = course_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each program
course_texts = course_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

In [None]:
for model_name, model_label in models_to_test:
    # Generate embeddings for course texts
    course_embeddings = generate_embeddings_with_model(course_texts.tolist(), model_name, model_label)
    
    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(course_embeddings)
    
    # t-SNE
    tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    tsne_result = tsne.fit_transform(course_embeddings)
    
    # UMAP
    umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    umap_result = umap_projection.fit_transform(course_embeddings)
    
    # Store results
    embedding_results[model_label] = {
        "pca": pca_result,
        "tsne": tsne_result,
        "umap": umap_result,
        "dataframe": "course",
        "embeddings": course_embeddings
    }
    
    # Visualize PCA
    fig_pca = px.scatter(
        x=pca_result[:, 0],
        y=pca_result[:, 1],
        color=course_df['title'],  # Color by course title
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - PCA"
    )
    fig_pca.show()
    
    # Visualize t-SNE
    fig_tsne = px.scatter(
        x=tsne_result[:, 0],
        y=tsne_result[:, 1],
        color=course_df['title'],
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - t-SNE"
    )
    fig_tsne.show()
    
    # Visualize UMAP
    fig_umap = px.scatter(
        x=umap_result[:, 0],
        y=umap_result[:, 1],
        color=course_df['title'],
        hover_data={'code': course_df['code'], 'cycle': course_df['cycle']},
        title=f"{model_label} Course Embeddings - UMAP"
    )
    fig_umap.show()


# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")

# Connect the Programs and Courses using Embeddings

In [24]:
# Ensure consistent data types for merge keys
program_course_df['courseId'] = program_course_df['courseId'].astype(str)
course_df['id'] = course_df['id'].astype(str)

program_course_df['programId'] = program_course_df['programId'].astype(str)
program_df['id'] = program_df['id'].astype(str)


In [25]:

# Merge DataFrames with suffixes to differentiate columns
combined_df = pd.merge(
    program_course_df,
    program_df,
    left_on='programId',
    right_on='id',
    how='left',
    suffixes=('_program_course', '_program')
)

# Print columns after first merge
print("Columns after merging program_course_df and program_df:")
print(combined_df.columns.tolist())


Columns after merging program_course_df and program_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program']


In [26]:

# Merge with course_df
combined_df = pd.merge(
    combined_df,
    course_df,
    left_on='courseId',
    right_on='id',
    how='left',
    suffixes=('', '_course')
)

# Print columns after second merge
print("Columns after merging with course_df:")
print(combined_df.columns.tolist())


Columns after merging with course_df:
['createdAt_program_course', 'updatedAt_program_course', 'typicalSessionIndex', 'courseId', 'programId', 'type', 'code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt_program', 'updatedAt_program', 'title', 'url', 'cycle', 'id', 'vector', 'pca-one-program', 'pca-two-program', 'tsne-one-program', 'tsne-two-program', 'umap-one-program', 'umap-two-program', 'code_course', 'title_course', 'description', 'credits_course', 'createdAt', 'updatedAt', 'id_course', 'cycle_course', 'vector_course', 'pca-one-course', 'pca-two-course', 'tsne-one-course', 'tsne-two-course', 'umap-one-course', 'umap-two-course']


In [27]:

# Optional: Rename columns for clarity
combined_df.rename(columns={
    'title': 'title_program',
    'code': 'code_program',
    'cycle': 'cycle_program',
    'credits': 'credits_program',
    'horaireCoursPdfJson': 'horaireCoursPdfJson_program',
    'title_course': 'title_course',
    'code_course': 'code_course',
    'cycle_course': 'cycle_course',
    'credits_course': 'credits_course',
    'description': 'description_course'
}, inplace=True)

# Updated list of columns to include in the combined text
columns_to_include = [
    'programId', 'courseId', 'type',
    'title_program', 'code_program', 'cycle_program', 'credits_program', 'horaireCoursPdfJson_program',
    'title_course', 'code_course', 'cycle_course', 'credits_course', 'description_course'
]


In [28]:
# Ensure all columns are strings and handle NaN values
for col in columns_to_include:
    combined_df[col] = combined_df[col].astype(str).fillna('')

# Concatenate the columns into a single string for each record
combined_texts = combined_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)


In [None]:
for model_name, model_label in models_to_test:
    # Generate embeddings for combined texts
    combined_embeddings = generate_embeddings_with_model(combined_texts.tolist(), model_name, model_label)
    
    # PCA for combined embeddings
    combined_pca = PCA(n_components=2)
    combined_pca_result = combined_pca.fit_transform(combined_embeddings)
    
    # t-SNE for combined embeddings
    combined_tsne = TSNE(n_components=2, perplexity=30, max_iter=1000)
    combined_tsne_result = combined_tsne.fit_transform(combined_embeddings)
    
    # UMAP for combined embeddings
    combined_umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    combined_umap_result = combined_umap_projection.fit_transform(combined_embeddings)
    
    # Store results in a structured format
    if model_label not in embedding_results:
        embedding_results[model_label] = {}
    
    embedding_results[model_label]['combined'] = {
        "pca": combined_pca_result,
        "tsne": combined_tsne_result,
        "umap": combined_umap_result,
        "dataframe": "combined",
        "embeddings": combined_embeddings
    }
    
    # Visualize PCA for combined embeddings
    fig_combined_pca = px.scatter(
        x=combined_pca_result[:, 0],
        y=combined_pca_result[:, 1],
        color=combined_df['title_program'],  # Color by program title
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - PCA"
    )
    fig_combined_pca.show()
    
    # Visualize t-SNE for combined embeddings
    fig_combined_tsne = px.scatter(
        x=combined_tsne_result[:, 0],
        y=combined_tsne_result[:, 1],
        color=combined_df['title_program'],
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - t-SNE"
    )
    fig_combined_tsne.show()
    
    # Visualize UMAP for combined embeddings
    fig_combined_umap = px.scatter(
        x=combined_umap_result[:, 0],
        y=combined_umap_result[:, 1],
        color=combined_df['title_program'],
        hover_data={'title_course': combined_df['title_course'], 'cycle_course': combined_df['cycle_course']},
        title=f"{model_label} Combined Embeddings - UMAP"
    )
    fig_combined_umap.show()

# Evaluate Silhouette Score, Number of Clusters, and Inertia
for model_name, model_label in models_to_test:
    embeddings = embedding_results[model_label]['embeddings']
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(embeddings)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f"Silhouette Score for {model_label}: {silhouette_avg}")
    print(f"Number of Clusters for {model_label}: {kmeans.n_clusters}")
    print(f"Inertia for {model_label}: {kmeans.inertia_}")

# Summary
print("Embeddings generated and visualized for the following models:")
for model_name, model_label in models_to_test:
    print(f"- {model_label} ({model_name})")