# Install necessary packages

In [12]:
!pip install --upgrade pip



In [13]:
!pip install torch numpy pandas scikit-learn plotly nltk transformers sentence-transformers einops datasets gradio networkx umap-learn



# Import Libraries

In [14]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from IPython.display import display
import umap

# Download NLTK resources

In [15]:
# Fetch the NLTK data packages used further down in this notebook:
#   - 'punkt' / 'punkt_tab': tokenizer data, presumably what word_tokenize and
#     sent_tokenize load (which of the two is needed likely depends on the
#     installed NLTK version — TODO confirm)
#   - 'stopwords': the word lists read through stopwords.words(...)
# The last call stays last because its return value is the cell's output.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# French stopword list from NLTK, converted to a set because
# remove_stopwords tests membership in it once per token.
stop_words = set(stopwords.words('french'))

def remove_stopwords(text):
    """Lowercase ``text`` and drop French stopwords and non-alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw text to clean. Non-string values (for example ``None`` or the
        ``NaN`` pandas uses for missing cells) are treated as empty text.

    Returns
    -------
    str
        The remaining lowercased words joined by single spaces; ``''`` when
        nothing is left or when ``text`` is not a string.
    """
    # Missing values have no .lower(); return empty text instead of raising.
    if not isinstance(text, str):
        return ''
    # Tokenize with the French model so it matches the French stopword list
    # and tokenize_sentences (word_tokenize defaults to English otherwise).
    words = word_tokenize(text.lower(), language='french')
    # Keep purely alphabetic tokens that are not stopwords.
    # NOTE(review): a token containing an apostrophe (e.g. an elided form such
    # as "l'étudiante") fails isalpha() and is dropped whole if the tokenizer
    # does not split it — confirm this is acceptable for French text.
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

In [17]:
def tokenize_sentences(text):
    """Split ``text`` into sentences with the French tokenizer and rejoin them.

    Returns a single string in which the detected sentences are separated by
    one space each.
    """
    return ' '.join(sent_tokenize(text, language='french'))

In [18]:
def preprocess_text(text):
    """Run the text preprocessing steps on ``text`` and return the result.

    The only step at present is stopword removal via ``remove_stopwords``.
    """
    return remove_stopwords(text)

# Load MiniLM model and tokenizer for generating embeddings

In [19]:
# Hugging Face model id, named once so the tokenizer and the model are always
# loaded from the same checkpoint.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Alternative checkpoint tried earlier: "nvidia/NV-Embed-v2", which needs
# trust_remote_code=True in both from_pretrained calls.
# NOTE(review): the model is not moved to a GPU here; the earlier
# model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
# step was disabled. If it is re-enabled, the tokenized inputs presumably
# have to be moved to the same device — confirm against the embedding code.

# Load the CSV files to inspect their contents

In [20]:
# Directory holding the CSV exports, resolved relative to the working
# directory the kernel was started in. os.path.join builds the path with the
# platform's separator instead of a hard-coded '/'.
current_dir = os.path.join(os.getcwd(), 'notebook', 'data')

In [21]:
def _read_table(filename):
    """Read one CSV file located in ``current_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(current_dir, filename))


# One DataFrame per CSV export, loaded in the same order as before.
program_df = _read_table('Program.csv')
program_course_df = _read_table('ProgramCourse.csv')
program_type_df = _read_table('ProgramType.csv')
course_df = _read_table('Course.csv')

# Inspect the DataFrames

In [22]:
# Preview the first five rows of the Program table.
print("Program DataFrame Head:")
display(program_df.head(n=5))

Program DataFrame Head:


Unnamed: 0,code,credits,horaireCoursPdfJson,planificationPdfJson,createdAt,updatedAt,title,url,cycle,id
0,648,15 crédits,,,2024-10-17 04:32:25.190,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en géni...,https://www.etsmtl.ca/programmes-formations/pr...,2,183146
1,569,15 crédits,,,2024-10-17 04:32:25.191,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en géni...,https://www.etsmtl.ca/programmes-formations/pr...,2,183156
2,514,15 crédits,,,2024-10-17 04:32:25.191,2024-11-02 01:50:20.409,Programme court de 2<sup>e</sup> cycle en gest...,https://www.etsmtl.ca/programmes-formations/pr...,2,183236
3,3294,30 crédits,,,2024-10-17 04:32:25.184,2024-11-02 01:50:20.408,DESS en projets internationaux et ingénierie g...,https://www.etsmtl.ca/programmes-formations/de...,2,183498
4,6646,"90 crédits, incluant 9 crédits de stage",,,2024-10-17 04:32:25.184,2024-11-02 01:50:20.408,Baccalauréat en informatique distribuée,https://www.etsmtl.ca/programmes-formations/ba...,1,182928


In [23]:
# Preview the first five rows of the ProgramCourse table.
print("\nProgramCourse DataFrame Head:")
display(program_course_df.head(n=5))


ProgramCourse DataFrame Head:


Unnamed: 0,createdAt,updatedAt,typicalSessionIndex,courseId,programId,type
0,2024-11-02 01:50:21.887,2024-11-02 01:50:21.887,1,351029,183256,
1,2024-11-02 01:50:21.913,2024-11-02 01:50:21.913,3,353404,183040,TRONC
2,2024-11-02 01:50:21.958,2024-11-02 01:50:21.958,4,352245,182976,TRONC
3,2024-11-02 01:50:21.973,2024-11-02 01:50:21.973,4,351827,182976,TRONC
4,2024-11-02 01:50:21.981,2024-11-02 01:50:21.981,5,353458,182976,TRONC


In [24]:
# Preview the first five rows of the ProgramType table.
print("\nProgramType DataFrame Head:")
display(program_type_df.head(n=5))


ProgramType DataFrame Head:


Unnamed: 0,id,title
0,697435,Maîtrise avec projet
1,738239,Microprogramme
2,697451,Maîtrise avec mémoire
3,915770,Concentration en technologies de la santé
4,697388,Doctorat


In [25]:
# Preview the first five rows of the Course table.
print("\nCourse DataFrame Head:")
display(course_df.head(n=5))


Course DataFrame Head:


Unnamed: 0,code,title,description,credits,createdAt,updatedAt,id,cycle
0,ATE800E,Academic Integrity : Concepts and Techniques,The ATE800 workshop must be passed in the firs...,0,2024-11-02 01:50:21.499,2024-11-02 01:50:21.569,407641,2
1,ELE735,Analyse numérique,"Au terme de ce cours, l'étudiante ou l'étudian...",3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,350543,1
2,MTI850,Analytiques des données massives,Ce cours présente les concepts pour effectuer ...,3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,353344,2
3,CHM015,Chimie préparatoire pour le génie (hors progra...,Ce cours vise à initier l’étudiante ou l'étudi...,3,2024-11-02 01:50:21.503,2024-11-02 01:50:21.773,349708,1
4,ELE752,Appareillage électrique,"Au terme de ce cours, l'étudiante ou l'étudian...",3,2024-11-02 01:50:21.502,2024-11-02 01:50:21.773,350599,1


In [26]:
# List the column names of every loaded table, one table per output line.
print("\nDataFrame Columns:")
print(
    "\n".join(
        f"{label} {frame.columns.tolist()}"
        for label, frame in (
            ("Program:", program_df),
            ("ProgramCourse:", program_course_df),
            ("ProgramType:", program_type_df),
            ("Course:", course_df),
        )
    )
)


DataFrame Columns:
Program: ['code', 'credits', 'horaireCoursPdfJson', 'planificationPdfJson', 'createdAt', 'updatedAt', 'title', 'url', 'cycle', 'id']
ProgramCourse: ['createdAt', 'updatedAt', 'typicalSessionIndex', 'courseId', 'programId', 'type']
ProgramType: ['id', 'title']
Course: ['code', 'title', 'description', 'credits', 'createdAt', 'updatedAt', 'id', 'cycle']


# How many courses are in the dataset?

In [27]:
# Row count of the Course table (rows are not de-duplicated at this point).
num_courses = len(course_df)
print(f"Number of courses in the dataset: {num_courses}")

Number of courses in the dataset: 907


# How many unique courses are in the dataset?