In [None]:
# Use the %pip magic so the upgrade targets the environment of the running kernel
# (a bare `!pip` runs whichever pip is first on the shell PATH).
%pip install --upgrade pip

In [None]:
# Install the notebook's dependencies into the running kernel's environment (%pip, not !pip).
%pip install -U torch numpy==1.26 pandas scikit-learn plotly nltk transformers==4.46.3 sentence-transformers einops datasets gradio networkx umap-learn ipywidgets

# Import Libraries

In [None]:
import os
import re
from multiprocessing import Process

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import umap
from IPython.display import display
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel

# Download NLTK resources

In [None]:
# Fetch the NLTK tokenizer models and stopword lists used by the text helpers below.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load the CSV files to inspect their contents

In [None]:
# The CSV inputs are read from a 'data' folder under the kernel's working directory.
working_dir = os.getcwd()
current_dir = os.path.join(working_dir, 'data')

In [None]:
# Load the program table and strip HTML tags (anything between '<' and '>') from the titles.
program_df = (
    pd.read_csv(os.path.join(current_dir, 'Program.csv'))
    .assign(title=lambda frame: frame['title'].str.replace(r'<[^>]*>', '', regex=True))
)

In [None]:
# Program-to-course association rows (joined on programId / courseId further down).
program_course_path = os.path.join(current_dir, 'ProgramCourse.csv')
program_course_df = pd.read_csv(program_course_path)

In [None]:
# Program type table; loaded for inspection.
program_type_path = os.path.join(current_dir, 'ProgramType.csv')
program_type_df = pd.read_csv(program_type_path)

In [None]:
# Course catalogue: one row per course.
course_path = os.path.join(current_dir, 'Course.csv')
course_df = pd.read_csv(course_path)

In [None]:
# French stopword list from NLTK, kept as a set for fast membership tests.
stop_words = set(stopwords.words('french'))

def remove_stopwords(text):
    """
    Lowercase a text and drop French stopwords and non-alphabetic tokens.

    Args:
        text (str): Raw input text. Non-string values (e.g. NaN coming from
            a pandas column) are treated as empty text.

    Returns:
        str: The remaining lowercase words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # Tokenize with the French Punkt model so tokenization matches the French
    # stopword list (the default model is English).
    words = word_tokenize(text.lower(), language='french')
    # Keep purely alphabetic tokens that are not stopwords
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

In [None]:
def tokenize_sentences(text):
    """Split ``text`` into sentences with the French tokenizer and re-join them with single spaces."""
    return ' '.join(sent_tokenize(text, language='french'))

In [None]:
def preprocess_text(text):
    """
    Normalize a text: lowercase it, strip punctuation, and remove stopwords.

    Args:
        text (str): Raw input text. Non-string values are treated as empty text.

    Returns:
        str: The cleaned text as returned by ``remove_stopwords``, stripped of
        surrounding whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Drop every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    # Keep the filtered result; calling remove_stopwords without using its
    # return value leaves the stopwords in place.
    no_stopwords = remove_stopwords(text)
    return no_stopwords.strip()

# Load MiniLM model and tokenizer for generating embeddings

In [None]:
# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Define a Function to Generate Embeddings

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.preprocessing import normalize

# 'punkt' is also fetched by the NLTK download cell earlier in the notebook; repeated here.
nltk.download('punkt')  # Ensure sentence tokenizer is available

def _pool_token_states(hidden_states, attention_mask, pooling):
    """Pool token states into one vector per sequence, ignoring padded positions."""
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)

    def masked_mean():
        # Divide by the number of real tokens, not by the padded sequence length.
        return (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    def masked_max():
        # Padded positions are set to -inf so they can never win the max.
        return hidden_states.masked_fill(mask == 0, float('-inf')).max(dim=1).values

    if pooling == 'mean':
        return masked_mean()
    if pooling == 'max':
        return masked_max()
    return torch.cat((masked_mean(), masked_max()), dim=1)


def generate_embeddings(
    text_list, tokenizer, model, device=None, batch_size=32, pooling='mean', preprocess_fn=None
):
    """
    Generate L2-normalized embeddings for a list of texts using the specified pooling strategy.

    Texts longer than 512 whitespace-separated words are split into sentences;
    each sentence is embedded on its own and the sentence vectors are averaged.
    Pooling uses the attention mask, so padding added when several sequences
    are batched together does not affect the result.

    Args:
        text_list (list of str): List of input texts.
        tokenizer (transformers.AutoTokenizer): Tokenizer corresponding to the model.
        model (transformers.AutoModel): Pre-trained language model.
        device (torch.device, optional): Device the tokenized inputs are moved to.
            Defaults to None, meaning "the device the model's parameters are on"
            (CPU if that cannot be determined).
        batch_size (int, optional): Number of sequences sent through the model per
            forward pass. Defaults to 32.
        pooling (str, optional): Pooling strategy ('mean', 'max', 'concat'). Defaults to 'mean'.
        preprocess_fn (callable, optional): Preprocessing function for text. Defaults to None.

    Returns:
        np.ndarray: One row per input text. An empty input yields an array with zero rows.

    Raises:
        ValueError: If ``pooling`` is not one of 'mean', 'max', 'concat'.
    """
    if pooling not in ['mean', 'max', 'concat']:
        raise ValueError(f"Unsupported pooling type '{pooling}'. Choose from 'mean', 'max', 'concat'.")

    # Follow the model's device by default so inputs and weights never end up
    # on different devices.
    if device is None:
        try:
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            device = torch.device('cpu')

    model.eval()  # Set model to evaluation mode

    # Preprocess text if a function is provided
    if preprocess_fn:
        text_list = [preprocess_fn(text) for text in text_list]
    else:
        text_list = list(text_list)

    # Nothing to embed: return early instead of failing on an empty concatenation.
    if not text_list:
        return np.empty((0, 0), dtype=np.float32)

    # Flatten every text into the segments that are actually embedded and
    # remember how many segments belong to each text.
    segments = []
    segment_counts = []
    for text in text_list:
        parts = sent_tokenize(text) if len(text.split()) > 512 else [text]
        if not parts:
            parts = [text]
        segments.extend(parts)
        segment_counts.append(len(parts))

    pooled_chunks = []
    with torch.no_grad():  # Disable gradient calculation
        for start in range(0, len(segments), batch_size):
            chunk = segments[start:start + batch_size]
            tokens = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=512)
            tokens = {k: v.to(device) for k, v in tokens.items()}
            hidden_states = model(**tokens).last_hidden_state

            attention_mask = tokens.get('attention_mask')
            if attention_mask is None:
                # No mask supplied by the tokenizer: treat every position as a real token.
                attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device)

            pooled_chunks.append(_pool_token_states(hidden_states, attention_mask, pooling).cpu())

    segment_embeddings = torch.cat(pooled_chunks, dim=0)

    # Average the segment vectors that belong to the same input text
    text_embeddings = torch.stack(
        [group.mean(dim=0) for group in segment_embeddings.split(segment_counts)]
    )

    # L2-normalize each row
    embeddings = torch.nn.functional.normalize(text_embeddings, p=2, dim=1).numpy()

    print(f"Generated embeddings with '{pooling}' pooling. Embedding dimension: {embeddings.shape[1]}")

    return embeddings

# Generate Embeddings for Programs

In [None]:
# Columns whose values are concatenated into the text that gets embedded
columns_to_include = [
    'title', 'code', 'cycle', 'url', 'id'
]

# Convert every value to a string, mapping missing values to ''.
# (Casting with astype(str) first would turn NaN into the literal text 'nan',
# which a later fillna('') can no longer catch.)
for col in columns_to_include:
    program_df[col] = program_df[col].map(lambda value: '' if pd.isna(value) else str(value))

# Concatenate the columns into a single string for each program
program_texts = program_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts; pass the device the model was
# moved to so inputs and weights live on the same device
program_embeddings = generate_embeddings(program_texts.tolist(), tokenizer, model, device=device)

# Add embeddings to program_df
program_df['vector'] = list(program_embeddings)


# Program Embeddings using PCA

In [None]:
# Project the program embeddings onto two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
program_pca_result = pca.fit_transform(program_embeddings)
for axis, column in enumerate(('pca-one-program', 'pca-two-program')):
    program_df[column] = program_pca_result[:, axis]

# Program Embeddings using t-SNE

In [None]:
# Two-dimensional t-SNE layout of the program embeddings (seeded).
tsne_settings = dict(n_components=2, perplexity=30, max_iter=1000, random_state=42)
tsne = TSNE(**tsne_settings)
program_tsne_result = tsne.fit_transform(program_embeddings)
for axis, column in enumerate(('tsne-one-program', 'tsne-two-program')):
    program_df[column] = program_tsne_result[:, axis]

# Program Embeddings using UMAP

In [None]:
# Initialize UMAP with desired parameters; the fixed seed makes the layout
# reproducible across runs, matching the seeded PCA and t-SNE projections
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the program embeddings
program_umap_result = umap_projection.fit_transform(program_embeddings)

# Add UMAP results to the program DataFrame
program_df['umap-one-program'] = program_umap_result[:, 0]
program_df['umap-two-program'] = program_umap_result[:, 1]

# Generate Embeddings for Courses

In [None]:
# Columns whose values are concatenated into the text that gets embedded
columns_to_include = [
    'code', 'title', 'description', 'cycle', 'credits'
]

# Convert every value to a string, mapping missing values to ''.
# (Casting with astype(str) first would turn NaN into the literal text 'nan',
# which a later fillna('') can no longer catch.)
for col in columns_to_include:
    course_df[col] = course_df[col].map(lambda value: '' if pd.isna(value) else str(value))

# Concatenate the columns into a single string for each course
course_texts = course_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts; pass the device the model was
# moved to so inputs and weights live on the same device
course_embeddings = generate_embeddings(course_texts.tolist(), tokenizer, model, device=device)

# Add embeddings to course_df
course_df['vector'] = list(course_embeddings)

# Course vectors stacked into a single 2-D array
vectors = np.stack(course_df['vector'].values)

# Course Embeddings using PCA

In [None]:
# Project the course embeddings onto two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
course_pca_result = pca.fit_transform(course_embeddings)
for axis, column in enumerate(('pca-one-course', 'pca-two-course')):
    course_df[column] = course_pca_result[:, axis]


# Course Embeddings using t-SNE

In [None]:
# Two-dimensional t-SNE layout of the course embeddings (seeded).
tsne_settings = dict(n_components=2, perplexity=30, max_iter=1000, random_state=42)
tsne = TSNE(**tsne_settings)
course_tsne_result = tsne.fit_transform(course_embeddings)
for axis, column in enumerate(('tsne-one-course', 'tsne-two-course')):
    course_df[column] = course_tsne_result[:, axis]


# Course Embeddings using UMAP

In [None]:
# Initialize UMAP with desired parameters; the fixed seed makes the layout
# reproducible across runs, matching the seeded PCA and t-SNE projections
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the course embeddings
course_umap_result = umap_projection.fit_transform(course_embeddings)

# Add UMAP results to the course DataFrame
course_df['umap-one-course'] = course_umap_result[:, 0]
course_df['umap-two-course'] = course_umap_result[:, 1]


# Connect the Programs and Courses using Embeddings

In [None]:
# Cast both sides of each join key to str so the merges below compare like with like.
merge_keys = (
    (program_course_df, 'courseId'),
    (course_df, 'id'),
    (program_course_df, 'programId'),
    (program_df, 'id'),
)
for frame, key in merge_keys:
    frame[key] = frame[key].astype(str)

In [None]:
# Attach program attributes to every program/course link row.
# Column names present on both sides get the '_program_course' / '_program' suffixes.
combined_df = program_course_df.merge(
    program_df,
    how='left',
    left_on='programId',
    right_on='id',
    suffixes=('_program_course', '_program'),
)

# Show the resulting column names
print("Columns after merging program_course_df and program_df:")
merged_columns = combined_df.columns.tolist()
print(merged_columns)


In [None]:

# Attach course attributes; course columns that clash with existing ones get '_course' appended.
combined_df = combined_df.merge(
    course_df,
    how='left',
    left_on='courseId',
    right_on='id',
    suffixes=('', '_course'),
)

# Show the resulting column names
print("Columns after merging with course_df:")
merged_columns = combined_df.columns.tolist()
print(merged_columns)


In [None]:
# Give the un-suffixed columns explicit names. Columns that already carry the
# '_course' suffix keep their names, so they need no entry here.
column_renames = {
    'title': 'title_program',
    'code': 'code_program',
    'cycle': 'cycle_program',
    'credits': 'credits_program',
    'horaireCoursPdfJson': 'horaireCoursPdfJson_program',
    'description': 'description_course',
}
combined_df = combined_df.rename(columns=column_renames)

# Columns concatenated into the text embedded for each program/course pair
link_columns = ['programId', 'courseId', 'type']
program_columns = [
    'title_program', 'code_program', 'cycle_program', 'credits_program', 'horaireCoursPdfJson_program'
]
course_columns = ['title_course', 'code_course', 'cycle_course', 'credits_course', 'description_course']
columns_to_include = link_columns + program_columns + course_columns


In [None]:
# Convert every value to a string, mapping missing values to ''.
# (Casting with astype(str) first would turn NaN into the literal text 'nan',
# which a later fillna('') can no longer catch; the left merges above can
# introduce such missing values.)
for col in columns_to_include:
    combined_df[col] = combined_df[col].map(lambda value: '' if pd.isna(value) else str(value))

# Concatenate the columns into a single string for each record
combined_texts = combined_df[columns_to_include].apply(lambda x: ' '.join(x), axis=1)

# Generate embeddings for the combined texts; pass the device the model was
# moved to so inputs and weights live on the same device
combined_embeddings = generate_embeddings(combined_texts.tolist(), tokenizer, model, device=device)

# Add embeddings to combined_df
combined_df['vector'] = list(combined_embeddings)


In [None]:
# Project the combined program/course embeddings onto two principal components.
pca = PCA(n_components=2, random_state=42)
combined_pca_result = pca.fit_transform(combined_embeddings)
for axis, column in enumerate(('pca-one-combined', 'pca-two-combined')):
    combined_df[column] = combined_pca_result[:, axis]

In [None]:
# Two-dimensional t-SNE layout of the combined embeddings (seeded, perplexity 50).
tsne_settings = dict(n_components=2, perplexity=50, max_iter=1000, random_state=42)
tsne = TSNE(**tsne_settings)
combined_tsne_result = tsne.fit_transform(combined_embeddings)
for axis, column in enumerate(('tsne-one-combined', 'tsne-two-combined')):
    combined_df[column] = combined_tsne_result[:, axis]


In [None]:
# Check the number of unique programs
num_unique_programs = combined_df['title_program'].nunique()
print(f"Number of unique programs: {num_unique_programs}")

# Initialize UMAP with desired parameters; the fixed seed makes the layout
# reproducible across runs, matching the seeded PCA and t-SNE projections
umap_projection = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Fit and transform the combined embeddings
combined_umap_result = umap_projection.fit_transform(combined_embeddings)

# Add UMAP results to the DataFrame
combined_df['umap-one-combined'] = combined_umap_result[:, 0]
combined_df['umap-two-combined'] = combined_umap_result[:, 1]


In [None]:
def recommend_courses(user_completed_courses, user_liked_courses, program_id, combined_df, model_label, top_n=5):
    """
    Recommend courses for a user based on completed and liked courses.

    The user's profile is the mean of the vectors of the completed and liked
    courses found in the given program. Every course of that program is then
    ranked by cosine similarity to the profile, and completed courses are
    removed from the ranking.

    Args:
        user_completed_courses (list): List of course IDs the user has completed.
            ``None`` is treated as an empty list.
        user_liked_courses (list): List of course IDs the user likes or is interested in.
            ``None`` is treated as an empty list.
        program_id (str): The program ID to restrict recommendations.
        combined_df (pd.DataFrame): DataFrame with the columns 'programId',
            'courseId', 'title_course' and 'vector'.
        model_label (str): Accepted for interface compatibility; not used by the
            current implementation.
        top_n (int): Number of recommendations to return.

    Returns:
        list: Up to ``top_n`` dicts with the keys 'courseId', 'title' and
        'similarity', most similar first. Empty when the program has no rows or
        when none of the completed/liked courses belong to the program.
    """
    completed_ids = list(user_completed_courses) if user_completed_courses is not None else []
    liked_ids = list(user_liked_courses) if user_liked_courses is not None else []

    # Filter courses in the user's program
    program_courses = combined_df[combined_df['programId'] == program_id]
    if program_courses.empty:
        # Unknown program: nothing to rank (stacking zero vectors would raise).
        return []

    # Vectors of the completed and liked courses, in that order; a course that
    # is both completed and liked contributes twice to the average.
    is_completed = program_courses['courseId'].isin(completed_ids)
    is_liked = program_courses['courseId'].isin(liked_ids)
    profile_vectors = (
        program_courses.loc[is_completed, 'vector'].tolist()
        + program_courses.loc[is_liked, 'vector'].tolist()
    )
    if not profile_vectors:
        # No profile can be built without at least one known course.
        return []

    # Compute the user's profile embedding as the average of completed and liked embeddings
    profile_embedding = np.mean(np.stack(profile_vectors), axis=0)

    # Compute similarity between profile embedding and program course embeddings
    program_course_embeddings = np.stack(program_courses['vector'].values)
    similarities = cosine_similarity(profile_embedding.reshape(1, -1), program_course_embeddings).flatten()

    # Create a DataFrame to store recommendations
    recommendation_df = pd.DataFrame({
        'courseId': program_courses['courseId'].tolist(),
        'title': program_courses['title_course'].tolist(),
        'similarity': similarities
    })

    # Exclude completed courses
    recommendation_df = recommendation_df[~recommendation_df['courseId'].isin(completed_ids)]

    # Sort by similarity and select top N recommendations
    recommendation_df = recommendation_df.sort_values(by='similarity', ascending=False).head(top_n)

    # Convert to list of dictionaries for output
    return recommendation_df.to_dict('records')

# Example usage with placeholder identifiers; substitute real IDs from the data.
user_completed = ['C101', 'C102', 'C103']  # courses already completed
user_liked = ['C104']  # courses the user is interested in
program_id = 'P101'  # the user's program

# Ask for the five closest courses
recommendations = recommend_courses(
    user_completed,
    user_liked,
    program_id,
    combined_df,
    model_label='MiniLM',
    top_n=5,
)

# Report one line per recommendation
print("Recommended Courses:")
for rec in recommendations:
    print(f"- {rec['title']} (Course ID: {rec['courseId']}, Similarity: {rec['similarity']:.4f})")