1. Calculate the similarity between each pair of course descriptions with zero-shot learning, using the `all-mpnet-base-v2` model.
2. When zero-shot learning fails, few-shot learning is applied. For two courses to be similar enough for reusability purposes, they should overlap in all of the following:
   1. Subject area
   2. Level of difficulty (based on the cognitive level of Bloom's taxonomy)
   3. Tools and technologies used
   4. Type of data used
3. Training pairs are extracted from MIT's OpenCourseWare descriptions coupled with MIT's course syllabus's description for the same course (the two are not identical).
4. Further training and test pairs were generated by data augmentation techniques such as paraphrasing or altering the above-mentioned features in the course descriptions to create dissimilar pairs.

In [None]:
%%capture
# Install the third-party packages this notebook imports; %%capture keeps pip's output out of the notebook.
%pip install datasets
%pip install sentence_transformers
# NOTE(review): the installs above are unpinned, so a fresh environment may resolve to
# different releases than the ones that produced the recorded outputs.
#conda install accelerate=0.30.0

In [21]:
# environment: FAIR-OER
# libraries
import os
import csv
import pandas as pd
import random
from importlib import metadata
from datetime import datetime
import pytz
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pyarrow
import pyarrow.parquet
from datasets import load_dataset
from datasets import Features, Value
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
import ast
import numpy as np
import torch
from torch.optim import AdamW
from transformers import AdamW
from transformers import AutoTokenizer, AutoModel
from transformers import EarlyStoppingCallback
from scipy.spatial.distance import cosine
import torch.nn.functional as F

In [2]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ok else "cpu")

# Report the choice (and the GPU model when one is used).
if not _cuda_ok:
    print("GPU is not available. Using CPU.")
else:
    print("GPU is available. Using:", torch.cuda.get_device_name(0))


GPU is available. Using: NVIDIA GeForce RTX 4090


In [4]:
# constant variables
# CSV file that metadata_log() appends its provenance records to
metadata_dir = "/home/jovyan/Publication/fair_oer/05-Documentation/metadata.csv"
# Identifier of this notebook; passed as the `script` argument of metadata_log()
script = 'sbert_finetune_08-2024'
# Time zone used for every timestamp produced in this notebook
local_tz = pytz.timezone('Europe/Berlin')
# Despite its name this is a date stamp (YYYYMMDD) taken when the cell runs;
# it is embedded in the names of the files written further down
local_time = datetime.now(local_tz).strftime('%Y%m%d')

In [4]:
def metadata_log(metadata_dir, output_dir, script, inputs_dir, functions):
  """Append one provenance record for a generated artefact to the metadata CSV.

  Args:
    metadata_dir: Path of the CSV log file; it is opened in append mode.
    output_dir: Path of the artefact that was produced; only its base name is logged.
    script: Identifier of the script/notebook that produced the artefact.
    inputs_dir: Iterable of input paths; only their base names are logged (as a list).
    functions: Free-text description of the processing step or model that was applied.

  The timestamp is taken in the notebook-level ``local_tz`` time zone.
  When the log file is missing or empty a header row is written first, so a
  newly created log is self-describing; an existing log is only appended to.
  """
  fieldnames = ['name', 'script', 'input', 'function', 'timestamp']

  # Local names deliberately avoid shadowing the built-in `input` and the
  # `metadata` module imported from importlib at the top of the notebook.
  record = {
      'name' :  os.path.basename(output_dir),
      'script' : script,
      'input' : [os.path.basename(input_path) for input_path in inputs_dir],
      'function' : functions,
      'timestamp' :  datetime.now(local_tz).strftime('%y-%m-%d %H:%M:%S')
  }

  # Decide before opening: append mode creates the file, which would hide the fact that it is new.
  needs_header = not os.path.exists(metadata_dir) or os.path.getsize(metadata_dir) == 0

  with open(metadata_dir, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if needs_header:
      writer.writeheader()
    writer.writerow(record)

In [5]:
def normalization(input_dir, output_dir, script):
    """Normalise every text cell of a CSV file, save the result and log it.

    String cells get newlines and tabs replaced by spaces and are lower-cased;
    afterwards every character that is neither a word character nor whitespace
    is removed. Non-string cells are left untouched.

    Args:
        input_dir: Path of the CSV file to read.
        output_dir: Path the normalised CSV is written to (without the index).
        script: Script identifier forwarded to ``metadata_log``.

    Returns:
        The normalised DataFrame.
    """
    df = pd.read_csv(input_dir)

    def _flatten_and_lower(cell):
        # Only plain strings are rewritten; numbers and missing values pass through.
        if type(cell) == str:
            return cell.replace('\n', ' ').replace('\t', ' ').lower()
        return cell

    # DataFrame.applymap is deprecated (pandas 2.1+) in favour of DataFrame.map;
    # fall back to applymap on older pandas where DataFrame.map does not exist.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    df = elementwise(_flatten_and_lower)
    # Strip punctuation: anything that is not a word character or whitespace.
    df = df.replace({r'[^\w\s]':''}, regex=True)

    df.to_csv(output_dir, index=False)
    metadata_log(metadata_dir, output_dir, script, [input_dir], 'normalization')
    return df

In [None]:
# zero-shot learning: score every MIT / OCW description pair with the
# pretrained model, without any fine-tuning
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Column schema of the aggregated CSV. It is currently NOT passed to
# load_dataset (the argument is commented out below), so column types are inferred.
features = Features({
    'course_number_title': Value('string'),
    'description-mit': Value('string'),
    'description-ocw': Value('string'),
})

input_dir = R'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_raw_aggregated_20240804.csv'
ground_truth = load_dataset(
    'csv',
    data_files=input_dir,
    split='train'#,
    #features=features
)

# Encode each side in one batched call instead of two encode() calls per row.
mit_embeddings = model.encode(list(ground_truth['description-mit']))
ocw_embeddings = model.encode(list(ground_truth['description-ocw']))

# similarity_pairwise scores row i of the first argument against row i of the
# second with the model's configured similarity function, i.e. the same
# function model.similarity() applies to a single pair.
similarity_score = model.similarity_pairwise(mit_embeddings, ocw_embeddings).tolist()

ground_truth = ground_truth.add_column("similarity_score", similarity_score)
output_dir = Rf"/content/drive/MyDrive/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/MIT_prediction_zeroshot-mpnet_{local_time}.csv"
ground_truth.to_csv(output_dir)
metadata_log(metadata_dir, output_dir, script, [input_dir], 'all-mpnet-base-v2')

In [None]:
# Shuffle the manually augmented pairs (sheet 'all') and persist the shuffled order.
input_dir=R'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_manual_20240826.xlsx'
df = pd.read_excel(input_dir, 'all' )
# A fixed seed makes the shuffled order reproducible when the cell is re-run
# (42 matches the random_state used for the train/eval/test split).
df=shuffle(df, random_state=42)
output_dir=Rf"/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_shuffle_{local_time}.csv"
# The index is written as well, so the CSV keeps each row's pre-shuffle position.
df.to_csv(output_dir)
metadata_log(metadata_dir, output_dir, script, [input_dir], 'sklearn-shuffle')

In [None]:
# Normalise the shuffled pairs. The input file name carries a fixed date stamp
# (20240829), while the output name uses the current `local_time`.
input_dir=R'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_shuffle_20240829.csv'
output_dir=Rf"/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_normalization_{local_time}.csv"
# The returned DataFrame is the cell's last expression, so the notebook displays it.
normalization(input_dir, output_dir, script)

In [None]:
# data preparation: split the normalised pairs into train / eval / test CSV files
input_dir=R'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_normalization_20240829.csv'
output_dir_1 = Rf'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_train_{local_time}.csv'
output_dir_2 = Rf'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_eval_{local_time}.csv'
output_dir_3 = Rf'/content/drive/MyDrive/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_test_{local_time}.csv'

data=pd.read_csv(input_dir)

# 129 rows go to training; of the remainder, 43 rows form the test set and the rest the eval set.
train_data, remaining_data = train_test_split(data, train_size=129, random_state=42)
eval_data, test_data = train_test_split(remaining_data, test_size=43, random_state=42)

# Save each split and log it, in train -> eval -> test order.
for split_frame, split_path in (
    (train_data, output_dir_1),
    (eval_data, output_dir_2),
    (test_data, output_dir_3),
):
    split_frame.to_csv(split_path, index=False)
    metadata_log(metadata_dir, split_path, script, [input_dir], 'sklearn:train_test_split')

In [6]:
# Column schema shared by the three parquet splits.
# NOTE(review): the visible cells only write the splits as CSV; confirm where the
# .parquet files read below are produced.
course_features = Features({
    'description-mit': Value('string'),
    'description-ocw': Value('string'),
    'label': Value('float')
})


def _load_pairs(parquet_path):
    """Load one parquet file as a single dataset cast to `course_features`."""
    return load_dataset(
        'parquet',
        data_files=parquet_path,
        split="train",
        features=course_features
    )


train_dataset = _load_pairs('/home/jovyan/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_train_20240829.parquet')

eval_dataset = _load_pairs('/home/jovyan/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_eval_20240829.parquet')

test_dataset = _load_pairs('/home/jovyan/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_test_20240829.parquet')

In [22]:
# training arguments
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
loss = losses.CoSENTLoss(model)
# Reference the optimizer through torch explicitly: the bare name `AdamW` is imported
# twice at the top of the notebook (torch.optim, then transformers), so the later,
# deprecated transformers class silently wins. eps and weight_decay are spelled out
# to keep that class's defaults (1e-6 and 0.0) instead of torch's (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
train_batch_size = 256
num_epochs = 107
output_dir = '/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024'

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    # Only the current argument name is passed; the deprecated alias
    # `evaluation_strategy="epoch"` duplicated this setting and was dropped.
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_steps only applies to save_strategy="steps"; checkpoints are written per epoch here.
    save_steps=500,
    save_total_limit=3,
    logging_steps=300,
    run_name="few_shot_learning",
)

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.


In [23]:
# Evaluator for the eval split: compares the embedding similarity of each
# description pair with its gold label. It is handed to the trainer below.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["description-mit"],
    sentences2=eval_dataset["description-ocw"],
    scores=eval_dataset["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="fair-oer-dev",
)

In [None]:
# Fine-tune `model` on the training pairs with the loss, arguments and
# evaluator defined in the cells above.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
    # (optimizer, lr_scheduler) pair: only the optimizer is supplied explicitly
    optimizers=(optimizer, None),
    )
trainer.train()

In [25]:
# Same evaluator set-up as for the eval split, but on the held-out test pairs.
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["description-mit"],
    sentences2=test_dataset["description-ocw"],
    scores=test_dataset["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="fair-oer-test",
)
# Calling the evaluator scores `model` and returns a dict of correlation
# metrics whose keys are prefixed with the evaluator name.
test_evaluator(model)

{'fair-oer-test_pearson_cosine': 0.7409764421917553,
 'fair-oer-test_spearman_cosine': 0.7473025735565767,
 'fair-oer-test_pearson_manhattan': 0.7363301285462346,
 'fair-oer-test_spearman_manhattan': 0.7390870824057955,
 'fair-oer-test_pearson_euclidean': 0.7413213451539604,
 'fair-oer-test_spearman_euclidean': 0.7473025735565767,
 'fair-oer-test_pearson_dot': 0.7409764734754448,
 'fair-oer-test_spearman_dot': 0.7473025735565767,
 'fair-oer-test_pearson_max': 0.7413213451539604,
 'fair-oer-test_spearman_max': 0.7473025735565767}

In [26]:
# Save the model under a directory name that encodes the batch size (B256),
# the number of epochs (E107), the learning rate (2e-5) and the run date.
final_output_dir = f"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/model_finetuned_B256_E107_2e-5_{local_time}"
model.save(final_output_dir)
# The inputs are logged as the plain labels 'train', 'test', 'eval' rather than as file paths.
metadata_log(metadata_dir, final_output_dir, script, ['train', 'test', 'eval'], 'B256_E107_2e-5')

In [80]:
# Score the held-out test pairs with the fine-tuned checkpoint, using plain
# transformers (tokenizer + encoder) and manual mean pooling.
model_directory = R"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/model_finetuned_B256_E107_2e-5_20240908"
checkpoints = [model_directory]
data = pd.read_csv(R"/home/jovyan/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_test_20240829.csv")

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    `model_output[0]` is taken as the per-token embeddings; `attention_mask`
    is broadcast over the embedding dimension so masked tokens contribute
    nothing, and the divisor is clamped to avoid division by zero.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embed_texts(texts, tokenizer, model, max_length=512):
    """Return one L2-normalised, mean-pooled embedding per text as an (n, dim) tensor.

    Each text is encoded on its own and truncated to `max_length` tokens. No
    padding to `max_length` is needed because padded positions are excluded
    by the pooling anyway. Only the pooled vector is kept per text, not the
    full model output.
    """
    if not texts:
        return torch.empty((0, model.config.hidden_size))
    pooled = []
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            sentence_vector = mean_pooling(model(**encoded), encoded['attention_mask'])
            pooled.append(F.normalize(sentence_vector, p=2, dim=1))
    return torch.cat(pooled, dim=0)

for checkpoint in checkpoints:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # NOTE: this rebinds the notebook-level name `model` to a plain transformers encoder.
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()
    MITHB = data['description-mit'].tolist()
    MITOCW = data['description-ocw'].tolist()

    MITHB_pooled_embeddings = embed_texts(MITHB, tokenizer, model)
    MITOCW_pooled_embeddings = embed_texts(MITOCW, tokenizer, model)

    # Row-wise cosine similarity between the two descriptions of each pair,
    # stored as a flat float64 vector with one score per row of `data`.
    similarity_scores = F.cosine_similarity(MITOCW_pooled_embeddings, MITHB_pooled_embeddings, dim=1).double().numpy()
    data['similarity_score'] = similarity_scores
    data.to_csv(rf"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/MIT_test_finetuned_{local_time}.csv", index=False)

In [12]:
# Score the held-out test pairs with the pretrained (not fine-tuned) baseline,
# using plain transformers (tokenizer + encoder) and manual mean pooling.
model_directory = "sentence-transformers/all-mpnet-base-v2"
checkpoints = [model_directory]
data = pd.read_csv(R"/home/jovyan/Publication/fair_oer/02-Data/01-Processed/MIT_augmented_test_20240829.csv")

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    `model_output[0]` is taken as the per-token embeddings; `attention_mask`
    is broadcast over the embedding dimension so masked tokens contribute
    nothing, and the divisor is clamped to avoid division by zero.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embed_texts(texts, tokenizer, model, max_length=512):
    """Return one L2-normalised, mean-pooled embedding per text as an (n, dim) tensor.

    Each text is encoded on its own and truncated to `max_length` tokens. No
    padding to `max_length` is needed because padded positions are excluded
    by the pooling anyway. Only the pooled vector is kept per text, not the
    full model output.
    """
    if not texts:
        return torch.empty((0, model.config.hidden_size))
    pooled = []
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            sentence_vector = mean_pooling(model(**encoded), encoded['attention_mask'])
            pooled.append(F.normalize(sentence_vector, p=2, dim=1))
    return torch.cat(pooled, dim=0)

for checkpoint in checkpoints:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # NOTE: this rebinds the notebook-level name `model` to a plain transformers encoder.
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()
    MITHB = data['description-mit'].tolist()
    MITOCW = data['description-ocw'].tolist()

    MITHB_pooled_embeddings = embed_texts(MITHB, tokenizer, model)
    MITOCW_pooled_embeddings = embed_texts(MITOCW, tokenizer, model)

    # Row-wise cosine similarity between the two descriptions of each pair,
    # stored as a flat float64 vector with one score per row of `data`.
    similarity_scores = F.cosine_similarity(MITOCW_pooled_embeddings, MITHB_pooled_embeddings, dim=1).double().numpy()
    data['similarity_score'] = similarity_scores
    data.to_csv(rf"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/MIT_test_mpnet_{local_time}.csv", index=False)



In [81]:
# Load the per-pair scores of the baseline and the fine-tuned model.
# NOTE(review): the date stamp 20240910 is hard-coded, whereas the scoring cells name
# their output with the current `local_time`; adjust these paths when re-running on another day.
mpnet = pd.read_csv(R"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/MIT_test_mpnet_20240910.csv")
finetuned = pd.read_csv(R"/home/jovyan/Publication/fair_oer/02-Data/02-Analyzed/sbert_finetune_08-2024/MIT_test_finetuned_20240910.csv")

In [82]:
# Gold labels are read from the baseline file only and reused for both models.
# NOTE(review): this assumes both result files hold the same rows in the same order;
# that is not checked here.
ground_truth = mpnet['label'].values
mpnet_eval = mpnet['similarity_score'].values
finetuned_eval = finetuned['similarity_score'].values

In [83]:
# Baseline model: correlation with, and error against, the gold labels.
pearson_mpnet, _ = pearsonr(ground_truth, mpnet_eval)
spearman_mpnet, _ = spearmanr(ground_truth, mpnet_eval)
mse_mpnet = mean_squared_error(ground_truth, mpnet_eval)
mae_mpnet = mean_absolute_error(ground_truth, mpnet_eval)

# Fine-tuned model: the same four metrics.
pearson_finetuned, _ = pearsonr(ground_truth, finetuned_eval)
spearman_finetuned, _ = spearmanr(ground_truth, finetuned_eval)
mse_finetuned = mean_squared_error(ground_truth, finetuned_eval)
mae_finetuned = mean_absolute_error(ground_truth, finetuned_eval)

# One summary line per model, baseline first.
print(f"MPNet - Pearson: {pearson_mpnet}, Spearman: {spearman_mpnet}, MSE: {mse_mpnet}, MAE: {mae_mpnet}")
print(f"finetuned - Pearson: {pearson_finetuned}, Spearman: {spearman_finetuned}, MSE: {mse_finetuned}, MAE: {mae_finetuned}")

MPNet - Pearson: 0.6787011147787431, Spearman: 0.7036145879042915, MSE: 0.08350638970971137, MAE: 0.22125189387521074
finetuned - Pearson: 0.7409766478127822, Spearman: 0.7473025735565767, MSE: 0.045965845236970795, MAE: 0.16177826228789813
