<a href="https://colab.research.google.com/github/AkshatBhatnagar29/Assg2-Topsis-on-Pretrained-Models/blob/main/Topsis_on_Pretrained_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q sentence-transformers scikit-learn pandas numpy scipy torch


In [None]:
import time
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr
from datasets import load_dataset

# Model identifiers to benchmark; each one is later passed to SentenceTransformer().
models_list = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    "sentence-transformers/all-distilroberta-v1",
]

# Load the validation split of the GLUE "stsb" configuration.
print("Loading STS Benchmark dataset...")
sts = load_dataset("glue", "stsb", split="validation")

In [None]:
# Materialise the split as a DataFrame and divide the label column by 5
# (presumably mapping a 0-5 annotation scale onto 0-1 — TODO confirm).
df = pd.DataFrame(sts).assign(label=lambda frame: frame["label"] / 5.0)

df.head()


In [None]:
# Benchmark every model in `models_list` on the sentence pairs in `df`.
# Each row appended to `results` is:
#   [short model name, Pearson r, Spearman rho, seconds per sentence pair, embedding dim]

# Loop-invariant inputs are prepared once, outside the timed region, so the
# measured time covers only encoding and similarity computation.
sentences1 = df["sentence1"].tolist()
sentences2 = df["sentence2"].tolist()
human_scores = df["label"].to_numpy()
n_pairs = len(df)

results = []

for model_id in models_list:
    model_name = model_id.split("/")[-1]
    print(f"\nEvaluating model: {model_name}")

    # Model loading is intentionally left outside the timed region.
    model = SentenceTransformer(model_id)

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # Batch encode both sides of every pair
    embeddings1 = model.encode(
        sentences1,
        batch_size=32,
        convert_to_numpy=True,
        show_progress_bar=True
    )

    embeddings2 = model.encode(
        sentences2,
        batch_size=32,
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # Row-wise cosine similarity (vectorized): dot product over the product of norms
    model_scores = np.sum(embeddings1 * embeddings2, axis=1) / (
        np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    )

    elapsed_time = time.perf_counter() - start_time
    avg_time = elapsed_time / n_pairs

    # Correlation of model similarity with the human scores
    pearson = pearsonr(model_scores, human_scores)[0]
    spearman = spearmanr(model_scores, human_scores)[0]

    embedding_dim = model.get_sentence_embedding_dimension()

    results.append([
        model_name,
        pearson,
        spearman,
        avg_time,
        embedding_dim
    ])


In [None]:
# Turn the per-model result rows into a labelled table (one row per model).
decision_columns = ["Model", "Pearson", "Spearman", "Avg_Time", "Embedding_Dim"]
decision_matrix = pd.DataFrame(results, columns=decision_columns)

decision_matrix


In [None]:
# Model size in MB, keyed by the short model name stored in decision_matrix["Model"].
# NOTE(review): these are hand-entered values — confirm against the published model files.
model_sizes = {
    "all-MiniLM-L6-v2": 90,
    "all-mpnet-base-v2": 420,
    "paraphrase-MiniLM-L6-v2": 90,
    "multi-qa-MiniLM-L6-cos-v1": 90,
    "all-distilroberta-v1": 305
}

# Fail loudly when an evaluated model has no size entry: Series.map would otherwise
# insert NaN for it, and that NaN would silently propagate through the
# normalisation, distance and score computations that follow.
missing_sizes = sorted(set(decision_matrix["Model"]) - set(model_sizes))
if missing_sizes:
    raise KeyError(f"No Model_Size_MB entry for: {missing_sizes}")

decision_matrix["Model_Size_MB"] = decision_matrix["Model"].map(model_sizes)
decision_matrix


In [None]:
# TOPSIS inputs: one row per model, one column per criterion.
# The criteria columns are selected explicitly instead of "everything except Model",
# so this cell still yields a five-column matrix when it is re-run after the
# TOPSIS_Score and Rank columns have been added to decision_matrix.
criteria = ["Pearson", "Spearman", "Avg_Time", "Embedding_Dim", "Model_Size_MB"]
topsis_data = decision_matrix[criteria].to_numpy(dtype=float)

# Relative importance of each criterion, in the same order as `criteria`.
weights = np.array([0.3, 0.3, 0.15, 0.1, 0.15])

# '+' marks a criterion where larger is better, '-' one where smaller is better.
impacts = ['+', '+', '-', '-', '-']

if not (len(criteria) == len(weights) == len(impacts)):
    raise ValueError(
        f"criteria ({len(criteria)}), weights ({len(weights)}) and "
        f"impacts ({len(impacts)}) must have the same length"
    )


In [None]:
# Vector-normalise each criterion column (divide by its Euclidean length),
# then scale every column by its criterion weight.
column_lengths = np.sqrt(np.square(topsis_data).sum(axis=0))
norm = topsis_data / column_lengths
weighted = norm * weights


In [None]:
# Per-criterion ideal points. For a '+' criterion the best value is the column
# maximum and the worst is the column minimum; for any other impact it is reversed.
is_benefit = np.array(impacts) == '+'
column_max = weighted.max(axis=0)
column_min = weighted.min(axis=0)

ideal_best = np.where(is_benefit, column_max, column_min)
ideal_worst = np.where(is_benefit, column_min, column_max)


In [None]:
# Euclidean distance from every model (row) to the ideal-best and ideal-worst points.
gap_to_best = weighted - ideal_best
gap_to_worst = weighted - ideal_worst
dist_best = np.sqrt(np.square(gap_to_best).sum(axis=1))
dist_worst = np.sqrt(np.square(gap_to_worst).sum(axis=1))

# Relative closeness: the share of the total distance that separates a model
# from the worst point, so a larger score means closer to the ideal-best point.
topsis_score = dist_worst / (dist_best + dist_worst)

# Store the score and rank it in descending order (highest score gets rank 1).
decision_matrix["TOPSIS_Score"] = topsis_score
decision_matrix["Rank"] = decision_matrix["TOPSIS_Score"].rank(ascending=False)

decision_matrix.sort_values("Rank")


In [None]:
# The first row after ordering by Rank is the top-ranked model.
ranked_models = decision_matrix.sort_values(by="Rank")
best_model = ranked_models.iloc[0]
best_model


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Take the scores straight from decision_matrix so the figure always reflects
#    the current run (a pasted-in copy of the numbers goes stale on re-run).
#    A dedicated name is used so the STS DataFrame `df`, which the evaluation
#    cell reads, is not overwritten.
# 2. Sort descending so the highest score is drawn as the top bar.
ranking_df = (
    decision_matrix[["Model", "TOPSIS_Score"]]
    .sort_values(by="TOPSIS_Score", ascending=False)
    .reset_index(drop=True)
)

# 3. Create the plot (theme is set before the figure is created)
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# Horizontal bar plot, one colour per model from the viridis palette
sns.barplot(
    x='TOPSIS_Score',
    y='Model',
    data=ranking_df,
    palette='viridis',
    hue='Model',
    legend=False,
    ax=ax
)

# 4. Add labels and title
ax.set_title('TOPSIS Ranking of Pretrained Models', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('TOPSIS Score', fontsize=12)
ax.set_ylabel('Model Name', fontsize=12)
ax.set_xlim(0, 1.1)  # Set x-axis limit slightly above 1 for spacing

# 5. Add the score text at the end of each bar for clarity.
#    Bars are drawn in row order, so the enumerate position is the bar's y coordinate.
for position, score in enumerate(ranking_df['TOPSIS_Score']):
    ax.text(score + 0.01, position, f"{score:.4f}", va='center', fontweight='bold', color='black')

# 6. Save and show the plot
fig.tight_layout()
fig.savefig('topsis_ranking_graph.png', dpi=300)  # Saves high-res image
plt.show()