In [None]:
!pip install gensim




In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the model checkpoints loaded by the
# following cells are read from paths under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api

# Word2Vec checkpoints saved on the mounted Drive (CBOW and Skip-gram variants).
cbow_path = r"/content/drive/MyDrive/HW2/cbow.model"
skipgram_path = r"/content/drive/MyDrive/HW2/skipgram.model"

cbow_model = Word2Vec.load(cbow_path)
skipgram_model = Word2Vec.load(skipgram_path)

print("Trained embeddings loaded successfully!")

# Pre-trained GloVe vectors (100-dimensional) fetched through the gensim downloader.
glove_dataset = "glove-wiki-gigaword-100"
glove_model = api.load(glove_dataset)
print("GloVe embeddings loaded!")


Trained embeddings loaded successfully!
GloVe embeddings loaded!


In [None]:
# Pre-trained FastText vectors (300-dimensional) fetched through the gensim downloader.
fasttext_dataset = "fasttext-wiki-news-subwords-300"
fasttext_model = api.load(fasttext_dataset)
print("FastText embeddings loaded!")

FastText embeddings loaded!


In [None]:
def compare_vector_arithmetic(positive, negative, models, topn=1):
    """Run one word-vector arithmetic query against several embedding models.

    Parameters:
      - positive: list of words passed to ``most_similar`` as ``positive``.
      - negative: list of words passed to ``most_similar`` as ``negative``.
      - models: dict mapping a display name to an object that exposes
        ``most_similar(positive=..., negative=..., topn=...)``.
      - topn: how many neighbours to request from each model. Defaults to 1,
        which was previously hard-coded, so existing callers are unaffected.

    Returns:
      A dict keyed by model name. Each value is whatever that model's
      ``most_similar`` returned, or the string "Words not in vocabulary"
      when the model raised ``KeyError`` for the query.
    """
    results = {}
    for name, model in models.items():
        try:
            results[name] = model.most_similar(positive=positive, negative=negative, topn=topn)
        except KeyError:
            # Record the failure for this model only, so the remaining models
            # are still queried and reported.
            results[name] = "Words not in vocabulary"
    return results

# Business/Tech Vector Arithmetic Queries

# Build the name -> vectors mapping in this cell so it runs on a fresh kernel
# (Restart & Run All); previously `models` was only defined in a later cell.
# The Word2Vec models expose their vectors through `.wv`; the downloader
# objects are queried directly.
models = {
    "CBOW": cbow_model.wv,
    "Skip-gram": skipgram_model.wv,
    "GloVe": glove_model,
    "FastText": fasttext_model,
}

# Each entry is (base, minus, plus) and is evaluated as base - minus + plus.
# The printed label is generated from the same tuple, so the label and the
# words actually queried cannot drift apart. (Query 3 used to pass "software"
# while its label said 'student'.)
analogy_queries = [
    ("invest", "risk", "return"),
    ("technology", "old", "new"),
    ("student", "quit", "graduate"),
    ("profit", "loss", "growth"),
    ("leader", "male", "female"),
]

for query_number, (base, minus, plus) in enumerate(analogy_queries, start=1):
    print(f"Query {query_number}: '{base}' - '{minus}' + '{plus}'")
    print(compare_vector_arithmetic(positive=[base, plus], negative=[minus], models=models))
    print("----------------------------------------------------")


Query 1: 'invest' - 'risk' + 'reward'
{'CBOW': [('pay', 0.6269975304603577)], 'Skip-gram': [('buy', 0.5625247955322266)], 'GloVe': [('invested', 0.637974202632904)], 'FastText': [('reinvest', 0.6238330006599426)]}
----------------------------------------------------
Query 2: 'technology' - 'old' + 'new'
{'CBOW': [('innovation', 0.5668476223945618)], 'Skip-gram': [('playability', 0.5604642033576965)], 'GloVe': [('technologies', 0.7700199484825134)], 'FastText': [('technologies', 0.7342677712440491)]}
----------------------------------------------------
Query 3: 'student' - 'quit' + 'graduate'
{'CBOW': 'Words not in vocabulary', 'Skip-gram': 'Words not in vocabulary', 'GloVe': [('computer', 0.6472473740577698)], 'FastText': [('undergraduate', 0.631050705909729)]}
----------------------------------------------------
Query 4: 'profit' - 'loss' + 'growth'
{'CBOW': [('economy', 0.6394561529159546)], 'Skip-gram': [('economy', 0.5914967060089111)], 'GloVe': [('profits', 0.692354142665863)], 'F

In [None]:
pip install wefe==0.2.1



In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import gensim.downloader as api

# NOTE(review): these loads repeat the ones performed by the earlier cells of
# this notebook; they are kept so this cell can also be run on its own.

# Self-trained Word2Vec checkpoints (update the paths as needed).
cbow_checkpoint = r"/content/drive/MyDrive/HW2/cbow.model"
skipgram_checkpoint = r"/content/drive/MyDrive/HW2/skipgram.model"
cbow_model = Word2Vec.load(cbow_checkpoint)
skipgram_model = Word2Vec.load(skipgram_checkpoint)

# Pre-trained embeddings from the gensim downloader, loaded one after the
# other so each name is rebound before the next download/load starts.
glove_model = api.load("glove-wiki-gigaword-100")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")  # used directly, no .wv needed

# Display name -> vectors object, in the order the results are reported.
models = {}
models["CBOW"] = cbow_model.wv
models["Skip-gram"] = skipgram_model.wv
models["GloVe"] = glove_model
models["FastText"] = fasttext_model

def compute_rnsb(model, target1, target2, attribute1, attribute2):
    """
    Compute a simple Relative Norm Bias score.

    Parameters:
      - model: object supporting ``word in model`` and ``model[word]``
        (e.g. a gensim KeyedVectors model).
      - target1: list of words (e.g., male terms).
      - target2: list of words (e.g., female terms).
      - attribute1: list of attribute words for one group (e.g., leadership attributes).
      - attribute2: list of attribute words for the other group (e.g., subordinate attributes).

    Words missing from the model are skipped. The attribute baseline is the
    mean of all found vectors from attribute1 and attribute2 pooled together,
    so swapping the two attribute lists does not change the score.
    The bias score is:

         100 * (avg_norm(target1) - avg_norm(target2)) / (avg_norm(target1) + avg_norm(target2))

    where avg_norm(targetX) is the average Euclidean distance of each target
    word from the attribute baseline.

    Returns:
      The score as a number; 0.0 when both average distances are zero (every
      target vector coincides with the baseline), instead of the NaN that
      0/0 would produce.

    Raises:
      ValueError: if any of the four word lists has no word in the model.

    NOTE(review): the name abbreviates to "RNSB", but the computation is the
    relative-norm formula above — confirm it is not meant to be the WEFE
    package's RNSB metric.
    """
    def _known_vectors(words):
        # Keep only the words the model can look up.
        return [model[word] for word in words if word in model]

    target1_vecs = _known_vectors(target1)
    target2_vecs = _known_vectors(target2)
    attr1_vecs = _known_vectors(attribute1)
    attr2_vecs = _known_vectors(attribute2)

    if not (target1_vecs and target2_vecs and attr1_vecs and attr2_vecs):
        raise ValueError("One of the sets has no words in the model's vocabulary.")

    # List concatenation pools both attribute sets before averaging.
    attr_baseline = np.mean(attr1_vecs + attr2_vecs, axis=0)

    def _mean_distance(vectors):
        # Average Euclidean distance of the vectors from the attribute baseline.
        return np.mean([np.linalg.norm(vec - attr_baseline) for vec in vectors])

    avg_norm1 = _mean_distance(target1_vecs)
    avg_norm2 = _mean_distance(target2_vecs)

    # Distances are non-negative, so the sum is zero only when both are zero.
    total_norm = avg_norm1 + avg_norm2
    if total_norm == 0:
        return 0.0

    return 100 * (avg_norm1 - avg_norm2) / total_norm

# Word sets for the example probe (leadership gender bias): two gendered
# target sets and two role-related attribute sets.
target_1 = ["man", "male", "gentleman"]
target_2 = ["woman", "female", "lady"]
attribute_1 = ["leader", "director", "executive"]
attribute_2 = ["follower", "employee", "subordinate"]

# Score every model; a failure is stored as a message so the remaining
# models are still evaluated and shown in the table.
results = {}
for model_name, model in models.items():
    try:
        score = compute_rnsb(model, target_1, target_2, attribute_1, attribute_2)
    except Exception as e:
        results[model_name] = f"Error: {e}"
    else:
        results[model_name] = score

# One row per model, single "RNSB Score" column.
df_results = pd.DataFrame.from_dict(results, orient="index", columns=["RNSB Score"])
print("RNSB Scores:", df_results, sep="\n")


RNSB Scores:
           RNSB Score
CBOW       -11.402394
Skip-gram   -1.096881
GloVe       -2.926885
FastText     0.633558
