In [None]:
import os
import numpy as np
import torch
import nltk
import spacy

from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity


# Fetch the NLTK resources requested by this notebook (same order as before).
for resource_name in ("omw-1.4", "punkt", "stopwords", "wordnet"):
    nltk.download(resource_name)


# Small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")


# Hugging Face tokenizer and encoder for the uncased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
# Sample sentence kept at module level for ad-hoc experiments.
test = (
    "On the question of foreign trade, previous leaders were guided by "
    "a shameful policy of capitulation, submission, and retreat."
)

In [None]:
# Load GloVe word vectors
def load_glove_embeddings(glove_file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Args:
        glove_file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to a ``float32`` NumPy vector, or ``None`` when
        the file cannot be opened/decoded or a line cannot be parsed as floats.
    """
    embeddings_dict = {}
    try:
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.strip().split()
                if not values:
                    # Blank line (e.g. trailing newline at end of file): nothing to parse.
                    continue
                word = values[0]
                vector = np.array(values[1:], dtype='float32')
                embeddings_dict[word] = vector
    except (OSError, ValueError) as exc:
        # OSError covers a missing/unreadable file; ValueError covers non-numeric
        # components and UnicodeDecodeError. The previous `except Error` referred
        # to an undefined name and therefore raised NameError instead of
        # returning None.
        print(f"Could not load GloVe vectors from {glove_file_path!r}: {exc}")
        return None
    print(f"Got {len(embeddings_dict)} word vectors!")
    return embeddings_dict

# Path to the GloVe vector file (absolute path; judging by the file name this
# is the 50-dimensional 6B release -- adjust the location for your environment).
glove_file = "/content/glove.6B.50d.txt"
# Module-level lookup table: find_outlier_words reads `glove_embeddings` as a global.
glove_embeddings = load_glove_embeddings(glove_file)

Got 400001 word vectors!


In [None]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Heavyweight models are built once and reused, so repeated calls do not reload
# the spaCy pipeline and the SBERT encoder from disk every time.
_OUTLIER_MODEL_CACHE = {}


def _cached_model(key, factory):
    """Return the object stored under `key`, building it with `factory` on first use."""
    if key not in _OUTLIER_MODEL_CACHE:
        _OUTLIER_MODEL_CACHE[key] = factory()
    return _OUTLIER_MODEL_CACHE[key]


def _mean_similarity_to_others(similarity_matrix):
    """For each row i, average similarity_matrix[i][j] over every j != i (0 if there are none)."""
    size = len(similarity_matrix)
    averages = []
    for i in range(size):
        others = [similarity_matrix[i][j] for j in range(size) if j != i]
        averages.append(np.mean(others) if others else 0)
    return averages


def _below_one_std(candidates, scores):
    """Return (candidate, score) pairs whose score is more than one std below the mean score."""
    cutoff = np.mean(scores) - np.std(scores)
    return [(item, score) for item, score in zip(candidates, scores) if score < cutoff]


def find_outlier_words(text, coherence_threshold=0.7, sbert_threshold=0.8):
    """Flag words that are semantically out of place in `text` using three detectors.

    The text is tokenized with spaCy; only alphabetic, non-stop-word tokens are
    kept (lower-cased). Three independent checks are then run on those words:

    * GloVe: pairwise cosine similarity of the vectors found in the module-level
      `glove_embeddings`; a word is an outlier when its mean similarity to the
      other words is more than one standard deviation below the average.
    * WordNet: the same rule applied to Wu-Palmer similarity between the first
      synset of each word.
    * SBERT: a word is an outlier when removing it drops the cosine similarity
      between the full and the leave-one-out sentence embedding below
      `sbert_threshold`.

    Args:
        text: Sentence to analyse. Empty / whitespace-only / None input yields
            the empty result without loading any model.
        coherence_threshold: If the mean GloVe score of the sentence exceeds
            this value the sentence is treated as coherent and all three
            outlier lists are returned empty (default 0.7, found by trial and
            error).
        sbert_threshold: Leave-one-out similarity below which a word is
            reported by the SBERT detector (default 0.8).

    Returns:
        A 6-tuple ``("glove", glove_outliers, "wordnet", wordnet_outliers,
        "sbert", sbert_outliers)`` where each list holds ``(word, score)``
        pairs. Every code path, including empty input, returns this shape.
    """
    if not text or not text.strip():
        # Nothing to tokenize; keep the same labelled shape as every other path.
        return "glove", [], "wordnet", [], "sbert", []

    # tokenize
    nlp = _cached_model("spacy", lambda: spacy.load("en_core_web_sm"))
    words = [tok.text.lower() for tok in nlp(text) if tok.is_alpha and not tok.is_stop]

    if not words:
        # Previously returned a 4-tuple of empty lists here, which did not match
        # the 6-tuple returned everywhere else.
        return "glove", [], "wordnet", [], "sbert", []

    # Glove-based
    # load_glove_embeddings may have returned None; treat that as "no vectors".
    embeddings = glove_embeddings if glove_embeddings is not None else {}
    glove_words = []
    glove_vectors = []
    for word in words:
        if word in embeddings:
            glove_vectors.append(embeddings[word])
            glove_words.append(word)
        else:
            print(f"Skipping '{word}' - not in GloVe word list")

    glove_outliers = []
    if glove_vectors:
        glove_scores = _mean_similarity_to_others(cosine_similarity(glove_vectors))
        if np.mean(glove_scores) > coherence_threshold:
            # Deliberate short-circuit: a coherent sentence skips the WordNet
            # and SBERT detectors entirely.
            print("High semantic coherence - likely no metaphors")
            return "glove", [], "wordnet", [], "sbert", []
        glove_outliers = _below_one_std(glove_words, glove_scores)

    # wordnet
    wordnet_words = []
    wordnet_synsets = []
    for word in words:
        synsets = wn.synsets(word)
        if synsets:
            wordnet_words.append(word)
            # Take the first synset for simplicity; looked up once per word
            # instead of once per pair.
            wordnet_synsets.append(synsets[0])
        else:
            print(f"'{word}'not in wordnet")

    wordnet_outliers = []
    if len(wordnet_words) > 1:
        wup_matrix = []
        for i, synset_a in enumerate(wordnet_synsets):
            row = []
            for j, synset_b in enumerate(wordnet_synsets):
                if i == j:
                    row.append(1.0)  # Same position, max similarity
                    continue
                # wup_similarity returns None when no path exists.
                sim = synset_a.wup_similarity(synset_b)
                row.append(sim if sim is not None else 0.0)
            wup_matrix.append(row)
        wordnet_outliers = _below_one_std(
            wordnet_words, _mean_similarity_to_others(wup_matrix)
        )

    # --- SBERT-based outlier detection ---
    # `words` is non-empty here, so the joined sentence is never blank.
    sbert_model = _cached_model("sbert", lambda: SentenceTransformer("all-MiniLM-L6-v2"))
    full_embedding = sbert_model.encode(" ".join(words))
    sbert_outliers = []
    for i, word in enumerate(words):
        remaining = words[:i] + words[i + 1:]
        if not remaining:
            # Single-word input: there is no leave-one-out sentence to compare.
            continue
        loo_embedding = sbert_model.encode(" ".join(remaining))
        similarity = cosine_similarity([full_embedding], [loo_embedding])[0][0]
        if similarity < sbert_threshold:
            sbert_outliers.append((word, similarity))

    return "glove", glove_outliers, "wordnet", wordnet_outliers, "sbert", sbert_outliers

In [None]:
# Example input containing a metaphor.
sentence = "Hope is the last candle in the dark."

In [None]:
# Run the detectors on the sample sentence; as the last expression in the
# cell, the returned tuple is shown as the cell output.
find_outlier_words(sentence)

('glove',
 [('candle', np.float32(0.28746852))],
 'wordnet',
 [('candle', np.float64(0.11805555555555555))],
 'sbert',
 [('candle', np.float32(0.6933816))])