In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Load Word List from Excel ===
excel_path = "/content/Words.xlsx"  # <-- Replace with actual filename
df_words = pd.read_excel(excel_path)

# The words are read from the column named 'Word' (capitalised).
# Excel cells such as TRUE or a number are parsed as bool/int rather than text, and the
# tokenizer rejects non-string input, so every entry is converted to a stripped string
# and blank entries are dropped.
word_list = [
    text
    for text in (str(entry).strip() for entry in df_words['Word'].dropna())
    if text
]

# === Load Pretrained Model and Tokenizer ===
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()  # inference only: switch off dropout and other training-time behaviour

# === Functions ===
def get_definition(word):
    """Generate a free-text definition of ``word`` with the loaded causal LM.

    Args:
        word: The term to define; it is interpolated into the prompt
            ``"Define {word}."``.

    Returns:
        The decoded output sequence as a string. The text begins with the prompt
        itself, followed by up to 50 newly generated tokens.
    """
    prompt = f"Define {word}."
    inputs = tokenizer(prompt, return_tensors="pt")
    # Passing pad_token_id explicitly selects the same value generate() would fall
    # back to, but avoids the "Setting `pad_token_id` to `eos_token_id`" warning
    # that is otherwise printed once per word.
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def get_embedding(word):
    """Return a mean-pooled hidden-state embedding of ``word``.

    Args:
        word: The text to embed; it is tokenized on its own, without a prompt.

    Returns:
        A NumPy array holding the backbone's last hidden states averaged over the
        token axis (presumably 1-D with one entry per hidden unit).
    """
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        # `base_model` is the headless backbone of a Hugging Face model (the module
        # reached as `model.transformer` for GPT-2), so the lookup no longer depends
        # on an architecture-specific attribute name.
        outputs = model.base_model(**inputs)
    # Average over the token axis, then drop only the batch axis.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

# === Generate Baseline Definitions and Embeddings ===
# One record per word; a word that raises is reported and skipped so that a single
# bad entry does not abort the whole run.
results = []
for word in word_list:
    try:
        record = {
            "word": word,
            "definition": get_definition(word),
            "embedding": get_embedding(word),
        }
    except Exception as e:
        print(f"Failed for word '{word}': {e}")
    else:
        results.append(record)

# === Save Results ===
# The embedding column holds array objects, hence pickle rather than a text format.
df_results = pd.DataFrame(results)
df_results.to_pickle("baseline_definitions_embeddings.pkl")
print("Baseline results saved to baseline_definitions_embeddings.pkl")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Failed for word 'True': text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

✅ Baseline results saved to baseline_definitions_embeddings.pkl


In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Load Word List from Excel ===
excel_path = "/content/Words.xlsx"  # Replace with actual filename
df_words = pd.read_excel(excel_path)

# Words live in the 'Word' column: stringify every non-missing entry and keep only
# the purely alphabetic ones.
candidate_words = (str(entry) for entry in df_words['Word'].dropna())
word_list = [candidate for candidate in candidate_words if candidate.isalpha()]

# === Load SmolLM2-135M ===
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so the inference-mode switch can be chained.
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

# === Functions ===
def get_definition(word):
    """Generate a free-text definition of ``word`` with the loaded causal LM.

    Args:
        word: The term to define; it is interpolated into the prompt
            ``"Define {word}."``.

    Returns:
        The decoded output sequence as a string. The text begins with the prompt
        itself, followed by up to 50 newly generated tokens.
    """
    prompt = f"Define {word}."
    inputs = tokenizer(prompt, return_tensors="pt")
    # Passing pad_token_id explicitly selects the same value generate() would fall
    # back to, but avoids the "Setting `pad_token_id` to `eos_token_id`" warning
    # that is otherwise printed once per word.
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def get_embedding(word):
    """Return a mean-pooled hidden-state embedding of ``word``.

    Args:
        word: The text to embed; it is tokenized on its own, without a prompt.

    Returns:
        A NumPy array holding the backbone's last hidden states averaged over the
        token axis (presumably 1-D with one entry per hidden unit).
    """
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        # `base_model` is the headless backbone of a Hugging Face model (the module
        # reached as `model.model` for SmolLM2), so the lookup no longer depends on
        # an architecture-specific attribute name.
        outputs = model.base_model(**inputs)
    # Average over the token axis, then drop only the batch axis.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()


# === Generate Baseline Definitions and Embeddings ===
# One record per word; a word that raises is reported and skipped so that a single
# bad entry does not abort the whole run.
results = []
for word in word_list:
    try:
        record = {
            "word": word,
            "definition": get_definition(word),
            # Stored as a plain Python list instead of a numpy array.
            "embedding": get_embedding(word).tolist(),
        }
    except Exception as e:
        print(f"Failed for word '{word}': {e}")
    else:
        results.append(record)

# === Save Results ===
df_results = pd.DataFrame(results)
df_results.to_pickle("baseline_definitions_embeddings.pkl")
print("Baseline results saved to baseline_definitions_embeddings.pkl")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for o

Baseline results saved to baseline_definitions_embeddings.pkl


In [None]:
pip install nltk




In [None]:
import nltk
from nltk.corpus import wordnet as wn

# Download the corpora needed for the WordNet lookups.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
import pandas as pd
import torch
# Re-read the source word list from the Excel workbook.
excel_path = "/content/Words.xlsx"  # Replace with actual filename
df_words = pd.read_excel(excel_path)

In [None]:
def get_wordnet_definition(word, pos=None):
    """Look up a reference definition for ``word`` in WordNet.

    Args:
        word: The lemma to look up.
        pos: Optional WordNet part-of-speech tag (for example ``wn.ADJ``) forwarded
            to ``wn.synsets`` to restrict the lookup. The default ``None`` searches
            every part of speech, which is the original behaviour.

    Returns:
        The definition of the first matching synset, or ``None`` when WordNet
        returns no synset for the word.
    """
    synsets = wn.synsets(word, pos=pos)
    # Only the first synset is used. Without `pos` that can be an unintended sense:
    # 'handy', for instance, resolves to a "United States blues musician" entry.
    return synsets[0].definition() if synsets else None


In [None]:
import pandas as pd

# Load the baseline results (word, generated definition, embedding) from the pickle
df = pd.read_pickle("baseline_definitions_embeddings.pkl")

# Preview the first ten rows
df.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding
0,adaxial,Define adaxial.\n\nThe adaxial is the part of ...,"[-0.19158275425434113, 1.1911396980285645, 0.2..."
1,hoggish,Define hoggish.\n\nThe word hoggish is a contr...,"[-0.7336127758026123, 0.6614903807640076, 0.89..."
2,adsorptive,Define adsorptive.\n\nAdsorptive is a process ...,"[-0.2344677895307541, 0.1635841578245163, 0.83..."
3,unreachable,Define unreachable.\n\nThe following example s...,"[-0.25693729519844055, 0.12566016614437103, 0...."
4,wolfish,Define wolfish.\n\nThe word is derived from th...,"[-0.11500249058008194, 0.5398070812225342, 1.0..."
5,handy,Define handy.\n\nThe first thing to do is to m...,"[-0.36915266513824463, 0.4476279020309448, 0.5..."
6,crapulous,Define crapulous.\n\nI'm not sure what you mea...,"[-0.7203152179718018, 0.1639937311410904, 0.40..."
7,adsorbable,Define adsorbable.\n\nThe first step in the pr...,"[-0.586689293384552, 0.5018137693405151, 0.577..."
8,abstentious,Define abstentious.\n\nThe word abstentia is d...,"[-0.3722144365310669, 0.1436983048915863, 0.78..."
9,close,Define close.\n\nThe first thing to do is to g...,"[-0.2758944034576416, 0.7169712781906128, -0.0..."


In [None]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn

# === Setup WordNet ===
# Download the corpora needed for the WordNet lookups.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# === Load Baseline Results ===
baseline_pickle = "/content/baseline_definitions_embeddings.pkl"
df_results = pd.read_pickle(baseline_pickle)

# === WordNet Definition Function ===
def get_wordnet_definition(word, pos=None):
    """Look up a reference definition for ``word`` in WordNet.

    Args:
        word: The lemma to look up.
        pos: Optional WordNet part-of-speech tag (for example ``wn.ADJ``) forwarded
            to ``wn.synsets`` to restrict the lookup. The default ``None`` searches
            every part of speech, which is the original behaviour.

    Returns:
        The definition of the first matching synset, or ``None`` when WordNet
        returns no synset for the word.
    """
    synsets = wn.synsets(word, pos=pos)
    # Only the first synset is used. Without `pos` that can be an unintended sense:
    # 'handy', for instance, resolves to a "United States blues musician" entry.
    return synsets[0].definition() if synsets else None

# === Add WordNet Definitions ===
# One lookup per word; the lookup yields None for words without a WordNet entry.
wordnet_definitions = df_results["word"].apply(get_wordnet_definition)
df_results["wordnet_definition"] = wordnet_definitions

# === Save Updated DataFrame ===
df_results.to_pickle("results_with_wordnet.pkl")
print("WordNet definitions added and saved to results_with_wordnet.pkl")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


WordNet definitions added and saved to results_with_wordnet.pkl


In [None]:
import pandas as pd

# Load the results that now include the WordNet reference definitions
df = pd.read_pickle("results_with_wordnet.pkl")

# Preview the first ten rows
df.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding,wordnet_definition
0,adaxial,Define adaxial.\n\nThe adaxial is the part of ...,"[-0.19158275425434113, 1.1911396980285645, 0.2...",nearest to or facing toward the axis of an org...
1,hoggish,Define hoggish.\n\nThe word hoggish is a contr...,"[-0.7336127758026123, 0.6614903807640076, 0.89...",resembling swine; coarsely gluttonous or greedy
2,adsorptive,Define adsorptive.\n\nAdsorptive is a process ...,"[-0.2344677895307541, 0.1635841578245163, 0.83...",having capacity or tendency to adsorb or cause...
3,unreachable,Define unreachable.\n\nThe following example s...,"[-0.25693729519844055, 0.12566016614437103, 0....",inaccessibly located or situated
4,wolfish,Define wolfish.\n\nThe word is derived from th...,"[-0.11500249058008194, 0.5398070812225342, 1.0...",resembling or characteristic (or considered ch...
5,handy,Define handy.\n\nThe first thing to do is to m...,"[-0.36915266513824463, 0.4476279020309448, 0.5...",United States blues musician who transcribed a...
6,crapulous,Define crapulous.\n\nI'm not sure what you mea...,"[-0.7203152179718018, 0.1639937311410904, 0.40...",suffering from excessive eating or drinking
7,adsorbable,Define adsorbable.\n\nThe first step in the pr...,"[-0.586689293384552, 0.5018137693405151, 0.577...",capable of being adsorbed or accumulated on a ...
8,abstentious,Define abstentious.\n\nThe word abstentia is d...,"[-0.3722144365310669, 0.1436983048915863, 0.78...",self-restraining; not indulging an appetite es...
9,close,Define close.\n\nThe first thing to do is to g...,"[-0.2758944034576416, 0.7169712781906128, -0.0...",the temporal end; the concluding time


In [None]:
import pandas as pd

# === Load the saved DataFrame ===
# Contains the generated 'definition' and the 'wordnet_definition' for each word.
df = pd.read_pickle("results_with_wordnet.pkl")

# === Clean the definitions ===
def clean_definition(row):
    """Strip the echoed prompt from a generated definition.

    Args:
        row: A mapping or DataFrame row providing ``"word"`` and ``"definition"``.

    Returns:
        The definition with the leading ``"Define {word}."`` prompt and the
        surrounding whitespace removed. Later occurrences of the prompt text inside
        the generated output are kept. A non-string definition (e.g. a missing
        value) is returned unchanged.
    """
    word = row["word"]
    text = row["definition"]
    if not isinstance(text, str):
        # Nothing to clean; leave missing values for downstream NaN handling.
        return text

    prompt = f"Define {word}."
    text = text.lstrip()
    # Remove only the echoed prompt at the start; a blanket replace() would also
    # delete the same phrase wherever the model happened to repeat it.
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()

# === Apply the cleaning function ===
df["definition_clean"] = df.apply(clean_definition, axis=1)

# === (Optional) View a few rows to check ===
# Jupyter only renders a bare expression when it is the last statement of a cell,
# so the preview has to be displayed explicitly here.
display(df[["word", "definition", "definition_clean"]].head(10))

# === Save the updated DataFrame ===
df.to_pickle("results_with_wordnet_cleaned.pkl")
print("Cleaned definitions saved to results_with_wordnet_cleaned.pkl")


Cleaned definitions saved to results_with_wordnet_cleaned.pkl


In [None]:
import pandas as pd

# Load the results that include the prompt-stripped 'definition_clean' column
df = pd.read_pickle("results_with_wordnet_cleaned.pkl")

# Preview the first ten rows
df.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding,wordnet_definition,definition_clean
0,adaxial,Define adaxial.\n\nThe adaxial is the part of ...,"[-0.19158275425434113, 1.1911396980285645, 0.2...",nearest to or facing toward the axis of an org...,The adaxial is the part of the root that is cl...
1,hoggish,Define hoggish.\n\nThe word hoggish is a contr...,"[-0.7336127758026123, 0.6614903807640076, 0.89...",resembling swine; coarsely gluttonous or greedy,The word hoggish is a contraction of the word ...
2,adsorptive,Define adsorptive.\n\nAdsorptive is a process ...,"[-0.2344677895307541, 0.1635841578245163, 0.83...",having capacity or tendency to adsorb or cause...,Adsorptive is a process in which a substance i...
3,unreachable,Define unreachable.\n\nThe following example s...,"[-0.25693729519844055, 0.12566016614437103, 0....",inaccessibly located or situated,The following example shows how to use the unr...
4,wolfish,Define wolfish.\n\nThe word is derived from th...,"[-0.11500249058008194, 0.5398070812225342, 1.0...",resembling or characteristic (or considered ch...,"The word is derived from the Latin word wolf, ..."
5,handy,Define handy.\n\nThe first thing to do is to m...,"[-0.36915266513824463, 0.4476279020309448, 0.5...",United States blues musician who transcribed a...,The first thing to do is to make sure that the...
6,crapulous,Define crapulous.\n\nI'm not sure what you mea...,"[-0.7203152179718018, 0.1639937311410904, 0.40...",suffering from excessive eating or drinking,"I'm not sure what you mean by ""crapulous"" but ..."
7,adsorbable,Define adsorbable.\n\nThe first step in the pr...,"[-0.586689293384552, 0.5018137693405151, 0.577...",capable of being adsorbed or accumulated on a ...,The first step in the process is to determine ...
8,abstentious,Define abstentious.\n\nThe word abstentia is d...,"[-0.3722144365310669, 0.1436983048915863, 0.78...",self-restraining; not indulging an appetite es...,The word abstentia is derived from the Latin v...
9,close,Define close.\n\nThe first thing to do is to g...,"[-0.2758944034576416, 0.7169712781906128, -0.0...",the temporal end; the concluding time,The first thing to do is to get the data from ...


In [None]:
pip install sentence-transformers




In [None]:
# Export the results to an Excel workbook without the index column.
# NOTE(review): `df` is whatever an earlier cell last bound to that name, presumably
# the cleaned results frame; confirm when running the notebook from a fresh kernel.
df.to_excel("results_with_wordnet_cleaned.xlsx", index=False)


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


In [None]:
# Cleaned generated definitions alongside their WordNet reference definitions.
df = pd.read_pickle("results_with_wordnet_cleaned.pkl")
# Sentence encoder used to score definition pairs. NOTE: this rebinds the global
# `model`, which the baseline cells bind to the causal language model.
model = SentenceTransformer("all-MiniLM-L6-v2")
def get_definition_similarity(row):
    """Score how close a generated definition is to its WordNet reference.

    Args:
        row: A mapping or DataFrame row providing ``"definition_clean"`` and
            ``"wordnet_definition"``.

    Returns:
        ``None`` when either text is missing; otherwise the cosine similarity of
        the two embeddings produced by ``model.encode``.
    """
    generated = row["definition_clean"]
    reference = row["wordnet_definition"]

    if not (pd.isna(generated) or pd.isna(reference)):
        generated_vec = model.encode(generated)
        reference_vec = model.encode(reference)
        # Each vector is wrapped in a list to form a one-row matrix; the result is
        # a 1x1 matrix whose single entry is the score.
        return cosine_similarity([generated_vec], [reference_vec])[0][0]

    return None
# Score every row; rows with a missing definition receive no score.
df["definition_similarity"] = df.apply(get_definition_similarity, axis=1)

# Persist the scored frame.
df.to_pickle("results_with_similarity.pkl")
print("Similarity scores saved to results_with_similarity.pkl")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity scores saved to results_with_similarity.pkl


In [None]:
import pandas as pd

# Load the results that include the 'definition_similarity' score
df = pd.read_pickle("results_with_similarity.pkl")

# Preview the first ten rows
df.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding,wordnet_definition,definition_clean,definition_similarity
0,adaxial,Define adaxial.\n\nThe adaxial is the part of ...,"[-0.19158275425434113, 1.1911396980285645, 0.2...",nearest to or facing toward the axis of an org...,The adaxial is the part of the root that is cl...,0.286537
1,hoggish,Define hoggish.\n\nThe word hoggish is a contr...,"[-0.7336127758026123, 0.6614903807640076, 0.89...",resembling swine; coarsely gluttonous or greedy,The word hoggish is a contraction of the word ...,0.418394
2,adsorptive,Define adsorptive.\n\nAdsorptive is a process ...,"[-0.2344677895307541, 0.1635841578245163, 0.83...",having capacity or tendency to adsorb or cause...,Adsorptive is a process in which a substance i...,0.411917
3,unreachable,Define unreachable.\n\nThe following example s...,"[-0.25693729519844055, 0.12566016614437103, 0....",inaccessibly located or situated,The following example shows how to use the unr...,0.290547
4,wolfish,Define wolfish.\n\nThe word is derived from th...,"[-0.11500249058008194, 0.5398070812225342, 1.0...",resembling or characteristic (or considered ch...,"The word is derived from the Latin word wolf, ...",0.647877
5,handy,Define handy.\n\nThe first thing to do is to m...,"[-0.36915266513824463, 0.4476279020309448, 0.5...",United States blues musician who transcribed a...,The first thing to do is to make sure that the...,-0.008771
6,crapulous,Define crapulous.\n\nI'm not sure what you mea...,"[-0.7203152179718018, 0.1639937311410904, 0.40...",suffering from excessive eating or drinking,"I'm not sure what you mean by ""crapulous"" but ...",0.013861
7,adsorbable,Define adsorbable.\n\nThe first step in the pr...,"[-0.586689293384552, 0.5018137693405151, 0.577...",capable of being adsorbed or accumulated on a ...,The first step in the process is to determine ...,0.368522
8,abstentious,Define abstentious.\n\nThe word abstentia is d...,"[-0.3722144365310669, 0.1436983048915863, 0.78...",self-restraining; not indulging an appetite es...,The word abstentia is derived from the Latin v...,0.215657
9,close,Define close.\n\nThe first thing to do is to g...,"[-0.2758944034576416, 0.7169712781906128, -0.0...",the temporal end; the concluding time,The first thing to do is to get the data from ...,-0.117283


In [None]:
# Mean similarity across the scored rows (Series.mean skips missing values by default).
print("Average similarity:", df["definition_similarity"].mean())


Average similarity: 0.16503696


In [None]:
import pandas as pd

# Load the baseline similarity results.
# NOTE(review): the "(1)" suffix suggests a downloaded/re-uploaded copy; confirm it is
# the intended file. Unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df_baseline = pd.read_pickle("/content/results_with_similarity (1).pkl")

# Preview the first ten rows
df_baseline.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding,wordnet_definition,definition_clean,definition_similarity
0,adaxial,Define adaxial.\n\nThe adaxial is the part of ...,"[-0.19158275425434113, 1.1911396980285645, 0.2...",nearest to or facing toward the axis of an org...,The adaxial is the part of the root that is cl...,0.286537
1,hoggish,Define hoggish.\n\nThe word hoggish is a contr...,"[-0.7336127758026123, 0.6614903807640076, 0.89...",resembling swine; coarsely gluttonous or greedy,The word hoggish is a contraction of the word ...,0.418394
2,adsorptive,Define adsorptive.\n\nAdsorptive is a process ...,"[-0.2344677895307541, 0.1635841578245163, 0.83...",having capacity or tendency to adsorb or cause...,Adsorptive is a process in which a substance i...,0.411917
3,unreachable,Define unreachable.\n\nThe following example s...,"[-0.25693729519844055, 0.12566016614437103, 0....",inaccessibly located or situated,The following example shows how to use the unr...,0.290547
4,wolfish,Define wolfish.\n\nThe word is derived from th...,"[-0.11500249058008194, 0.5398070812225342, 1.0...",resembling or characteristic (or considered ch...,"The word is derived from the Latin word wolf, ...",0.647877
5,handy,Define handy.\n\nThe first thing to do is to m...,"[-0.36915266513824463, 0.4476279020309448, 0.5...",United States blues musician who transcribed a...,The first thing to do is to make sure that the...,-0.008771
6,crapulous,Define crapulous.\n\nI'm not sure what you mea...,"[-0.7203152179718018, 0.1639937311410904, 0.40...",suffering from excessive eating or drinking,"I'm not sure what you mean by ""crapulous"" but ...",0.013861
7,adsorbable,Define adsorbable.\n\nThe first step in the pr...,"[-0.586689293384552, 0.5018137693405151, 0.577...",capable of being adsorbed or accumulated on a ...,The first step in the process is to determine ...,0.368522
8,abstentious,Define abstentious.\n\nThe word abstentia is d...,"[-0.3722144365310669, 0.1436983048915863, 0.78...",self-restraining; not indulging an appetite es...,The word abstentia is derived from the Latin v...,0.215657
9,close,Define close.\n\nThe first thing to do is to g...,"[-0.2758944034576416, 0.7169712781906128, -0.0...",the temporal end; the concluding time,The first thing to do is to get the data from ...,-0.117283


In [None]:
# Mean similarity of the baseline model's definitions.
baseline_mean = df_baseline["definition_similarity"].mean()
print("Baseline mean similarity:", baseline_mean)

Baseline mean similarity: 0.16503696


In [None]:
import pandas as pd

# Load the fine-tuned model's similarity results.
# Unpickling can execute arbitrary code, so only load pickle files that come from a
# trusted source.
df_finetune = pd.read_pickle("/content/finetuned_results_with_similarity.pkl")

# Preview the first ten rows
df_finetune.head(10)  # last expression of the cell, so Jupyter renders it as a table


Unnamed: 0,word,definition,embedding,wordnet_definition,definition_similarity
0,adaxial,and adaxial-to-axial\n\nThe adaxial-to-axial (...,"[0.003112945, -0.09093872, -0.059557665, -0.03...",nearest to or facing toward the axis of an org...,0.371805
1,hoggish,", and the\nlatter is a very good one.\n\n""I am...","[-0.03333829, 0.060048886, -0.008660276, 0.024...",resembling swine; coarsely gluttonous or greedy,0.015714
2,adsorptive,", and the\nadhesion of the particles to the su...","[-0.013217527, -0.073710896, 0.061888024, 0.02...",having capacity or tendency to adsorb or cause...,0.524997
3,unreachable,code.\n\nThe following example shows how to us...,"[-0.008182717, 0.008570292, -0.10269752, 0.007...",inaccessibly located or situated,0.091159
4,wolfish,", but I'm not sure I'm going to be able to get...","[-0.059655532, 0.0042584674, -0.03385436, -0.0...",resembling or characteristic (or considered ch...,-0.029516
5,handy,", and you can use it to make a\nvery useful to...","[-0.013372629, -0.03681865, -0.118486434, -0.0...",United States blues musician who transcribed a...,-0.013047
6,crapulous,",\n\nI'm not going to say that I'm not going t...","[0.015438637, 0.026618456, 0.023370242, -0.042...",suffering from excessive eating or drinking,0.090278
7,adsorbable,to the surface of the surface of the surface o...,"[-0.033868477, 0.010993193, 0.03371046, -0.024...",capable of being adsorbed or accumulated on a ...,0.332096
8,abstentious,", and the\nrest, the more they are to be kept ...","[0.01224741, 0.034539957, -0.06593984, 0.01921...",self-restraining; not indulging an appetite es...,0.204386
9,close,The first thing to do is to get the data.\n\nT...,"[-0.023487054, 0.02879343, -0.060848415, -0.01...",the temporal end; the concluding time,0.01293


In [None]:
# Mean similarity of the fine-tuned model's definitions. The label names the
# fine-tuned model so this figure cannot be mistaken for the baseline mean.
df_finetune_mean = df_finetune["definition_similarity"].mean()
print("Fine-tuned mean similarity:", df_finetune_mean)

Baseline mean similarity: 0.16948967


In [None]:
# Difference of the two overall means (fine-tuned minus baseline). Each mean is taken
# over its own frame, so this is not a per-word paired comparison.
print(f"Improvement: {df_finetune_mean - baseline_mean:.4f}")

Improvement: 0.0045


In [None]:
import pandas as pd

# Load both dataframes
df_base = pd.read_pickle("/content/results_with_similarity (1).pkl")
df_finetune = pd.read_pickle("/content/finetuned_results_with_similarity.pkl")

# Keep only rows that have a definition and a similarity score, and keep a single row
# per word so that a repeated word cannot multiply rows in the merge and skew the
# percentage computed from it.
required_columns = ["definition", "definition_similarity"]
df_base = df_base.dropna(subset=required_columns).drop_duplicates(subset="word")
df_finetune = df_finetune.dropna(subset=required_columns).drop_duplicates(subset="word")

# Pair baseline and fine-tuned rows by word (inner join: words present in both frames).
# Columns that exist in both frames get the "_baseline" / "_finetuned" suffixes.
df_compare = pd.merge(df_base, df_finetune, on="word", suffixes=("_baseline", "_finetuned"))

# Per-word similarity gain (fine-tuned minus baseline), computed once.
df_compare["similarity_gain"] = (
    df_compare["definition_similarity_finetuned"]
    - df_compare["definition_similarity_baseline"]
)
# "gain" carries the same values and is kept so both column names remain available.
df_compare["gain"] = df_compare["similarity_gain"]

# Percentage of paired words whose similarity increased (ties count as not improved)
percent_improved = (df_compare["gain"] > 0).mean() * 100

# Print result
print(f"Words with improved similarity after fine-tuning: {percent_improved:.2f}%")



Words with improved similarity after fine-tuning: 50.82%
