In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Load Word List from Excel ===
# Read the spreadsheet and keep only the purely alphabetic entries of its 'Word' column.
excel_path = "/content/Words.xlsx"
df_words = pd.read_excel(excel_path)

# Missing cells are dropped first; each remaining value is stringified and kept
# only when str.isalpha() accepts it.
word_list = list(filter(str.isalpha, map(str, df_words['Word'].dropna())))

# === Load SmolLM2-135M ===
# The same checkpoint id is used for both the causal LM and its tokenizer.
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() puts the module in evaluation mode and returns the module itself.
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

def get_definition(word):
    """Generate text for the prompt ``Define {word}.`` with the loaded causal LM.

    Uses the notebook-level ``tokenizer`` and ``model``. At most 50 new tokens
    are generated, and the first returned sequence is decoded with special
    tokens removed.

    Returns:
        The decoded text as a string.
    """
    encoded = tokenizer(f"Define {word}.", return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=50)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_embedding(word):
    """Return a pooled hidden-state vector for ``word`` as a NumPy array.

    The word is tokenized on its own and passed through ``model.model`` with
    gradient tracking disabled. The last hidden state is averaged over dim 1
    (presumably the token axis - confirm), and size-1 dimensions are squeezed
    away before conversion to NumPy.
    """
    encoded = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        hidden = model.model(**encoded).last_hidden_state
    pooled = hidden.mean(dim=1)
    return pooled.squeeze().numpy()


# === Generate Baseline Definitions and Embeddings ===
# A word whose generation or embedding raises is reported and skipped, so
# `results` can end up shorter than `word_list`.
results = []
for word in word_list:
    try:
        definition = get_definition(word)
        embedding = get_embedding(word)
        record = {
            "word": word,
            "definition": definition,
            # stored as a plain Python list rather than a NumPy array
            "embedding": embedding.tolist(),
        }
    except Exception as e:
        print(f"Failed for word '{word}': {e}")
        continue
    results.append(record)

# === Save Results ===
# One row per collected record, pickled into the working directory.
df_results = pd.DataFrame(data=results)
df_results.to_pickle(path="baseline_definitions_embeddings.pkl")
print("Baseline results saved to baseline_definitions_embeddings.pkl")


In [None]:
pip install nltk


In [None]:
import nltk
from nltk.corpus import wordnet as wn

# Fetch the corpora used by the WordNet lookups.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)


In [None]:
import pandas as pd

# Load the baseline results written by the generation cell
df = pd.read_pickle(filepath_or_buffer="baseline_definitions_embeddings.pkl")

# Preview the first ten rows
df.head(n=10) # or use display(df.head()) in Jupyter


In [None]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn

#  WordNet
# Make sure the WordNet corpora are available before any lookup.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Baseline Results
# Read the pickle from the working directory - the same relative location the
# baseline cell writes it to - instead of a hard-coded /content/ path, so the
# notebook reads its own output wherever it is run.
df_results = pd.read_pickle("baseline_definitions_embeddings.pkl")

#  WordNet Definition
def get_wordnet_definition(word):
    """Return the definition of the first WordNet synset found for ``word``.

    Returns None when ``wn.synsets`` yields no synsets for the word.
    """
    matches = wn.synsets(word)
    if not matches:
        return None
    return matches[0].definition()

#  Adding WordNet Definitions
# Look up every word, store the result in a new column, then persist the frame.
df_results["wordnet_definition"] = df_results["word"].map(get_wordnet_definition)
df_results.to_pickle(path="results_with_wordnet.pkl")
print("WordNet definitions added and saved to results_with_wordnet.pkl")


In [None]:
import pandas as pd

# Load the results that now include the WordNet definitions
df = pd.read_pickle(filepath_or_buffer="results_with_wordnet.pkl")

# Preview the first ten rows
df.head(n=10) # or use display(df.head()) in Jupyter


In [None]:
import pandas as pd
# Reload the WordNet-augmented results so this cell does not depend on earlier state.
df = pd.read_pickle(filepath_or_buffer="results_with_wordnet.pkl")

def clean_definition(row):
    """Strip the echoed prompt from a generated definition.

    Args:
        row: Mapping with "word" and "definition" entries (e.g. a DataFrame
            row passed by ``df.apply(..., axis=1)``).

    Returns:
        The definition with every occurrence of ``"Define {word}."`` removed
        and surrounding whitespace trimmed. A non-string definition (None or
        NaN) is returned unchanged, so callers can treat it as missing
        instead of hitting an AttributeError here.
    """
    word = row["word"]
    text = row["definition"]
    # Missing definitions have no .replace(); pass them through untouched.
    if not isinstance(text, str):
        return text
    return text.replace(f"Define {word}.", "").strip()
df["definition_clean"] = df.apply(clean_definition, axis=1)

df.to_pickle("results_with_wordnet_cleaned.pkl")
print("Cleaned definitions saved to results_with_wordnet_cleaned.pkl")

# Preview goes last: a notebook only renders the final expression of a cell,
# so placed before the save it was evaluated and silently discarded.
df[["word", "definition", "definition_clean"]].head(10)


In [None]:
import pandas as pd

# Reload the cleaned results and preview the first ten rows
df = pd.read_pickle(filepath_or_buffer="results_with_wordnet_cleaned.pkl")

df.head(n=10)


In [None]:
pip install sentence-transformers


In [None]:
# Export whatever frame is currently bound to `df` as a spreadsheet, without the index column.
df.to_excel(excel_writer="results_with_wordnet_cleaned.xlsx", index=False)


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


In [None]:
df = pd.read_pickle("results_with_wordnet_cleaned.pkl")

# Keep the sentence encoder under its own name. Binding it to `model` would
# overwrite the SmolLM2 causal LM that get_definition/get_embedding read from
# the notebook namespace, breaking those cells if they are re-run afterwards.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")


def get_definition_similarity(row):
    """Cosine similarity between a generated and a WordNet definition.

    Args:
        row: Mapping with "definition_clean" and "wordnet_definition" entries
            (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        The cosine similarity of the two sentence embeddings, or None when
        either definition is missing.
    """
    def1 = row["definition_clean"]
    def2 = row["wordnet_definition"]

    if pd.isna(def1) or pd.isna(def2):
        return None

    emb1 = sentence_model.encode(def1)
    emb2 = sentence_model.encode(def2)

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    return cosine_similarity([emb1], [emb2])[0][0]


df["definition_similarity"] = df.apply(get_definition_similarity, axis=1)

df.to_pickle("results_with_similarity.pkl")
print("Similarity scores saved to results_with_similarity.pkl")


In [None]:
import pandas as pd

# Load the results that include the similarity scores
df = pd.read_pickle(filepath_or_buffer="results_with_similarity.pkl")

df.head(n=10)

In [None]:
# Mean of the per-word scores; missing scores are skipped (skipna is the default, spelled out here).
print("Average similarity:", df["definition_similarity"].mean(skipna=True))


In [None]:
import pandas as pd

# Load the baseline similarity results
# NOTE(review): this reads an absolute path with a " (1)" suffix rather than the
# "results_with_similarity.pkl" written by the similarity cell - confirm that it
# is the intended baseline file.
df_baseline = pd.read_pickle(filepath_or_buffer="/content/results_with_similarity (1).pkl")

df_baseline.head(n=10)


In [None]:
# Average baseline score over all rows of the similarity column.
baseline_mean = df_baseline.loc[:, "definition_similarity"].mean()
print("Baseline mean similarity:", baseline_mean)

# Loading the PPO fine-tuned results for comparison with the baseline
A separate file is used for PPO training, where the model was fine-tuned to generate new definitions and corresponding embeddings. The resulting .pkl file is then used here for comparison.

In [None]:
import pandas as pd

# Load the similarity results produced for the fine-tuned model
df_finetune = pd.read_pickle(filepath_or_buffer="/content/finetuned_results_with_similarity.pkl")

# Preview the first ten rows
df_finetune.head(n=10) # or use display(df.head()) in Jupyter


In [None]:
# Average score of the fine-tuned definitions over all rows of the similarity column.
df_finetune_mean = df_finetune.loc[:, "definition_similarity"].mean()
print("mean similarity finetune vs wordnet(binary Binary Reward (threshold = 0.85)):", df_finetune_mean)

In [None]:
# Difference of the two means; a positive value means the fine-tuned model scored higher on average.
mean_gain = df_finetune_mean - baseline_mean
print(f"Improvement: {mean_gain:.4f}")

In [None]:
import pandas as pd

# Loading both dataframes
# NOTE(review): the baseline path carries a " (1)" suffix and both paths are
# absolute - confirm these are the intended result files.
df_base = pd.read_pickle("/content/results_with_similarity (1).pkl")
df_finetune = pd.read_pickle("/content/finetuned_results_with_similarity.pkl")

# Only rows that have both a generated definition and a score can be compared.
required_columns = ["definition", "definition_similarity"]
df_base = df_base.dropna(subset=required_columns)
df_finetune = df_finetune.dropna(subset=required_columns)

# Inner join: keep only the words present in both result sets. Overlapping
# column names receive the _baseline / _finetuned suffixes.
# NOTE(review): a word that appears more than once on either side is paired
# with every match on the other side - confirm words are unique.
df_compare = pd.merge(
    df_base,
    df_finetune,
    how="inner",
    on="word",
    suffixes=("_baseline", "_finetuned"),
)

# Compute the per-word change once. "gain" held the same subtraction a second
# time; it is kept as an alias so both column names remain available.
df_compare["similarity_gain"] = (
    df_compare["definition_similarity_finetuned"]
    - df_compare["definition_similarity_baseline"]
)
df_compare["gain"] = df_compare["similarity_gain"]

# Share of compared words whose score went up, as a percentage.
percent_improved = (df_compare["gain"] > 0).mean() * 100

# Print result
print(f"Words with improved similarity after fine-tuning: {percent_improved:.2f}%")

