In [None]:
import pandas as pd
import spacy
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm

# spaCy pipeline; used below to tokenize each sentence so target words can be
# matched token by token.
nlp = spacy.load("en_core_web_sm")

# Sentence-level encoder; every sentence and definition embedding in this
# script is produced by this one model (see get_embedding).
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Extra word -> meaning definitions read from the 'word' and 'meaning' columns
# of the Excel sheet. Both columns are cast to str; if a word appears on more
# than one row, the last row wins because later dict entries overwrite earlier ones.
# NOTE(review): astype(str) presumably turns empty cells into the literal
# string 'nan' -- confirm the sheet has no blank cells.
additional_dict_path = 'QanonMeaning.xlsx'
additional_dict_df = pd.read_excel(additional_dict_path)
additional_dict = dict(zip(additional_dict_df['word'].astype(str), additional_dict_df['meaning'].astype(str)))

def get_embedding(text):
    """Return the SentenceTransformer embedding of a single piece of text."""
    # The encoder is called with a one-element batch; hand back that lone row.
    encoded_batch = sbert_model.encode([text])
    return encoded_batch[0]

def get_sense_embeddings(word, timeout=10):
    """Collect candidate sense definitions for ``word`` with their embeddings.

    Senses are merged from two sources:

    * the module-level ``additional_dict`` (exact, case-sensitive key match);
    * the first entry returned by the dictionaryapi.dev endpoint for ``word``.

    The API is always queried, even when ``additional_dict`` already supplied
    a definition, so both kinds of sense compete during disambiguation.

    Args:
        word: The word to look up.
        timeout: Seconds to wait for the dictionary API before giving up.

    Returns:
        A dict mapping each definition text to its embedding. If the API is
        unreachable, answers with a non-200 status, or returns a payload of
        an unexpected shape, only the ``additional_dict`` sense (if any) is
        returned; the dict may therefore be empty.
    """
    from urllib.parse import quote

    sense_embeddings = {}

    # Sense supplied by the additional (Excel-backed) dictionary, if present.
    if word in additional_dict:
        additional_definition = additional_dict[word]
        sense_embeddings[additional_definition] = get_embedding(additional_definition)

    # Percent-encode the word so characters such as '/', '?' or spaces cannot
    # alter the request path.
    encoded_word = quote(str(word), safe="")
    url = f"https://api.dictionaryapi.dev/api/v2/entries/en/{encoded_word}"

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like a non-200 answer: no API senses.
        return sense_embeddings

    if response.status_code != 200:
        return sense_embeddings

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return sense_embeddings

    # Only the first entry of the response is used.
    first_entry = data[0] if isinstance(data, list) and data else None
    if not isinstance(first_entry, dict):
        return sense_embeddings

    for meaning in first_entry.get('meanings', []):
        for definition in meaning.get('definitions', []):
            sense_text = definition.get('definition')
            if sense_text:
                sense_embeddings[sense_text] = get_embedding(sense_text)

    return sense_embeddings

def disambiguate_word(sentence, target_word_regex, additional_words_meanings):
    """Pick the best-matching sense for every target word found in a sentence.

    The sentence is tokenized with spaCy and every token whose text matches
    ``target_word_regex`` at its start (case-insensitively) is a target. The
    context vector is the mean of the sentence embedding and the embeddings
    of all values in ``additional_words_meanings``. For each target, the
    sense whose embedding has the highest cosine similarity to that context
    vector wins.

    Args:
        sentence: The text to analyse.
        target_word_regex: Regex pattern string selecting the target tokens.
        additional_words_meanings: Mapping whose values are extra meaning
            texts to blend into the context vector.

    Returns:
        ``(None, None)`` when no token matches. Otherwise a pair of dicts
        keyed by the matched token text: the chosen definition, and its
        cosine similarity. A word for which no sense was available maps to
        ``None`` and ``-1`` respectively.
    """
    doc = nlp(sentence)

    # Compile once instead of re-resolving the pattern for every token.
    target_pattern = re.compile(target_word_regex, re.IGNORECASE)

    # dict.fromkeys de-duplicates while keeping first-seen order, so a token
    # that occurs several times triggers one sense lookup (one HTTP request)
    # instead of one per occurrence. The result dicts are keyed by token text
    # anyway, so the output is unchanged.
    target_words = list(dict.fromkeys(
        token.text for token in doc if target_pattern.match(token.text)
    ))

    if not target_words:
        return None, None

    # Context = mean of the sentence embedding and the extra meanings' embeddings.
    context_embeddings = [get_embedding(sentence)]
    context_embeddings.extend(
        get_embedding(meaning) for meaning in additional_words_meanings.values()
    )
    context_embedding = sum(context_embeddings) / len(context_embeddings)

    best_senses = {}
    best_similarities = {}
    for word in target_words:
        # -1 is the "nothing found yet" sentinel; a sense must beat it strictly.
        max_similarity = -1
        best_sense = None
        for definition, embedding in get_sense_embeddings(word).items():
            similarity = cosine_similarity([context_embedding], [embedding])[0][0]
            if similarity > max_similarity:
                max_similarity = similarity
                best_sense = definition

        best_senses[word] = best_sense
        best_similarities[word] = max_similarity

    return best_senses, best_similarities

# Load the sentences to analyse from the Excel file.
file_path = 'execute.xlsx'
df = pd.read_excel(file_path)

# Regex pattern to match all forms of "execute" (matched per token,
# case-insensitively, inside disambiguate_word).
word_regex = r'execut\w*'

# One entry per row of df['dummy_text'], in row order.
results = []
similarities = []
for sentence in tqdm(df['dummy_text'], desc="Processing sentences"):
    # Cells may hold non-string values; normalise to str.
    sentence = str(sentence)

    # Extra dictionary words present in the sentence. This is a plain,
    # case-sensitive substring test, not a whole-word match.
    additional_words_meanings = {
        word: meaning
        for word, meaning in additional_dict.items()
        if word in sentence
    }

    best_senses, best_similarities = disambiguate_word(sentence, word_regex, additional_words_meanings)

    # Keep only the target words for which a sense was actually chosen.
    meanings = [f"{word}: {sense}" for word, sense in (best_senses or {}).items() if sense]

    if meanings:
        execute_meaning = "; ".join(meanings)
        max_similarity = max(best_similarities.values())
    else:
        # Either no target word matched, or none of the matched words had any
        # sense available. In the latter case the similarities are only the
        # internal -1 sentinel, which must not be reported as a real score.
        execute_meaning = "No sense found"
        max_similarity = None

    results.append(execute_meaning)
    similarities.append(max_similarity)

# Add the results to the DataFrame
df['execute_meaning'] = results
df['cosine_similarity'] = similarities

# Save the results to a new Excel file
output_file_path = 'execute_meaningsAlongWithCosineSimilarity.xlsx'
df.to_excel(output_file_path, index=False)