# In [22]:
import requests
import pandas as pd

# Read the seed vocabulary from disk (one CSV column per category)
word_dict_path = 'expanded_word_dict_1.csv'
word_dict_df = pd.read_csv(word_dict_path, encoding='utf-8')

# Build {column name: list of that column's non-missing values}
seed_words = {
    name: column.dropna().tolist()
    for name, column in word_dict_df.items()
}

# Helper: run one Datamuse query and return the words it yields
def _query_datamuse(relation, word, limit, timeout=10):
    """Return up to ``limit`` words Datamuse returns for ``relation=word``.

    Args:
        relation: Datamuse query parameter name (e.g. ``"ml"`` or ``"rel_syn"``).
        word: The seed word to look up.
        limit: Maximum number of words to request and to keep.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of words in the order the API returned them; an empty list
        when the response status is not 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Passing the query through ``params`` lets requests URL-encode the word,
    # so multi-word phrases and characters such as '&' or '#' are sent intact.
    # The timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(
        "https://api.datamuse.com/words",
        params={relation: word, "max": limit},
        timeout=timeout,
    )
    if response.status_code != 200:
        return []
    return [entry['word'] for entry in response.json()[:limit]]


# Function to expand words using Datamuse API, retrieving only top N similar words for each seed word
def expand_words_with_datamuse(seed_words, top_n_similar=5):
    """Expand every category's seed words with related words from Datamuse.

    For each seed word, two queries are made: words similar in meaning
    (``ml``) and synonyms (``rel_syn``), each limited to ``top_n_similar``
    results.

    Args:
        seed_words: Mapping of category name to a list of seed words.
        top_n_similar: Maximum number of words kept per query. When it is
            zero or negative, no requests are made and only the
            de-duplicated seed words are returned.

    Returns:
        A dict mapping each category to a duplicate-free list: the seed
        words in their original order, followed by the retrieved words in
        the order they were first seen.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    expanded_words = {}
    for category, words in seed_words.items():
        # Dict keys act as an insertion-ordered set, so the output order is
        # reproducible between runs (a plain set of strings is not).
        collected = dict.fromkeys(words)
        if top_n_similar > 0:
            # Iterate over a snapshot of the unique seed words: each one is
            # queried once, and words added below are not themselves expanded.
            for word in list(collected):
                for relation in ("ml", "rel_syn"):
                    related = _query_datamuse(relation, word, top_n_similar)
                    collected.update(dict.fromkeys(related))
        expanded_words[category] = list(collected)
    return expanded_words

# Expand seed words using the Datamuse API with each word retrieving only the top N similar words
expanded_seed_words = expand_words_with_datamuse(seed_words, top_n_similar=2)

# Combine original and expanded words for each category
final_words = {}
for category, words in seed_words.items():
    # dict.fromkeys() drops duplicates while keeping first-seen order (the
    # original words first). A set would shuffle the words differently on each
    # run because string hashing is randomized per process, making the CSV
    # output non-reproducible.
    combined_words = list(dict.fromkeys(words + expanded_seed_words[category]))
    final_words[category] = combined_words

# Create a DataFrame with the combined words; wrapping each list in a Series
# lets categories of different lengths share one frame (shorter columns are
# padded with NaN)
final_df = pd.DataFrame({k: pd.Series(v) for k, v in final_words.items()})

# Save the combined words to a new CSV file (the input CSV is left untouched);
# the utf-8-sig encoding prepends a byte-order mark to the file
final_df.to_csv('expanded_word_dict_2.csv', index=False, encoding='utf-8-sig')
