In [1]:
import os
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
# English stop words, materialised once as a set so the per-token membership
# test in preprocess_text is a cheap hash lookup.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed
# locally (e.g. via nltk.download('stopwords')) — confirm on a fresh kernel.
stop_words = set(stopwords.words('english'))

In [3]:
# Read every '.txt' file from a folder
def read_data_from_folder(folder_path):
    """Return the contents of all ``.txt`` files directly inside ``folder_path``.

    Files are read in sorted filename order so that the resulting corpus (and
    anything trained on it) is the same on every run; ``os.listdir`` alone
    gives no ordering guarantee.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of strings, one per ``.txt`` file, decoded as UTF-8.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for file in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, file)
        # Skip sub-directories that merely happen to be named '*.txt';
        # opening them would raise instead of being ignored.
        if not (file.endswith('.txt') and os.path.isfile(full_path)):
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Normalise raw text into a list of content tokens
def preprocess_text(text):
    """Lower-case ``text``, strip punctuation and drop stop words.

    Every run of non-word characters is collapsed to a single space, the
    result is split on whitespace, and tokens present in the module-level
    ``stop_words`` set are discarded.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens, in their original order.
    """
    lowered = text.lower()
    cleaned = re.sub(r'\W+', ' ', lowered)

    kept = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Train the Word2Vec model
def train_word2vec_model(sentences):
    """Train a Word2Vec model on pre-tokenised sentences.

    The corpus is handed to the ``Word2Vec`` constructor, which builds the
    vocabulary and runs training itself. The epoch count is therefore set on
    the constructor; a separate ``model.train(...)`` call afterwards would
    train the already-trained model a second time.

    Args:
        sentences: Iterable of token lists (one list per sentence/document).

    Returns:
        The trained ``Word2Vec`` model (100-dimensional vectors, window 5,
        ``min_count=1``, 10 epochs, 4 worker threads).
    """
    return Word2Vec(
        sentences,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
        epochs=10,
    )

# Find related words and bigrams
def find_related_words_and_bigrams(model, phraser, words_and_bigrams, top_n=20):
    """Print the nearest neighbours of each query word or bigram.

    A query containing whitespace is treated as a phrase: it is split into
    tokens and passed through ``phraser``. If the phraser leaves the tokens
    unchanged, the phrase was never learned and a "not found" line is printed.
    Otherwise the phraser's merged token is used as the vocabulary key.

    The similarity lookup happens *before* the header is printed, so a missing
    key produces only the "not found" line rather than an empty result header.

    Args:
        model: Object exposing ``wv.most_similar(key, topn=...)`` that raises
            ``KeyError`` for unknown keys.
        phraser: Object whose ``phraser[tokens]`` returns the phrase-merged
            token sequence.
        words_and_bigrams: Iterable of query strings; multi-word queries are
            whitespace separated.
        top_n: Number of neighbours to print per query.

    Returns:
        None. Results are written to stdout.
    """
    for word_or_bigram in words_and_bigrams:
        # split() (no argument) ignores repeated/leading/trailing whitespace,
        # which split(' ') would turn into empty tokens.
        tokens = word_or_bigram.split()

        if len(tokens) > 1:
            merged = list(phraser[tokens])
            if merged == tokens:
                print(f"Bigram '{word_or_bigram}' not found in the model.")
                continue
            # Prefer the token the phraser actually produced; fall back to the
            # conventional '_' join when it only merged part of the query.
            key = merged[0] if len(merged) == 1 else '_'.join(tokens)
        else:
            key = tokens[0] if tokens else word_or_bigram

        label = key.replace('_', ' ')
        try:
            neighbours = model.wv.most_similar(key, topn=top_n)
        except KeyError:
            print(f"Word or bigram '{label}' not found in the model.")
            continue

        print(f"\nRelated words for '{label}':")
        for word, similarity in neighbours:
            print(f"{word.replace('_', ' ')}: {similarity}")


def train_model_from_folder(folder_path, max_sentence_len=10000):
    """Build a bigram phraser and a Word2Vec model from a folder of text files.

    Each file is read, preprocessed into one token list, and used to learn
    bigram phrases. The phrased documents are then cut into consecutive
    chunks of at most ``max_sentence_len`` tokens before training: every file
    is a single very long "sentence", and gensim's optimized training code
    only consumes the first 10000 tokens of any one text, so without chunking
    the tail of long documents would never be trained on.

    Args:
        folder_path: Directory containing the ``.txt`` corpus.
        max_sentence_len: Maximum tokens per training sentence. Must be a
            positive integer; the default matches gensim's internal limit.

    Returns:
        A ``(model, phraser)`` tuple: the trained Word2Vec model and the
        phraser used to merge bigrams.
    """
    texts = read_data_from_folder(folder_path)
    sentences = [preprocess_text(text) for text in texts]

    # Learn bigrams on the full documents so phrases are never split by the
    # chunking below.
    phrases = Phrases(sentences, min_count=1, threshold=10)
    phraser = Phraser(phrases)
    bigram_sentences = [list(phraser[sentence]) for sentence in sentences]

    # Cut long documents into pieces the trainer will consume in full.
    # Documents that are empty after preprocessing contribute no chunks.
    training_sentences = [
        document[start:start + max_sentence_len]
        for document in bigram_sentences
        for start in range(0, len(document), max_sentence_len)
    ]

    model = train_word2vec_model(training_sentences)
    return model, phraser


climate: [('cc', 0.6040445566177368), ('behavioural', 0.5667223334312439), ('transformational', 0.5275135636329651), ('systemic', 0.5159832239151001), ('behavioral', 0.5061660408973694), ('climatic', 0.48612093925476074), ('theory', 0.4787351191043854), ('snrm', 0.44460588693618774), ('changes14', 0.442379355430603), ('behaviour', 0.43425530195236206), ('cis', 0.4311087429523468), ('adapt', 0.42795413732528687), ('glof', 0.41736578941345215), ('transformative', 0.41529393196105957), ('livelihoods', 0.41375717520713806), ('glofs', 0.41179803013801575), ('combine', 0.40862709283828735), ('catalysts', 0.406525194644928), ('strategies', 0.40532392263412476), ('agriculture', 0.40394705533981323), ('compounding', 0.39950883388519287), ('drr', 0.39924854040145874), ('mitigating', 0.3978663384914398), ('innovativedevelopment', 0.3872810900211334), ('mitigate', 0.38679689168930054), ('disaster', 0.38476285338401794), ('coastal', 0.3845866918563843), ('adaptability', 0.38023024797439575), ('adap

In [None]:
# Corpus location, given as a relative folder name.
folder_path = 'txt'
# Train once and keep both results: the query cell below passes the model and
# the phraser to find_related_words_and_bigrams.
model, phraser = train_model_from_folder(folder_path)

In [None]:
# Query terms: single words plus one space-separated bigram ('climate change').
input_words_and_bigrams = ['climate', 'climate change', 'objectives', 'objective']
# Prints the nearest neighbours of each term; top_n is left at its default of 20.
find_related_words_and_bigrams(model, phraser, input_words_and_bigrams)