In [16]:
import os
import re
import time
import pickle
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser


def read_data_from_folder(folder_path):
    texts = []
    for file in os.listdir(folder_path):
        if file.endswith('.txt'):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts


def preprocess_text(text):
    return re.sub(r'\W+', ' ', text.lower()).split()


def train_word2vec_model(sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    model.train(sentences, total_examples=len(sentences), epochs=10)
    return model


def find_related_words_and_bigrams(model, phraser, words_and_bigrams, top_n=30):
    related_words_dict = {}
    for word_or_bigram in words_and_bigrams:
        if ' ' in word_or_bigram:
            splitted = word_or_bigram.split(' ')
            if phraser[splitted] == splitted:
                print(f"Bigram '{word_or_bigram}' not found in the model.")
                continue
            else:
                word_or_bigram = '_'.join(splitted)
        try:
            related_words = model.wv.most_similar(word_or_bigram, topn=top_n)
            related_words_dict[word_or_bigram.replace('_', ' ')] = related_words
        except KeyError:
            print(f"Word or bigram '{word_or_bigram.replace('_', ' ')}' not found in the model.")
    return related_words_dict


def train_model_from_folder(folder_path):
    texts = read_data_from_folder(folder_path)
    sentences = [preprocess_text(text) for text in texts]

    phrases = Phrases(sentences, min_count=1, threshold=10)
    phraser = Phraser(phrases)
    bigram_sentences = [phraser[sentence] for sentence in sentences]

    model = train_word2vec_model(bigram_sentences)
    return model, phraser


def save_to_csv(related_words_dict, file_name='output.tsv'):
    df = pd.DataFrame(related_words_dict)
    for col in df.columns:
        df[col] = df[col].apply(lambda x: f"{x[0].replace('_', ' ')}: {x[1]:.4f}")
    df.to_csv(file_name, index=False, sep='\t', encoding='utf-8')


def save_model(model, phraser, folder_path='./models'):
    os.makedirs(folder_path, exist_ok=True)
    model.save(os.path.join(folder_path, 'word2vec.model'))
    with open(os.path.join(folder_path, 'phraser.pkl'), 'wb') as f:
        pickle.dump(phraser, f)






In [18]:
def main():
    folder_path = "" # ENTER THE FOLDER PATH WITH THE TXT FILES THAT YOU WANT TO USE TO TRAIN W2V
    model, phraser = train_model_from_folder(folder_path)
    save_model(model, phraser)

    input_words = ("").lower().strip().split(',') # ENTER WORDS OR BIGRAMS SEPARATED BY COMMAS
    input_words = [word.strip() for word in input_words]

    related_words_dict = find_related_words_and_bigrams(model, phraser, input_words)
    save_to_csv(related_words_dict, file_name='output.tsv')
        # Print related words
    print("\nRelated words and bigrams:")
    for key, value in related_words_dict.items():
        print(f"{key}:")
        for word, similarity in value:
            print(f"  {word.replace('_', ' ')}: {similarity:.4f}")
        print()


if __name__ == '__main__':
    main()

FileNotFoundError: [WinError 3] The system cannot find the path specified: ''