In [52]:
import gensim
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [53]:
def load_stop(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        file_path: Path of the stopword file to read.

    Returns:
        list[str]: The lines of the file, without their line endings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

In [54]:
def load_stopwords(file_path, encoding='utf-8'):
    """Read a stopword file and return its lines as a list.

    The file is decoded explicitly instead of relying on the platform's
    default encoding, so the result is the same on every machine.

    Args:
        file_path: Path of the stopword file to read.
        encoding: Text encoding of the file. Defaults to ``'utf-8'``.

    Returns:
        list[str]: The lines of the file, without their line endings.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read().splitlines()

def custom_tokenizer(text, stopwords_list=None):
    """Tokenize ``text`` and drop every token that is a stopword.

    Args:
        text: The string to tokenize.
        stopwords_list: Optional iterable of stopwords to filter out. When
            omitted, the list is read from ``./stopwords-en.txt``. Passing a
            pre-loaded list avoids re-reading the file on every call, for
            example when this function is applied to each row of a frame.

    Returns:
        list[str]: The tokens, in their original order and casing, whose
        lower-cased form is not a stopword.
    """
    if stopwords_list is None:
        stopwords_list = load_stop("./stopwords-en.txt")
    # A set makes each membership test O(1) instead of scanning the list.
    stopword_set = set(stopwords_list)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords (tokens are lower-cased for the comparison only)
    return [token for token in tokens if token.lower() not in stopword_set]

def identify_pos(tokens):
    """Return the tokens whose part-of-speech tag starts with VB, JJ or NN.

    Args:
        tokens: List of word tokens to tag with ``pos_tag``.

    Returns:
        list[str]: The matching words, in their original order.
    """
    wanted_prefixes = ('VB', 'JJ', 'NN')
    return [
        token
        for token, label in pos_tag(tokens)
        if label.startswith(wanted_prefixes)
    ]


In [102]:
# Load the spreadsheet, then keep only the "Combined" column (still as a
# one-column DataFrame because of the double brackets).
df = pd.read_excel("./dataset/pos.xlsx")
df = df[["Combined"]]
# Bare expression: shown as the cell's output.
df

Unnamed: 0,Combined
0,Description : Process the deletion of a single...
1,Description : Process the deletion of multiple...
2,Description : Process the deletion of a record...
3,Description : Process the deletion of a record...
4,Description : Process the deletion of a record...
...,...
8163,Description Verify that the article summary is...
8164,Description Verify that the article summary is...
8165,Description Verify that the article summary is...
8166,Description Verify that the article summary is...


In [71]:
def process_row(row):
    """Store the filtered, POS-selected words of a row under 'Imp Words'.

    The text in ``row['Description']`` is passed through ``custom_tokenizer``
    and then ``identify_pos``; the resulting list is written to
    ``row['Imp Words']``.

    Args:
        row: A mapping-like row that has a 'Description' entry.

    Returns:
        The same ``row`` object, modified in place.
    """
    row['Imp Words'] = identify_pos(custom_tokenizer(row['Description']))
    return row

In [103]:
# Apply gensim's simple_preprocess to every "Combined" entry, turning each
# text into a list of tokens.
# NOTE(review): this rebinds `df` from a DataFrame to a Series, so re-running
# this cell on its own will presumably fail on `df.Combined` — re-run the
# load cell first.
df = df.Combined.apply(gensim.utils.simple_preprocess)
df

0       [description, process, the, deletion, of, sing...
1       [description, process, the, deletion, of, mult...
2       [description, process, the, deletion, of, reco...
3       [description, process, the, deletion, of, reco...
4       [description, process, the, deletion, of, reco...
                              ...                        
8163    [description, verify, that, the, article, summ...
8164    [description, verify, that, the, article, summ...
8165    [description, verify, that, the, article, summ...
8166    [description, verify, that, the, article, summ...
8167    [description, verify, that, the, article, summ...
Name: Combined, Length: 8168, dtype: object

In [86]:
# Drop the "Description" column only when `df` is a DataFrame that actually
# has it. When the notebook is run top to bottom, `df` is already a Series of
# token lists at this point, and an unconditional drop would raise.
if isinstance(df, pd.DataFrame) and "Description" in df.columns:
    df = df.drop(columns=["Description"])

In [104]:
# Display the current contents of `df`.
df

0       [description, process, the, deletion, of, sing...
1       [description, process, the, deletion, of, mult...
2       [description, process, the, deletion, of, reco...
3       [description, process, the, deletion, of, reco...
4       [description, process, the, deletion, of, reco...
                              ...                        
8163    [description, verify, that, the, article, summ...
8164    [description, verify, that, the, article, summ...
8165    [description, verify, that, the, article, summ...
8166    [description, verify, that, the, article, summ...
8167    [description, verify, that, the, article, summ...
Name: Combined, Length: 8168, dtype: object

In [75]:
# Print the entry with index label 0.
# NOTE(review): the recorded output shows an 'Imp Words' field, which the
# tokenised "Combined" Series displayed above does not have — this cell was
# apparently last run against an earlier version of `df`.
print(df.loc[0])

Imp Words    [Process, deletion, single, record]
Name: 0, dtype: object


Model

In [105]:
# Create the Word2Vec model without a corpus; the vocabulary and training
# data are supplied by the build_vocab() and train() calls that follow.
model = gensim.models.Word2Vec(
    window=10,  # context window size
    min_count=2,  # minimum word frequency for a word to be kept
    workers=4,  # number of worker threads
)

In [106]:
# Build the model's vocabulary from the token lists in `df`;
# progress_per controls how often progress is reported.
model.build_vocab(df, progress_per=1000)

In [107]:
# Train on the same token lists, reusing the corpus size and epoch count
# already stored on the model.
model.train(df, total_examples=model.corpus_count, epochs=model.epochs)

(1083174, 1513205)

In [108]:
# Persist the trained model to disk; the "Load a Model" cell reads this path.
model.save("./w2vec.model")

Testing the Model

In [124]:
# Sanity check: list the words most similar to "how" as (word, score) pairs.
model.wv.most_similar("how")

[('questions', 0.9761815667152405),
 ('learn', 0.9688206315040588),
 ('help', 0.966419517993927),
 ('recipe', 0.962226390838623),
 ('develop', 0.9612542986869812),
 ('healthy', 0.9594994187355042),
 ('assistive', 0.9590986371040344),
 ('nearest', 0.9588932394981384),
 ('guidance', 0.9577454328536987),
 ('abnormal', 0.9568710923194885)]

In [122]:
# Sanity check: similarity score between the vectors of two specific words.
model.wv.similarity(w1="design", w2="page")

0.37742934

Load a Model

In [120]:
# Reload the model saved earlier from the same path.
# NOTE(review): `gensim` is already imported in the first cell, and this is
# the same class used as gensim.models.Word2Vec above; consider moving this
# import to the top import cell.
from gensim.models import Word2Vec
m = Word2Vec.load("./w2vec.model")