In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [6]:
# Hyper-parameters of the training run whose checkpoint is analysed below.
VOCAB_SIZE = 3000
EMBEDDING_SIZE = 10
BATCH_SIZE = 8192
EPOCH = 50
WINDOW_SIZE = 1
# Run identifier built from the hyper-parameters; later cells use it as the
# file stem for the '.pth' checkpoint and the '.xlsx' report.
name = f'fnn-epo{EPOCH}ebd{EMBEDDING_SIZE}vcb{VOCAB_SIZE}win{WINDOW_SIZE}'

In [7]:
# define model
class FNN(nn.Module):
    """Feed-forward network: embedding -> linear + ReLU -> linear -> log-softmax.

    Args:
        vocab_size: number of rows in the embedding table and width of the output layer.
        embedding_size: dimensionality of each embedding vector.
        hidden_size: width of the hidden linear layer.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        # Keep the constructor arguments around for inspection.
        self.vocab_size, self.embedding_size, self.hidden_size = (
            vocab_size,
            embedding_size,
            hidden_size,
        )
        # Layers are created in this order (embedding, fc1, fc2) so that seeded
        # initialisation and state_dict key order stay stable.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary along the last dimension.

        `inputs` holds indices into the embedding table; the output has the
        shape of `inputs` with a trailing dimension of size `vocab_size`.
        """
        embedded = self.embedding(inputs)
        hidden = F.relu(self.fc1(embedded))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=-1)

In [8]:
# read model
# Rebuild the architecture with the same sizes as the checkpoint. The table has
# VOCAB_SIZE + 1 rows because index VOCAB_SIZE is the shared slot that the
# vocabulary cell assigns to every out-of-vocabulary word and to '<UNK>'.
model = FNN(VOCAB_SIZE+1, EMBEDDING_SIZE, 128)
# map_location='cpu': the weights are only used on the CPU below, and without
# it a checkpoint saved from a CUDA device cannot be restored on a machine
# that has no GPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(name + '.pth', map_location='cpu'))
# detach() is the supported way to read parameter values without autograd
# tracking (replaces the legacy `.data` access); result is a (rows, dim) array.
word_vectors = model.embedding.weight.detach().cpu().numpy()

In [11]:
# Load the cleaned corpus: one sentence per line, tokens separated by a single space.
# NOTE(review): split(' ') keeps empty-string tokens (blank lines, doubled spaces);
# left unchanged because the indices presumably have to match those used in training.
with open('norvel-cleaned.txt', 'r', encoding='utf-8') as f:
    corpus = [line.strip().split(' ') for line in f]

# Assign each distinct token an index in order of first appearance.
word_to_ix = {}
for sentence in corpus:
    for word in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))

# Tokens beyond the first VOCAB_SIZE all collapse onto the shared index VOCAB_SIZE,
# which is also the index of the explicit '<UNK>' entry.
word_to_ix = {token: min(ix, VOCAB_SIZE) for token, ix in word_to_ix.items()}
word_to_ix['<UNK>'] = VOCAB_SIZE

In [13]:
# Probe words: every 10th vocabulary entry, first 20 of those.
word_list = list(word_to_ix.keys())[::10][:20]

# Index -> word lookup, built once instead of re-scanning the dict for every
# neighbour. Several words can share one index (all out-of-vocabulary words
# map to the '<UNK>' index), so keep the first word seen per index and then
# label the shared slot explicitly as '<UNK>' rather than an arbitrary word.
ix_to_word = {}
for known_word, known_ix in word_to_ix.items():
    ix_to_word.setdefault(known_ix, known_word)
ix_to_word[word_to_ix['<UNK>']] = '<UNK>'

# Row norms do not depend on the query word, so compute them once.
vector_norms = np.linalg.norm(word_vectors, axis=1)

# get similar words using cosine similarity
report_columns = {}
for word in word_list:
    query_ix = word_to_ix[word]
    word_vector = word_vectors[query_ix]
    similarities = np.dot(word_vectors, word_vector) / (vector_norms * np.linalg.norm(word_vector))
    # Rank from most to least similar, then drop the query itself by index:
    # relying on it being at rank 0 breaks when another row ties with it.
    ranked = np.argsort(similarities)[::-1]
    sorted_indices = ranked[ranked != query_ix][:10]
    similar_words = []
    similarity_scores = []
    for i in sorted_indices:
        # An embedding row with no corpus word (vocabulary smaller than the
        # table) gets a placeholder label instead of raising.
        similar_words.append(ix_to_word.get(int(i), f'<ix {int(i)}>'))
        similarity_scores.append(round(similarities[i], 4))
    report_columns[f'{word}'] = similar_words
    report_columns[f'sim2{word}'] = similarity_scores

# Build the frame in one go (same column order as insertion) and export it.
df = pd.DataFrame(report_columns)
df.to_excel(f'{name}.xlsx', index=False)