In [7]:
from gensim.models import KeyedVectors
import numpy as np
import os

In [2]:
# Load the previously saved word vectors from the local "w2v" directory.
model_path = os.path.join("w2v", "model.model")
model = KeyedVectors.load(model_path)

In [20]:
def _load_seed_vectors(path, vectors):
    """Read a seed-word file and return ``(positive, negative)`` vector lists.

    Every non-blank line is split on whitespace. The second field is looked
    up in ``vectors``; the last field selects the polarity: ``"1"`` means
    positive, any other value means negative.

    Args:
        path: Path of the seed file to read.
        vectors: Mapping-like object indexed by word (``vectors[word]``).

    Returns:
        A ``(positive, negative)`` tuple of lists of looked-up vectors.

    Raises:
        ValueError: If a non-blank line has fewer than two fields, or if one
            of the two polarity groups ends up empty (its centroid would be
            NaN and silently corrupt the semantic axis).
    """
    positive, negative = [], []
    # NOTE(review): assumes the seed files are UTF-8 encoded - confirm.
    with open(path, "r", encoding="utf-8") as fd:
        for line_no, raw_line in enumerate(fd, start=1):
            fields = raw_line.split()
            if not fields:
                # Blank line (typically the trailing newline at end of file).
                continue
            if len(fields) < 2:
                raise ValueError(
                    "{}:{}: expected at least 2 fields, got {!r}".format(
                        path, line_no, raw_line
                    )
                )
            bucket = positive if fields[-1] == "1" else negative
            bucket.append(vectors[fields[1]])
    if not positive or not negative:
        raise ValueError(
            "{}: need at least one positive and one negative seed".format(path)
        )
    return positive, negative


positive_food, negative_food = _load_seed_vectors(
    os.path.join("seeds", "Food_words.txt"), model
)
positive_service, negative_service = _load_seed_vectors(
    os.path.join("seeds", "Service_words.txt"), model
)

# Centroid (element-wise mean) of each seed group.
center_pos_food = np.mean(positive_food, axis=0)
center_neg_food = np.mean(negative_food, axis=0)
center_pos_service = np.mean(positive_service, axis=0)
center_neg_service = np.mean(negative_service, axis=0)

In [22]:
# Sanity check: print the shape of each of the four centroids, one per line.
for centroid in (
    center_pos_food,
    center_neg_food,
    center_pos_service,
    center_neg_service,
):
    print(centroid.shape)

(300,)
(300,)
(300,)
(300,)


In [24]:
# Semantic axis per aspect: positive centroid minus negative centroid.
sem_axis_food = np.subtract(center_pos_food, center_neg_food)
sem_axis_service = np.subtract(center_pos_service, center_neg_service)

In [32]:
from scipy.spatial.distance import cosine

# scipy's cosine() is a *distance* (1 - cosine similarity), so smaller values
# mean the word points more along the axis.
probe_distance = cosine(model["вкусно"], sem_axis_food)
print(probe_distance)

1.210892215371132
1.2458789944648743


In [36]:
class WeightedWord:
    """A word together with a numeric score and its part-of-speech tag.

    Instances are ordered and compared by ``cosine`` only, so ``sorted()`` on
    a list of them sorts ascending by that score; ``word`` and ``pos`` do not
    take part in comparisons.

    Attributes:
        word: The word (stored as given).
        cosine: The numeric score used for ordering and equality.
        pos: The part-of-speech tag (stored as given).
    """

    def __init__(self, word, cosine, pos):
        self.word = word
        self.cosine = cosine
        self.pos = pos

    def __repr__(self):
        return "WeightedWord(word={!r}, cosine={!r}, pos={!r})".format(
            self.word, self.cosine, self.pos
        )

    def __lt__(self, other):
        # Let Python fall back / raise TypeError for objects without a score
        # instead of leaking an AttributeError.
        if not hasattr(other, "cosine"):
            return NotImplemented
        return self.cosine < other.cosine

    def __eq__(self, other):
        if not hasattr(other, "cosine"):
            return NotImplemented
        return self.cosine == other.cosine

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; restore hashability in
        # a way that is consistent with equality (equal scores, equal hashes).
        return hash(self.cosine)

In [52]:
from conllu import parse
from tqdm import tqdm

# One WeightedWord per distinct lemma, scored against each semantic axis.
weighted_food = []
weighted_service = []
# A set, not a list: membership is tested once per token, and a list makes
# the whole scan quadratic in the number of distinct lemmas.
processed_tokens = set()

allowed_tags = ["VERB", "NOUN", "ADJ"]

# os.listdir() returns entries in arbitrary order; sorting makes it
# reproducible which surface form ends up representing each lemma.
for file in tqdm(sorted(os.listdir("parsed_train"))):
    # `with` closes each file promptly instead of leaking one handle per file.
    # CoNLL-U files are UTF-8 by specification.
    with open(os.path.join("parsed_train", file), "r", encoding="utf-8") as fd:
        conllu_text = parse(fd.read())

    for sentence in conllu_text:
        for word in sentence:
            lemma = word["lemma"]
            if lemma in processed_tokens:
                continue
            if word["upostag"] not in allowed_tags:
                continue

            # The vector is looked up by surface form while de-duplication is
            # by lemma, so only the first form seen of a lemma is scored.
            # NOTE(review): confirm that form (not lemma) is the intended key.
            vector = model[word["form"]]
            weighted_food.append(
                WeightedWord(lemma, cosine(vector, sem_axis_food), word["upostag"])
            )
            weighted_service.append(
                WeightedWord(lemma, cosine(vector, sem_axis_service), word["upostag"])
            )
            processed_tokens.add(lemma)

100%|██████████| 19034/19034 [18:30<00:00, 17.14it/s]


In [53]:
# Ascending by WeightedWord ordering; copy first so the unsorted lists are
# left untouched (list.sort is stable, like sorted()).
sorted_food = list(weighted_food)
sorted_food.sort()

sorted_service = list(weighted_service)
sorted_service.sort()

In [54]:
def _write_word_list(path, words):
    """Write one ``word<TAB>cosine`` line per item of ``words`` to ``path``.

    Args:
        path: Destination file; overwritten if it exists.
        words: Iterable of objects exposing ``.word`` and ``.cosine``.
    """
    # Explicit UTF-8 so non-ASCII words are written the same way regardless
    # of the platform's default locale encoding.
    with open(path, "w", encoding="utf-8") as final_fd:
        for word in words:
            final_fd.write(word.word + "\t" + str(word.cosine) + "\n")


output_dir = os.path.join("word_lists", "semantic_axis_method")
# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs(output_dir, exist_ok=True)

# Files keep the .csv name but are tab-separated.
_write_word_list(os.path.join(output_dir, "food.csv"), sorted_food)
_write_word_list(os.path.join(output_dir, "service.csv"), sorted_service)