In [1]:
import nltk
import pandas as pd

This notebook contains the code that I used to produce the data files that serve as input to the scripts which answer the assignment questions.

### Part 1: Basic operations with word embeddings

Content words from the first sentence of the abstract of the Chen and Manning paper *'A Fast and Accurate Dependency Parser using Neural Networks'*. The words are lemmatized, tagged with universal POS tags and saved to the *words_auto.txt* file, one word per line.

In [66]:
sentence = """Almost all current dependency parsers
classify based on millions of sparse indicator
features"""

# Universal POS tag -> WordNet POS code accepted by WordNetLemmatizer.lemmatize.
# Tags that are not listed fall back to 'n', the lemmatizer's own default.
UNIVERSAL_TO_WORDNET = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

# Tag the complete sentence in its original form *before* any filtering:
# the tagger uses the neighbouring tokens and the casing, both of which are
# gone once the text is lowercased, stripped of stop words and lemmatized.
tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')

words = []
for token, tag in tagged_tokens:
    token = token.lower()
    if token in stop_words:
        continue
    # Pass the POS to the lemmatizer; without it every token is treated as a
    # noun and verb forms such as "based" are left unchanged.
    lemma = wordnet_lemmatizer.lemmatize(token, pos=UNIVERSAL_TO_WORDNET.get(tag, 'n'))
    words.append('_'.join((lemma, tag)))

# One "<lemma>_<TAG>" entry per line.
with open('../data/words_auto.txt', "w") as f:
    for word in words:
        print(word, file=f)

### Part 2/3: Intrinsic evaluation of pre-trained word embeddings

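Add POS tags to the semantic part of the *Google Analogies* dataset: every word gets the noun tag `NN` appended, while lines starting with `:` are copied unchanged.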

In [7]:
# Copy the semantic analogy file, appending "_NN" to every word.
# Lines that start with ':' are written through unchanged (after stripping
# surrounding whitespace); on every other line each whitespace-separated
# word becomes "<word>_NN" and the words are re-joined with single spaces.
# NOTE(review): 'NN' is a Penn-Treebank-style tag, while the Part 1 cell emits
# universal tags via tagset='universal' - confirm which form the downstream
# scripts expect.
with open('../data/analogies/analogies_semantic.txt', "r") as source,\
     open('../data/analogies/analogies_semantic_POS.txt', "w") as target:

    for raw_line in source:
        stripped = raw_line.strip()
        if stripped.startswith(':'):
            tagged = stripped
        else:
            tagged = ' '.join(token + '_NN' for token in stripped.split())
        print(tagged, file=target)

Add POS tags (`JJ`, `NN`, `VB`) to *SimLex999*. According to the paper *SimLex-999: Evaluating Semantic Models With (Genuine) Similarity Estimation*, the dataset contains three POS categories (adjectives, nouns and verbs) with 111, 666 and 222 pairs respectively. The cell below relies on the rows of the file being stored in exactly that order.

In [3]:
# Number of SimLex-999 pairs per part of speech. The positional slicing below
# relies on the file listing adjectives first, then nouns, then verbs.
N_ADJ, N_NOUN, N_VERB = 111, 666, 222
WORD_COLS = ['#word1', 'word2']
SCORE_COL = 'SimLex999'


def add_pos_suffix(pairs, tag):
    """Return a frame with '_<tag>' appended to both word columns.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Rows containing the '#word1', 'word2' and 'SimLex999' columns.
    tag : str
        POS tag to append to every word, e.g. 'NN'.

    Returns
    -------
    pandas.DataFrame
        The two suffixed word columns followed by the 'SimLex999' column,
        with the index of `pairs` preserved.
    """
    tagged_words = pairs.loc[:, WORD_COLS].apply(lambda col: col + '_' + tag)
    return tagged_words.join(pairs.loc[:, SCORE_COL])


df = pd.read_csv('../data/simlex/simlex.tsv', sep='\t')

# Guard the hard-coded split: with any other row count the positional slices
# would silently assign pairs to the wrong part of speech.
assert len(df) == N_ADJ + N_NOUN + N_VERB, (
    'expected %d SimLex rows, found %d' % (N_ADJ + N_NOUN + N_VERB, len(df))
)

df_adj = df.iloc[:N_ADJ]
df_noun = df.iloc[N_ADJ:N_ADJ + N_NOUN]
df_verb = df.iloc[N_ADJ + N_NOUN:]

df_adj_POS = add_pos_suffix(df_adj, 'JJ')
df_noun_POS = add_pos_suffix(df_noun, 'NN')
df_verb_POS = add_pos_suffix(df_verb, 'VB')
df_POS = pd.concat([df_adj_POS, df_noun_POS, df_verb_POS])

# Per-POS subsets with all original columns.
df_adj.to_csv('../data/simlex/simlex_adj.tsv', sep='\t', index=False)
df_noun.to_csv('../data/simlex/simlex_noun.tsv', sep='\t', index=False)
df_verb.to_csv('../data/simlex/simlex_verb.tsv', sep='\t', index=False)

# Per-POS subsets with tagged words and the similarity score only.
df_adj_POS.to_csv('../data/simlex/simlex_adj_POS.tsv', sep='\t', index=False)
df_noun_POS.to_csv('../data/simlex/simlex_noun_POS.tsv', sep='\t', index=False)
df_verb_POS.to_csv('../data/simlex/simlex_verb_POS.tsv', sep='\t', index=False)

# Full tagged dataset, in the original row order.
df_POS.to_csv('../data/simlex/simlex_POS.tsv', sep='\t', index=False)