In [61]:
"""
Code taken from dmeeting5materials

Read in the entire text of the training data from the CoNLL-U file
"""
# Column indices of a CoNLL-U token line: word form and POS tag.
UDWF = 1
UDPOS = 3

# `data` is a list of sentences; each sentence is a list of (form, pos) tuples.
data = [[]]
# CoNLL-U files are UTF-8, so do not depend on the locale default encoding.
# The `with` block also closes the handle, which the bare open() never did.
with open("./conll_ud_data/en-ud-train.conllu", encoding="utf-8") as conllu_file:
    for line in map(str.strip, conllu_file):
        if line.startswith('#'):
            # Sentence-level comment / metadata line.
            continue
        if line == '':
            # A blank line terminates the current sentence.
            data.append([])
        else:
            # Columns are tab-separated; splitting on arbitrary whitespace
            # would shift the columns if a word form ever contains a space.
            fields = line.split('\t')
            data[-1].append((fields[UDWF], fields[UDPOS]))

# A trailing (or doubled) blank line leaves an empty sentence behind; drop
# those so downstream consumers only ever see real sentences.
data = [sent for sent in data if sent]

# Keep only the word forms for embedding training.
sentences = [[form for form, _pos in sent] for sent in data]

print(sentences[:2])

[['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], ['[', 'This', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']]


In [62]:
"""
Gensim has a wrapper over Word2Vec with lots of nice features.
Let's use that rather than the word2vec library directly.
"""
from gensim.models import Word2Vec

# Train on the tokenised sentences with the hyper-parameters spelled out one
# per line: 100-dimensional vectors, a context window of 5, every word kept
# (min_count=1) and 50 passes over the corpus.
# NOTE(review): gensim >= 4.0 renamed `size` -> `vector_size` and
# `iter` -> `epochs`; confirm the installed version before upgrading.
embedding_model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=1,
    iter=50,
)

In [63]:
# Sanity check: the 20 nearest neighbours of "terror" in the Word2Vec space.
# The result is the cell's last expression, so the notebook renders the list
# of (word, similarity) pairs directly.
embedding_model.wv.most_similar("terror", topn=20)

[('European', 0.7153034210205078),
 ('incitement', 0.6965106725692749),
 ('civil', 0.6940693855285645),
 ('mainly', 0.6924140453338623),
 ('commitments', 0.6844115853309631),
 ('Islands', 0.6803815364837646),
 ('hiding', 0.6751567125320435),
 ('fabrications', 0.6674426198005676),
 ('increasingly', 0.6657902002334595),
 ('Moslem', 0.6650707721710205),
 ('Andaman', 0.6650658845901489),
 ('examples', 0.658385157585144),
 ('Sunni', 0.6569175720214844),
 ('Usenet', 0.6568467617034912),
 ('Arab', 0.6557535529136658),
 ('planners', 0.6538718342781067),
 ('fallout', 0.6537030935287476),
 ('nations', 0.6519802808761597),
 ('appropriations', 0.6510022878646851),
 ('immigrants', 0.6508307456970215)]

In [64]:
import scipy
from scipy import sparse

# Load the PPMI matrix that 'ppmi.py' computed over the CoNLL data with a
# window size of 5 and saved as a SciPy sparse matrix in .npz form.
ppmi_path = "./data/ppmi_mat_window_5.npz"
ppmi_mat = sparse.load_npz(ppmi_path)

# Bare last expression so the notebook shows the matrix summary.
ppmi_mat

<19672x19672 sparse matrix of type '<class 'numpy.float64'>'
	with 250558 stored elements in Compressed Sparse Column format>

In [65]:
from sklearn.decomposition import TruncatedSVD
import pickle

# Load the word <-> index dictionaries created by the ppmi.py script, then
# perform a truncated SVD over the PPMI matrix using sklearn.

# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load pickles that were produced locally by ppmi.py, never untrusted files.
with open('./data/wtoi.pkl', 'rb') as f:
    wtoi = pickle.load(f)

with open('./data/itow.pkl', 'rb') as f:
    itow = pickle.load(f)

# TruncatedSVD uses a randomized solver by default, so without a fixed
# random_state every run yields a different decomposition (and different
# nearest-neighbour lists further down). Pin the seed for reproducibility.
svd = TruncatedSVD(n_components=100, random_state=0)
svd.fit(ppmi_mat)

# 100 dimensions for 19672 words
print(svd.components_.shape)

(100, 19672)


In [66]:
import numpy as np
"""
Let's check out the similarity between vectors
"""

def cosine_sim(X, Y):
    """
    Compute the cosine similarity between 2 dense vectors.

    np.linalg.norm is the L2 norm, so the result is
    dot(X, Y) / (||X|| * ||Y||).

    Returns NaN when either vector has zero length, because the similarity
    is undefined there. The check is explicit so that the 0/0 division no
    longer raises a RuntimeWarning; callers can keep filtering with np.isnan.
    """
    denom = np.linalg.norm(X) * np.linalg.norm(Y)
    if denom == 0:
        return np.nan
    return np.dot(X, Y) / denom

def print_n_sims(vec, n):
    """
    Print the words most similar to the given vector, with their cosine
    similarities, in descending order of similarity.

    Reads the notebook-level `wtoi` and `svd`: word k is represented by
    column wtoi[k] of svd.components_. Words whose similarity is NaN are
    skipped.

    Note that n + 1 lines are printed, not n. When `vec` is itself one of
    the vocabulary vectors it comes back first with similarity 1.0, which
    leaves n other words after it.
    """
    components = svd.components_
    sims = []
    for word, idx in wtoi.items():
        # Score each word exactly once; the similarity used to be computed
        # twice per word (once for the NaN filter, once for the value).
        sim = cosine_sim(vec, components[:, idx])
        if not np.isnan(sim):
            sims.append((word, sim))
    # Highest similarity first; the sort is stable, so ties keep vocab order.
    for word, sim in sorted(sims, key=lambda tup: tup[1], reverse=True)[:n + 1]:
        print(word, sim)

# Repeat the Word2Vec check in the SVD space: look up the column that
# represents "terror" and print its closest words.
terror_idx = wtoi["terror"]
terror = svd.components_[:, terror_idx]
print_n_sims(terror, 20)

  # Remove the CWD from sys.path while we load stuff.


terror 1.0
intelligence 0.430401799546
Shiite 0.398970722661
technical 0.390630520349
Communist 0.364976705702
Shiites 0.360857569762
army 0.359067203746
rich 0.358742361813
traders 0.357990896903
organizations 0.345930989891
officials 0.341767197667
Nations 0.340519432385
Sunni 0.337045979262
men 0.336337437183
23rd 0.336004461456
trained 0.335652620588
Allawi 0.330355228784
Board 0.330268246303
Iraqi 0.327811314349
Republican 0.325112620153
happiness 0.324619057105


In [None]:
"""
Next let's implement a neural language model
"""