In [1]:
import numpy as np
import pickle
from gensim.models import KeyedVectors

In [2]:
wv = KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin',
                       binary=True,
                       limit=500000)

In [3]:
with open('../Data/count_dicts_nd.pkl', 'rb') as wc_file:
    unpickler = pickle.Unpickler(wc_file)
    count_dicts = unpickler.load()

In [4]:
len(count_dicts)

13860

In [5]:
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse = True)
X = v.fit_transform(count_dicts)

In [6]:
X

<13860x187655 sparse matrix of type '<type 'numpy.float64'>'
	with 9626548 stored elements in Compressed Sparse Row format>

In [7]:
import util


In [8]:
def get_embed(word):
    if word in wv:
        return wv[word]
    else:
        return np.zeros((300))
    
def represented_word(word):
    return 1 if word in wv else 0

In [11]:
with open('../Data/feature_names.pkl', 'rb') as fn_file:
    unpickler = pickle.Unpickler(fn_file)
    feature_names = unpickler.load()

In [12]:
embeddings = [get_embed(word) for word in feature_names]
represented = [represented_word(word) for word in feature_names]

In [13]:
def is_english_word(word):
    return word_list.check(word)

In [14]:
nameset = set(feature_names)

name_filter = [name in nameset for name in v.get_feature_names()]

In [15]:
indices = np.array(name_filter).nonzero()[0]
X_words = X[:, indices]
# X[:, np.nonzero(name_filter)]

In [16]:
X_words

<13860x36124 sparse matrix of type '<type 'numpy.float64'>'
	with 7561941 stored elements in Compressed Sparse Row format>

In [17]:
X_mentions = X_words != 0
df = np.sum(X_mentions, axis=0)
N = X_mentions.shape[1]

idf = np.log(1 + N / df).T
idf = np.array(idf)

In [94]:
sum_embed = X_words.dot(np.array(embeddings) * idf)
num_words = X_words.dot(np.array(represented).reshape((-1, 1)) * idf)
num_words = np.reshape(num_words, (-1, 1))
case_embeddings = sum_embed / num_words

In [93]:
'''
# No idf

sum_embed = X_words.dot(np.array(embeddings))
num_words = X_words.dot(np.array(represented).reshape((-1, 1)))
num_words = np.reshape(num_words, (-1, 1))
case_embeddings = sum_embed / num_words
'''

'\n# No idf\n\nsum_embed = X_words.dot(np.array(embeddings))\nnum_words = X_words.dot(np.array(represented).reshape((-1, 1)))\nnum_words = np.reshape(num_words, (-1, 1))\ncase_embeddings = sum_embed / num_words\n'

In [95]:
case_embeddings_norm = np.sqrt(np.sum(np.square(case_embeddings), axis=1))
len(case_embeddings_norm)
target_embedding_norm = np.linalg.norm(case_embeddings[0])

In [96]:
case_embeddings_norm * target_embedding_norm

array([0.67371021, 0.67470346, 0.67293766, ..., 0.63477583, 0.66408001,
       0.66534062])

In [97]:
case_embeddings_norm

array([0.82079852, 0.82200862, 0.8198573 , ..., 0.77336376, 0.8090658 ,
       0.81060163])

In [98]:
target_embedding_norm

0.8207985190129089

In [99]:
case_embeddings.dot(case_embeddings[0].T)[0:20]

array([0.67371021, 0.55570604, 0.63669457, 0.57700418, 0.57444322,
       0.6149778 , 0.57077228, 0.57814095, 0.62303341, 0.59810046,
       0.55048229, 0.62227645, 0.62557542, 0.63834776, 0.64232047,
       0.61896799, 0.60193479, 0.6083796 , 0.64691983, 0.59137083])

In [100]:
cosine_sim = case_embeddings.dot(case_embeddings[0].T) / (case_embeddings_norm * target_embedding_norm)

np.argsort(cosine_sim)[-50:]


array([ 1303,  7426, 13207,  1757, 12931,  3088,  7667,  8668, 13368,
        1287,  2040, 10371,  3546,  9752, 10119, 13570,  9117,  1830,
        5492,  4731,   890, 12297,  9855,  4125,  8161,  3920,  7521,
        7301,  5843,  3999,  1510,  4641,  7660,  7706,  4890, 11349,
        2096, 11858,   959,  4340,  2488,  8541,  9839,  5424,  5274,
        3104, 12247,  9675,  6374,     0])

In [32]:
import os 
data_dir = '../Data/nd'
filenames = os.listdir(data_dir)
print(filenames[0], filenames[6374])

('1647880.pkl', '2747446.pkl')


In [33]:
idf

array([[ 1.09861229],
       [ 6.03068526],
       [10.49474043],
       ...,
       [10.49474043],
       [ 9.10852911],
       [ 9.80162093]])

In [135]:
feature_names

[u'a',
 u'ab',
 u'abaft',
 u'abandon',
 u'abandoned',
 u'abandoning',
 u'abandonment',
 u'abandons',
 u'abashed',
 u'abate',
 u'abated',
 u'abatement',
 u'abatements',
 u'abates',
 u'abating',
 u'abattoir',
 u'abbey',
 u'abbeys',
 u'abbot',
 u'abbreviate',
 u'abbreviated',
 u'abbreviates',
 u'abbreviating',
 u'abbreviation',
 u'abbreviations',
 u'abdicate',
 u'abdicated',
 u'abdicates',
 u'abdicating',
 u'abdication',
 u'abdomen',
 u'abdominal',
 u'abduct',
 u'abducted',
 u'abducting',
 u'abduction',
 u'abductions',
 u'abductor',
 u'abducts',
 u'abed',
 u'aberrant',
 u'aberration',
 u'aberrational',
 u'aberrations',
 u'abet',
 u'abets',
 u'abetted',
 u'abetting',
 u'abettor',
 u'abettors',
 u'abeyance',
 u'abhor',
 u'abhorrent',
 u'abhors',
 u'abide',
 u'abides',
 u'abiding',
 u'abilities',
 u'ability',
 u'abject',
 u'abjures',
 u'ablaze',
 u'able',
 u'abler',
 u'ables',
 u'ablest',
 u'ably',
 u'abnormal',
 u'abnormalities',
 u'abnormality',
 u'abnormally',
 u'aboard',
 u'abode',
 u'ab

In [123]:
for pkl in ['1647880.pkl', '2747446.pkl']:
    pklf = os.path.join(data_dir, pkl)
    with open(pklf, "rb") as pickle_file:
        unpickler = pickle.Unpickler(pickle_file)
        print(unpickler.load().html)
    for i in range(20):
        print 'DIVIDER!'


<html><head></head><body><div>
<center><b><span class="citation no-link"><span class="volume">477</span> <span class="reporter">N.W.2d</span> <span class="page">230</span></span> (1991)</b></center>
<center><h1>Tom PETERSON, Plaintiff, Appellant and Cross-Appellee,<br/>
v.<br/>
Kurt ZERR, Defendant and Appellee,<br/>
Interstate Investments, a North Dakota Partnership, Defendant, Appellee and Cross-Appellant.</h1></center>
<center>Civ. No. 910012.</center>
<center><p><b>Supreme Court of North Dakota.</b></p></center>
<center>November 12, 1991.</center>
<p><span class="star-pagination">*232</span> Alan C. Erickson (argued), Bismarck, for plaintiff, appellant and cross-appellee.</p>
<p>Dale W. Moench (argued) of Moench Law Firm, Bismarck, for defendant and appellee Kurt Zerr.</p>
<p>Dean F. Bard (argued) of Bard &amp; Bard, Bismarck, for defendant, appellee, and cross-appellant Interstate Investments. Appearance by John W. Kavaney, partner in Interstate Investments.</p>
<p>GIERKE, Justice

In [67]:
np.array(embeddings).shape
np.array(overused).shape

(36124,)

In [26]:
util.vectorize_count_dict(count_dicts[0]).toarray()

array([[57.,  0.,  0., ...,  0.,  0.,  0.]])

In [18]:
fn = v.get_feature_names()



[u'0',
 u'00',
 u'000',
 u'0000',
 u'00000',
 u'0001',
 u'0001666',
 u'0002',
 u'0003333',
 u'0005',
 u'0006666',
 u'0008333',
 u'0009',
 u'00094',
 u'001',
 u'00100',
 u'001100',
 u'0012',
 u'0013125',
 u'001403',
 u'001404',
 u'0015000',
 u'00163',
 u'002',
 u'00225',
 u'0022969',
 u'0024',
 u'00241',
 u'002436',
 u'00246',
 u'002505015',
 u'003',
 u'003335841',
 u'00351563',
 u'003768',
 u'004',
 u'0040',
 u'0041',
 u'0045900475',
 u'005',
 u'0052',
 u'0055',
 u'006',
 u'00625',
 u'0066',
 u'0067',
 u'007',
 u'0070',
 u'0074',
 u'0075',
 u'007724',
 u'008',
 u'0082',
 u'0085',
 u'0089',
 u'009',
 u'0094',
 u'0098',
 u'00ca2739',
 u'00r0171',
 u'01',
 u'010',
 u'0100',
 u'01000',
 u'0101',
 u'010109',
 u'01011',
 u'01013',
 u'010202',
 u'010210b7',
 u'010226',
 u'0102491',
 u'010273',
 u'010286',
 u'0103',
 u'0104',
 u'0104062',
 u'010415',
 u'010429',
 u'010618',
 u'0107',
 u'0108',
 u'0109',
 u'01092013',
 u'011',
 u'0110',
 u'0110010',
 u'0111',
 u'0111c',
 u'011384',
 u'0114',
 u

In [16]:
feature_names

[u'a',
 u'ab',
 u'abaft',
 u'abandon',
 u'abandoned',
 u'abandoning',
 u'abandonment',
 u'abandons',
 u'abashed',
 u'abate',
 u'abated',
 u'abatement',
 u'abatements',
 u'abates',
 u'abating',
 u'abattoir',
 u'abbey',
 u'abbeys',
 u'abbot',
 u'abbreviate',
 u'abbreviated',
 u'abbreviates',
 u'abbreviating',
 u'abbreviation',
 u'abbreviations',
 u'abdicate',
 u'abdicated',
 u'abdicates',
 u'abdicating',
 u'abdication',
 u'abdomen',
 u'abdominal',
 u'abduct',
 u'abducted',
 u'abducting',
 u'abduction',
 u'abductions',
 u'abductor',
 u'abducts',
 u'abed',
 u'aberrant',
 u'aberration',
 u'aberrational',
 u'aberrations',
 u'abet',
 u'abets',
 u'abetted',
 u'abetting',
 u'abettor',
 u'abettors',
 u'abeyance',
 u'abhor',
 u'abhorrent',
 u'abhors',
 u'abide',
 u'abides',
 u'abiding',
 u'abilities',
 u'ability',
 u'abject',
 u'abjures',
 u'ablaze',
 u'able',
 u'abler',
 u'ables',
 u'ablest',
 u'ably',
 u'abnormal',
 u'abnormalities',
 u'abnormality',
 u'abnormally',
 u'aboard',
 u'abode',
 u'ab