In [49]:
# --- Notebook configuration -------------------------------------------------
# Reproducibility: passed to np.random.seed and to the train/test splits.
seed = 42

# Embedding matrix geometry.
embedding_dim = 100   # also selects which GloVe file is read
vocab_size = 12112    # number of rows of the embedding matrix

# Remaining settings are not referenced by the cells visible here.
lower = False
FN = 'vocabulary-embedding'
FN0 = 'train_data'

In [50]:
import _pickle as pickle
# Load four parallel lists from a local pickle: token sequences (X) and their
# labels (Y) for the positive and for the negative polarity set.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
with open('data/polar_data.pkl', 'rb') as contentFile:
    (positive_X, positive_Y, negative_X, negative_Y) = pickle.load(contentFile)

In [51]:
# Sanity check: show the length of each of the four loaded lists.
print(len(positive_X), len(positive_Y), len(negative_X), len(negative_Y))

800 800 800 800


In [52]:
from collections import Counter
from itertools import chain
def get_vocab(lst):
    """Build a frequency-ranked vocabulary from tokenised texts.

    Args:
        lst: iterable of texts, each text being an iterable of hashable tokens.

    Returns:
        A ``(vocab, vocabcount)`` pair: ``vocab`` lists every distinct token,
        most frequent first, and ``vocabcount`` is the ``Counter`` holding the
        number of occurrences of each token.
    """
    vocabcount = Counter(chain.from_iterable(lst))
    vocab = [token for token, _ in vocabcount.most_common()]
    return vocab, vocabcount

In [53]:
# One shared vocabulary over both polarity sets.
vocab, vocabcount = get_vocab(positive_X+negative_X)

In [54]:
# Peek at the 50 most frequent tokens and the number of distinct tokens.
print (vocab[:50])
print ('...',len(vocab))

['.', 'the', ',', 'and', 'to', 'I', 'a', 'was', 'in', 'of', 'The', 'for', 'hotel', 'room', 'at', 'it', 'is', 'we', 'that', 'my', 'were', 'with', 'had', 'not', '!', 'on', 'Chicago', 'this', 'but', 'stay', 'very', 'our', 'have', 'We', "n't", 'you', 'they', 'be', 'there', 'would', 'from', 'as', 'are', 'all', 'me', 'staff', 'great', 'so', 'service', 'did']
... 12110


In [55]:
# Show the most frequent tokens together with their counts.
# `vocab` is already ordered by descending frequency, so slicing it replaces
# the manual counter-and-break loop; 12 is the number of rows that loop printed.
n_preview = 12
for key in vocab[:n_preview]:
    print(key, vocabcount[key])

. 13678
the 12806
, 8715
and 7802
to 6702
I 6357
a 6331
was 5986
in 3784
of 3287
The 3145
for 2786


In [56]:
# Reserved token ids; real words are numbered after them.
empty, eos = 0, 1      # 0: RNN mask of no data, 1: end of sentence
start_idx = eos + 1    # first real word

In [57]:
def get_idx(vocab, vocabcount):
    """Create the token -> index and index -> token lookup tables.

    Words are numbered consecutively, in the order given, starting at the
    module-level ``start_idx``; afterwards '<empty>' and '<eos>' are mapped to
    the module-level ``empty`` and ``eos`` ids.

    Args:
        vocab: sequence of tokens.
        vocabcount: accepted for signature compatibility; not used here.

    Returns:
        ``(word2idx, idx2word)`` where ``idx2word`` is ``word2idx`` with keys
        and values swapped.
    """
    word2idx = {word: idx for idx, word in enumerate(vocab, start_idx)}
    word2idx.update([('<empty>', empty), ('<eos>', eos)])

    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [58]:
# Lookup tables for the shared vocabulary (indices 0 and 1 are reserved).
word2idx, idx2word = get_idx(vocab, vocabcount)

In [59]:
import os
# Current working directory; the GloVe path below is built relative to it.
cwd = os.getcwd()

In [60]:
# GloVe vectors matching `embedding_dim`, expected under <cwd>/data/glove.6b/.
# os.path.join replaces manual string concatenation: the separator is correct
# on every platform and `fname` never holds a value that looks like the
# absolute path '/data/...'.
fname = os.path.join(cwd, 'data', 'glove.6b', 'glove.6B.%dd.txt' % embedding_dim)

In [61]:
# IPython shell capture: run `wc -l` on the GloVe file; the command's output
# lines are stored as a list of strings.
# NOTE(review): requires a `wc` executable, and {fname} is interpolated unquoted.
glove_n_symbols = !wc -l {fname}

In [62]:
# The first whitespace-separated field of the first output line is the line count.
glove_n_symbols = int(glove_n_symbols[0].split()[0])
glove_n_symbols

400000

In [63]:
import numpy as np

# Parse the GloVe text file: every line is a word followed by its vector.
glove_index_dict = {}    # word -> row number in glove_embedding_weights
glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))
globale_scale=.1         # (sic) name kept as is in case other cells refer to it
# Name the encoding explicitly so parsing does not depend on the platform's
# locale default (the GloVe text files are distributed as UTF-8).
with open(fname, 'r', encoding='utf-8') as fp:
    for i, l in enumerate(fp):
        fields = l.strip().split()
        glove_index_dict[fields[0]] = i
        glove_embedding_weights[i,:] = [float(x) for x in fields[1:]]
# Shrink every vector by the same constant factor.
glove_embedding_weights *= globale_scale

In [64]:
# Overall standard deviation of the (scaled) GloVe weights; reused below to
# size the random initialisation.
glove_embedding_weights.std()

0.040815727600190289

In [65]:
# Make lookups case-insensitive: register the lower-cased spelling of every
# GloVe word that does not have an entry of its own yet.
# Iterate over a snapshot -- inserting keys while looping over the live
# .items() view raises "RuntimeError: dictionary changed size during iteration".
for w, i in list(glove_index_dict.items()):
    # setdefault leaves an existing lower-case entry untouched and otherwise
    # points the lower-cased key at this word's row.
    glove_index_dict.setdefault(w.lower(), i)

In [66]:
# Random initialisation on the same scale as the GloVe weights.
# A uniform distribution on [-a, a] has std a/sqrt(3), so a = std*sqrt(12)/2
# yields the same std as GloVe (uniform, not normal).
np.random.seed(seed)
shape = (vocab_size, embedding_dim)
scale = glove_embedding_weights.std()*np.sqrt(12)/2
embedding = np.random.uniform(low=-scale, high=scale, size=shape)
print ('random-embedding/glove scale', scale, 'std', embedding.std())

# Overwrite the row of every vocabulary token that GloVe knows. Spellings are
# tried in order: as is, lower-cased and, for tokens starting with '#', the
# same two without the '#' (GloVe presumably has no hashtags).
c = 0
for i in range(vocab_size):
    w = idx2word[i]
    spellings = [w, w.lower()]
    if w.startswith('#'):
        spellings += [w[1:], w[1:].lower()]
    g = next((glove_index_dict[s] for s in spellings if s in glove_index_dict), None)
    if g is None:
        continue  # unknown to GloVe: keep the random row
    embedding[i,:] = glove_embedding_weights[g,:]
    c += 1
print ('number of tokens, in small vocab, found in glove and copied to embedding', c,c/float(vocab_size))

random-embedding/glove scale 0.0706949139514 std 0.0407991894667
number of tokens, in small vocab, found in glove and copied to embedding 10752 0.8877146631439894


In [67]:
# (rows, columns) of the embedding matrix.
embedding_size = embedding.shape

In [68]:
# Display the shape computed above.
embedding_size

(12112, 100)

In [69]:
# Minimum cosine score a candidate must reach to be accepted as a substitute
# in the GloVe matching cell.
glove_thr = 0.5

In [70]:
# Map every vocabulary token that GloVe knows under some spelling to the GloVe
# key that matched. Spellings are tried in order: as is, lower-cased and, for
# tokens starting with '#', without the '#' and then that form lower-cased.
word2glove = {}
for w in word2idx:
    spellings = [w, w.lower()]
    if w.startswith('#'):
        spellings.extend([w[1:], w[1:].lower()])
    g = next((s for s in spellings if s in glove_index_dict), None)
    if g is not None:
        word2glove[w] = g

In [71]:
# For tokens in the tail of the vocabulary, look for a substitute among the
# remaining tokens whose embedding is closest in cosine similarity.

# Each embedding row divided by its own Euclidean norm.
normed_embedding = embedding/np.array([np.sqrt(np.dot(gweight,gweight)) for gweight in embedding])[:,None]

# The last `nb_unknown_words` indices are the ones looked at; since indices
# follow descending frequency, these are the least frequent tokens.
nb_unknown_words = 100

# Collected (token, substitute row index, cosine score) triples.
glove_match = []
for w,idx in word2idx.items():
    # Only purely alphabetic tail tokens that have a GloVe vector are considered.
    if idx >= vocab_size-nb_unknown_words and w.isalpha() and w in word2glove:
        gidx = glove_index_dict[word2glove[w]]
        gweight = glove_embedding_weights[gidx,:].copy()
        # find row in embedding that has the highest cos score with gweight
        gweight /= np.sqrt(np.dot(gweight,gweight))
        # Scores are computed against the non-tail rows only.
        score = np.dot(normed_embedding[:vocab_size-nb_unknown_words], gweight)
        while True:
            embedding_idx = score.argmax()
            s = score[embedding_idx]
            # Best remaining candidate is below the threshold: no substitute.
            if s < glove_thr:
                break
            # Accept only candidates that themselves have a GloVe vector.
            if idx2word[embedding_idx] in word2glove :
                glove_match.append((w, embedding_idx, s)) 
                break
            # Rule this candidate out and try the next best one.
            score[embedding_idx] = -1
# Best matches first.
glove_match.sort(key = lambda x: -x[2])
print ('# of glove substitutes found', len(glove_match))

# of glove substitutes found 69


In [72]:
# Show ten matches from near the low-score end of the sorted list.
# Note: this loop rebinds the name `score` to a single float.
for orig, sub, score in glove_match[-20:-10]:
    print (score, orig,'=>', idx2word[sub])

0.606717960349 finagling => surliness
0.597920408107 dimming => dim
0.595313958508 odds => difference
0.582304950447 ticky => tacky
0.574744266935 Yup => ow
0.574744266935 yup => ow
0.571094304521 piddle => checkin
0.567003136042 violently => rudely
0.559965874663 kleenex => diapers
0.559415451509 millionaire => banker


In [73]:
# Vocabulary index of each matched tail token -> embedding row of its substitute.
glove_idx2idx = {
    word2idx[tail_word]: substitute_idx
    for tail_word, substitute_idx, _ in glove_match
}

In [74]:
# Replace every token of every positive text by its vocabulary index.
# NOTE(review): this rebinds positive_X in place, so the cell is not
# idempotent -- presumably a second run raises KeyError because the integer
# ids are not keys of word2idx; re-run from the data-loading cell instead.
positive_X = [[word2idx[token] for token in content] for content in positive_X]
len(positive_X)

800

In [75]:
# Replace every token of every negative text by its vocabulary index.
# NOTE(review): this rebinds negative_X in place, so the cell is not
# idempotent -- presumably a second run raises KeyError because the integer
# ids are not keys of word2idx; re-run from the data-loading cell instead.
negative_X = [[word2idx[token] for token in content] for content in negative_X]
len(negative_X)

800

In [76]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 100 positive samples for testing; `seed` makes the split repeatable.
(positive_X_train, positive_X_test,
 positive_Y_train, positive_Y_test) = train_test_split(
    positive_X, positive_Y, test_size=100, random_state=seed)

In [77]:
# Sizes of the positive train/test pieces.
len(positive_X_train), len(positive_X_test), len(positive_Y_train), len(positive_Y_test)

(700, 100, 700, 100)

In [78]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 100 negative samples for testing; `seed` makes the split repeatable.
(negative_X_train, negative_X_test,
 negative_Y_train, negative_Y_test) = train_test_split(
    negative_X, negative_Y, test_size=100, random_state=seed)

In [79]:
# Sizes of the negative train/test pieces.
len(negative_X_train), len(negative_X_test), len(negative_Y_train), len(negative_Y_test)

(700, 100, 700, 100)

In [80]:
# First positive training sample as a list of vocabulary indices.
print(positive_X_train[0])

[76, 200, 5, 7, 57, 16, 3, 4381, 29, 152, 172, 7, 9, 27, 8, 126, 169, 2, 7, 9, 32, 441, 23, 99, 1455, 5, 21, 200, 32, 134, 53, 1052, 176, 134, 52, 270, 3, 2204, 482, 2, 861, 50, 9, 8, 286, 534, 30, 4, 686, 48, 14, 41, 31, 77, 2]


In [81]:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences

# Fixed sequence length of the network input. It is now actually used below
# instead of repeating the literal 100 in every call.
maxlen = 100

# Bring every index sequence to length `maxlen`, filling with 0 (the `empty`
# mask id defined earlier).
positive_X_train = pad_sequences(positive_X_train, maxlen=maxlen, value=0.)
negative_X_train = pad_sequences(negative_X_train, maxlen=maxlen, value=0.)

positive_X_test = pad_sequences(positive_X_test, maxlen=maxlen, value=0.)
negative_X_test = pad_sequences(negative_X_test, maxlen=maxlen, value=0.)

# Convert the labels to two-class categorical form.
# NOTE(review): every name is rebound in place, so this cell is not meant to
# be run twice without re-running the split cells first.
positive_Y_train = to_categorical(positive_Y_train, nb_classes=2)
negative_Y_train = to_categorical(negative_Y_train, nb_classes=2)

positive_Y_test = to_categorical(positive_Y_test, nb_classes=2)
negative_Y_test = to_categorical(negative_Y_test, nb_classes=2)

In [82]:
# Classifier for the positive set: embedding -> LSTM -> 2-way softmax.
# The input width is tied to `maxlen` (the padding length) instead of a second
# hard-coded 100, so the two cannot drift apart.
# NOTE(review): the embedding layer is trained from scratch with 128
# dimensions; the GloVe-initialised `embedding` matrix built earlier is not
# used by this network.
net_positive = tflearn.input_data([None, maxlen])
net_positive = tflearn.embedding(net_positive, input_dim=vocab_size, output_dim=128)
net_positive = tflearn.lstm(net_positive, 128, dropout=0.8)
net_positive = tflearn.fully_connected(net_positive, 2, activation='softmax')
net_positive = tflearn.regression(net_positive, optimizer='adam', learning_rate=0.001,
                                  loss='categorical_crossentropy')

In [83]:
# Wrap the network in a tflearn DNN model; best checkpoints are written with
# the path prefix <cwd>/checkpoint/positive/.
model_positive = tflearn.DNN(net_positive, tensorboard_verbose=0, 
                             best_checkpoint_path = os.getcwd() + '/checkpoint/positive/')

In [36]:
"""net_negative = tflearn.input_data([None, 100])
net_negative = tflearn.embedding(net_negative, input_dim=vocab_size, output_dim=128)
net_negative = tflearn.lstm(net_negative, 128, dropout=0.8)
net_negative = tflearn.fully_connected(net_negative, 2, activation='softmax')
net_negative = tflearn.regression(net_negative, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')"""

In [37]:
"""model_negative = tflearn.DNN(net_negative, tensorboard_verbose=0, 
                             best_checkpoint_path = os.getcwd() + '/checkpoint/negative/')"""

In [38]:
"""model_positive.fit(positive_X_train, positive_Y_train, validation_set=(positive_X_test, positive_Y_test),
                   show_metric=True, n_epoch = 50)"""

'model_positive.fit(positive_X_train, positive_Y_train, validation_set=(positive_X_test, positive_Y_test),\n                   show_metric=True, n_epoch = 50)'

In [39]:
"""model_negative.fit(negative_X_train, negative_Y_train, validation_set=(negative_X_test, negative_Y_test),
                   show_metric=True, n_epoch = 50)"""

Training Step: 549  | total loss: [1m[32m0.01100[0m[0m | time: 1.842s
| Adam | epoch: 050 | loss: 0.01100 - acc: 0.9994 -- iter: 640/700
Training Step: 550  | total loss: [1m[32m0.01052[0m[0m | time: 3.031s
| Adam | epoch: 050 | loss: 0.01052 - acc: 0.9995 | val_loss: 0.99697 - val_acc: 0.8000 -- iter: 700/700
--


In [41]:
"""model_negative.load(os.getcwd() + '/checkpoint/negative/8200')"""

INFO:tensorflow:Restoring parameters from /Users/guanqiaoqian/Documents/machine_learning/fake_news/checkpoint/negative/8200


In [42]:
"""model_negative.evaluate(negative_X_test,negative_Y_test)"""

[0.81999999284744263]

In [84]:
# Restore the saved weights of the positive model.
# NOTE(review): the recorded output shows this call failing with
# "NotFoundError: Key Accuracy/Mean/moving_avg_1 not found in checkpoint".
# The `_1` suffix suggests the graph was built more than once in this kernel,
# so variable names no longer match the checkpoint -- TODO confirm. Rebuilding
# the network in a fresh kernel (or after resetting the default TensorFlow
# graph) is the usual remedy.
model_positive.load(os.getcwd() + '/checkpoint/positive/8900')

INFO:tensorflow:Restoring parameters from /Users/guanqiaoqian/Documents/machine_learning/fake_news/checkpoint/positive/8900


NotFoundError: Key Accuracy/Mean/moving_avg_1 not found in checkpoint
	 [[Node: save_11/RestoreV2_1 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save_11/Const_0, save_11/RestoreV2_1/tensor_names, save_11/RestoreV2_1/shape_and_slices)]]

Caused by op 'save_11/RestoreV2_1', defined at:
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/guanqiaoqian/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-83-e2e6a27aa426>", line 2, in <module>
    best_checkpoint_path = os.getcwd() + '/checkpoint/positive/')
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tflearn/models/dnn.py", line 65, in __init__
    best_val_accuracy=best_val_accuracy)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tflearn/helpers/trainer.py", line 147, in __init__
    allow_empty=True)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1056, in __init__
    self.build()
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1086, in build
    restore_sequentially=self._restore_sequentially)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 691, in build
    restore_sequentially, reshape)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 407, in _AddRestoreOps
    tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 247, in restore_op
    [spec.tensor.dtype])[0])
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/ops/gen_io_ops.py", line 669, in restore_v2
    dtypes=dtypes, name=name)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/guanqiaoqian/miniconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

NotFoundError (see above for traceback): Key Accuracy/Mean/moving_avg_1 not found in checkpoint
	 [[Node: save_11/RestoreV2_1 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save_11/Const_0, save_11/RestoreV2_1/tensor_names, save_11/RestoreV2_1/shape_and_slices)]]


In [None]:
# Score the restored model on the held-out positive samples.
model_positive.evaluate(positive_X_test,positive_Y_test)