In [76]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from collections import defaultdict, UserDict
import mygrad as mg
from pathlib import Path
import pandas as pd
import re, string

from noggin import create_plot
%matplotlib notebook

In [8]:
unzipped_folder = "glove.twitter.27B/" # ENTER THE PATH TO THE UNZIPPED `glove.twitter.27B` HERE

# word2vec-format copy of the 200-d GloVe vectors, written to the working directory.
# Named once so the existence check, the conversion and the load cannot drift apart.
word2vec_file = "gensim_glove_vectors_200.txt"

# use glove2word2vec to convert GloVe vectors in text format into the word2vec text format:
if not Path(word2vec_file).exists():

    # assumes you've downloaded and extracted the glove stuff
    # Join with Path rather than `+` so `unzipped_folder` works with or without a
    # trailing path separator.
    glove2word2vec(glove_input_file=str(Path(unzipped_folder) / "glove.twitter.27B.200d.txt"),
                   word2vec_output_file=word2vec_file)

# read the word2vec txt to a gensim model using KeyedVectors
glove = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

In [9]:
from mynn.layers.conv import conv
from mynn.layers.dense import dense
from mynn.activations.relu import relu
from mygrad.nnet.layers import max_pool
from mynn.activations.sigmoid import sigmoid
from mynn.initializers.glorot_normal import glorot_normal
from mynn.optimizers.adam import Adam

class Model:
    """ Sentence classifier: conv -> relu -> global max-pool -> dense -> relu -> dense -> sigmoid.

    Produces one sigmoid output per input sample.
    """

    def __init__(self):
        """ Initializes model layers and weights. """
        # <COGINST>
        # every layer uses the Glorot-normal initializer with the same gain
        init_kwargs = {'gain': np.sqrt(2)}
        # 200 input channels (embedding size) -> 250 filters, filter width 2, stride 1
        self.conv1 = conv(200, 250, 2, stride = 1, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense1 = dense(250, 250, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense2 = dense(250,1, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        # </COGINST>


    def __call__(self, x):
        """ Forward data through the network.

        This allows us to conveniently initialize a model `m` and then send data through it
        to be classified by calling `m(x)`.

        Parameters
        ----------
        x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, S)
            The data to forward through the network.

        Returns
        -------
        mygrad.Tensor, shape=(N, 1)
            The model outputs.

        Notes
        -----
        N = batch size
        D = embedding size
        S = sentence length
        """
        # <COGINST>
        # input shape (N, D, S) with D = 200
        # conv output shape (N, F, S') with F = 250; a width-2, stride-1 filter gives
        # S' = S - 1 (assuming the layer applies no padding)
        x = self.conv1(x)
        x = relu(x)
        # pooling window spans the whole remaining length axis -> global pool, (N, F, 1)
        x = max_pool(x, (x.shape[-1],), 1)
        x = x.reshape(x.shape[0], -1)  # (N, F, 1) -> (N, F)
        x = self.dense1(x) # (N, F) @ (F, D1) = (N, D1)
        x = relu(x)
        x = self.dense2(x) # (N, D1) @ (D1, 1) = (N, 1)
        x = sigmoid(x)
        return x # output shape (N, 1)
        # </COGINST>


    @property
    def parameters(self):
        """ A convenience function for getting all the parameters of our model.

        Returns the parameters of `conv1`, `dense1` and `dense2`, concatenated in that order.
        This is a property, so it is read as `model.parameters` and takes no arguments.
        """
        return self.conv1.parameters + self.dense1.parameters + self.dense2.parameters # <COGLINE>

In [124]:
def get_embedding(text, max_len=80, dim=200, embeddings=None):
    """ Builds a fixed-size embedding matrix for a tokenized sentence.

    Tokens missing from the embedding vocabulary are skipped. The remaining vectors are
    truncated to `max_len` tokens, or zero-padded at the end up to `max_len`, so that
    every sentence yields an array of the same shape and dtype and can be batched.

    Parameters
    ----------
    text : Iterable[str]
        The tokens of one sentence, in order.
    max_len : int, optional (default=80)
        Number of token slots in the output; longer sentences are cut off.
    dim : int, optional (default=200)
        Length of each embedding vector.
    embeddings : optional
        Any object supporting `word in embeddings` and `embeddings[word]`.
        Defaults to the notebook-level `glove` vectors.

    Returns
    -------
    numpy.ndarray, shape=(dim, max_len), dtype=float32
        One column per token slot; unused slots are all zeros.
    """
    if embeddings is None:
        embeddings = glove

    # allocate the padded result up front: a fixed dtype even when nothing is found
    out = np.zeros((max_len, dim), dtype=np.float32)
    filled = 0
    for word in text:
        if filled == max_len:
            break  # truncate: sentence has more known tokens than slots
        if word in embeddings:
            out[filled] = embeddings[word]
            filled += 1
    return out.T



In [117]:
# Character class matching any single character from `string.punctuation`.
punc_regex = re.compile("[" + re.escape(string.punctuation) + "]")


def strip_punc(text):
    """ Returns `text` lower-cased, with every `string.punctuation` character removed. """
    without_punctuation = punc_regex.sub('', text)
    return without_punctuation.lower()

In [19]:
# The CSV has no header row, so columns get integer labels (0, 1, ...).
# 'ANSI' is only registered as a codec alias (for `mbcs`) on Windows and raises
# LookupError elsewhere; 'latin-1' is available on every platform and can decode any byte.
# NOTE(review): on a Windows machine whose ANSI code page is not Latin-1, bytes >= 0x80
# may now decode to different characters than before - confirm this is acceptable.
test = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

In [60]:
# Count the rows whose column 0 equals 2, using a vectorized sum instead of a
# Python-level loop over every row. `int(...)` makes the displayed value a plain int.
# Column 0 is presumably the sentiment label - confirm against the dataset description.
int((test[0] == 2).sum())

0

In [70]:
# Text stored in column 5 of the first row (single .loc lookup: row label 0, column label 5).
test.loc[0, 5]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [129]:
# Tokenize the text in column 5 of rows 0 and 1: strip punctuation, lower-case,
# then split on whitespace.
sent1, sent2 = [strip_punc(test.loc[row, 5]).split() for row in (0, 1)]
print(sent2)

['is', 'upset', 'that', 'he', 'cant', 'update', 'his', 'facebook', 'by', 'texting', 'it', 'and', 'might', 'cry', 'as', 'a', 'result', 'school', 'today', 'also', 'blah']


In [132]:
# Replace each token list with its embedding matrix, then stack the two matrices
# along a new leading (batch) axis.
sent1, sent2 = get_embedding(sent1), get_embedding(sent2)
overall = np.stack((sent1, sent2), axis=0)
overall.shape

(2, 200, 80)

In [133]:
# Shape of a single embedded sentence (sent1 now holds the array returned by get_embedding).
sent1.shape

(200, 80)

In [75]:
# Check whether the token 'switchfoot' (a user handle in the first tweet) is in the GloVe vocabulary.
'switchfoot' in glove

True

In [84]:
# Look up the embedding vector stored for the token 'link'.
glove['link']

array([ 5.1358e-01,  5.2314e-01, -8.5142e-02, -7.1516e-02, -1.4380e-01,
       -1.0719e+00,  5.7060e-01, -3.7303e-01,  2.6237e-01, -6.4902e-02,
        6.1987e-01,  5.2580e-01,  2.4914e-01, -3.7741e-01, -3.1725e-01,
       -1.9673e-01,  4.1449e-01, -2.7580e-01,  3.3495e-01, -4.7852e-02,
       -5.5645e-01,  4.1157e-01, -6.4329e-01,  5.1918e-01, -6.0383e-01,
       -8.2681e-01,  5.3940e-01,  2.9415e-01, -2.1578e-01,  5.2302e-01,
        4.3424e-01, -6.5430e-02, -3.6149e-01, -3.1745e-01,  7.7248e-02,
       -3.8573e-01,  4.9676e-01, -6.4950e-01, -4.6011e-01, -4.8237e-01,
       -9.2141e-01,  7.5239e-01, -4.9764e-01,  6.8488e-01, -1.6132e-01,
        3.6804e-01, -7.0045e-01, -5.3460e-01, -2.1977e-01,  3.5038e-01,
       -2.6494e-01,  9.9057e-02, -2.9904e-01, -4.6367e-01,  6.1319e-02,
       -6.6864e-01, -6.5722e-01,  4.2323e-01,  4.1458e-01, -7.8972e-02,
       -1.5018e-01, -6.8248e-02,  2.1984e-01,  1.0073e+00, -2.0391e-01,
        3.5555e-01, -1.8128e-01, -6.6556e-02,  4.9067e-01, -1.58

In [85]:
# Tokens the model ranks as most similar to 'link', with their similarity scores.
glove.most_similar('link')

[('links', 0.7532831430435181),
 ('click', 0.6905517578125),
 ('check', 0.6621154546737671),
 ('site', 0.6570886969566345),
 ('email', 0.6555554866790771),
 ('download', 0.6531580686569214),
 ('post', 0.6486697196960449),
 ('website', 0.6470268964767456),
 ('video', 0.6336097121238708),
 ('page', 0.6301894783973694)]

In [89]:
# Tokens the model ranks as most similar to 'http', with their similarity scores.
glove.most_similar('http')

[('htt', 0.7486403584480286),
 ('https', 0.6764466166496277),
 ('…', 0.6189000606536865),
 ('ht', 0.6136857271194458),
 ('<url>', 0.577113151550293),
 ('cont', 0.551079511642456),
 ('via', 0.5155504941940308),
 ('nhttp', 0.5139539241790771),
 ('.', 0.5073135495185852),
 ('/', 0.48933911323547363)]

In [90]:
# Tokens most similar to '<url>', which appears to be a placeholder token in this
# vocabulary (the results also contain '<hashtag>' and '<user>') - TODO confirm.
glove.most_similar('<url>')

[('via', 0.7011606097221375),
 ('<hashtag>', 0.6496855020523071),
 ('<allcaps>', 0.6055645942687988),
 ('ht', 0.6022868752479553),
 ('>', 0.5933758020401001),
 ('video', 0.587887167930603),
 ('vía', 0.5816200971603394),
 ('http', 0.577113151550293),
 ('<user>', 0.5766465663909912),
 ('…', 0.5734578967094421)]

In [102]:
# Number of rows whose column-5 text contains the substring 'http'.
sum(1 for tweet in test[5] if 'http' in tweet)

70183

In [123]:
# Largest number of space-separated pieces in any column-5 text. split(' ') also yields
# empty strings between consecutive spaces, so this is an upper bound on the word count.
max(len(tweet.split(' ')) for tweet in test[5])

110