In [1]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from collections import defaultdict, UserDict
import mygrad as mg
from pathlib import Path
import pandas as pd
import re, string

from noggin import create_plot
%matplotlib notebook

In [None]:
unzipped_folder = "glove.twitter.27B/" # ENTER THE PATH TO THE UNZIPPED `glove.twitter.27B` HERE

# Build the paths once. Joining with `Path` works whether or not `unzipped_folder`
# ends with a path separator (plain string concatenation silently required one).
glove_source_file = Path(unzipped_folder) / "glove.twitter.27B.200d.txt"
word2vec_file = Path("gensim_glove_vectors_200.txt")

# use glove2word2vec to convert GloVe vectors in text format into the word2vec text format;
# the converted file is kept on disk, so the conversion only runs when it is missing
if not word2vec_file.exists():
    # assumes you've downloaded and extracted the glove stuff
    glove2word2vec(glove_input_file=str(glove_source_file),
                   word2vec_output_file=str(word2vec_file))

# read the word2vec txt to a gensim model using KeyedVectors
glove = KeyedVectors.load_word2vec_format(str(word2vec_file), binary=False)

In [None]:
from mynn.layers.conv import conv
from mynn.layers.dense import dense
from mynn.activations.relu import relu
from mygrad.nnet.layers import max_pool
from mynn.activations.sigmoid import sigmoid
from mynn.initializers.glorot_normal import glorot_normal
from mynn.optimizers.adam import Adam

class Model:
    """ Sentence classifier: 1-D convolution -> ReLU -> global max-pool -> dense -> ReLU -> dense -> sigmoid. """

    def __init__(self):
        """ Initializes model layers and weights. """
        # <COGINST>
        # the same initializer settings are shared by every layer
        init_kwargs = {'gain': np.sqrt(2)}
        # 200 input channels (embedding size), 250 filters, filter width 2
        self.conv1 = conv(200, 250, 2, stride = 1, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense1 = dense(250, 250, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense2 = dense(250,1, weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        # </COGINST>
    
    
    def __call__(self, x):
        """ Forward data through the network.
        
        This allows us to conveniently initialize a model `m` and then send data through it
        to be classified by calling `m(x)`.
        
        Parameters
        ----------
        x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, S)
            The data to forward through the network.
            
        Returns
        -------
        mygrad.Tensor, shape=(N, 1)
            The model outputs, squashed through a sigmoid.
        
        Notes
        -----
        N = batch size
        D = embedding size
        S = sentence length
        """
        # <COGINST>
        # input: (N, D, S) with D = 200
        x = self.conv1(x) # conv output shape (N, F, S') with F = 250; S' = S - 1 for a width-2 filter with stride 1 (assuming the layer's default of no padding)
        x = relu(x)
        x = max_pool(x, (x.shape[-1],), 1) # pooling window spans the whole sequence axis: (N, F, S') -> (N, F, 1)
        x = x.reshape(x.shape[0], -1)  # (N, F, 1) -> (N, F)
        x = self.dense1(x) # (N, F) @ (F, D1) = (N, D1)
        x = relu(x) 
        x = self.dense2(x) # (N, D1) @ (D1, 1) = (N, 1)
        x = sigmoid(x)
        return x # output shape (N, 1)
        # </COGINST>
    
    
    @property
    def parameters(self):
        """ A convenience property for getting all the parameters of our model.
        
        Returns the concatenation of the `parameters` of `conv1`, `dense1` and `dense2`,
        in that order.
        """
        return self.conv1.parameters + self.dense1.parameters + self.dense2.parameters # <COGLINE>

In [None]:
def get_embedding(text, max_len=80, dim=200, vectors=None):
    """ Builds a fixed-size embedding matrix for one tokenized sentence.
    
    Tokens that are missing from the vocabulary are skipped. The remaining
    embeddings fill the columns left-to-right; unused columns stay zero and
    tokens beyond `max_len` are dropped, so every call returns the same shape
    and the results can be stacked into a batch.
    
    Parameters
    ----------
    text : Iterable[str]
        The tokens (words) of the sentence.
    
    max_len : int, optional (default=80)
        Number of columns (token slots) in the output.
    
    dim : int, optional (default=200)
        Length of each word vector.
    
    vectors : optional
        Any object supporting `word in vectors` and `vectors.get_vector(word)`.
        Defaults to the notebook-level `glove` model.
    
    Returns
    -------
    numpy.ndarray, shape=(dim, max_len), dtype=float32
        One column per kept token, zero-padded on the right.
    """
    if vectors is None:
        vectors = glove

    # pre-allocating gives a consistent dtype and shape regardless of how many tokens are found
    out = np.zeros((dim, max_len), dtype=np.float32)
    col = 0
    for word in text:
        if col >= max_len:
            break  # sentence is longer than the fixed width: truncate
        if word in vectors:
            out[:, col] = vectors.get_vector(word)
            col += 1
    return out



In [None]:
# character class matching any single character from `string.punctuation`
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))


def strip_punc(text):
    """ Removes all punctuation characters from `text` and lower-cases the result.
    
    Parameters
    ----------
    text : str
        The string to clean.
    
    Returns
    -------
    str
        `text` without any `string.punctuation` characters, in lower case.
    """
    without_punctuation = punc_regex.sub('', text)
    return without_punctuation.lower()

In [None]:
# 'ANSI' is only registered as a codec alias on Windows; elsewhere it raises LookupError.
# 'latin-1' is available on every platform and can decode any byte value.
# The file has no header row, so columns are numbered 0..N-1.
test = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

In [None]:
# number of rows whose first column (the polarity label) equals 2
(test[0] == 2).sum()

In [None]:
# peek at the text column (5) of the first row
test.loc[0, 5]

In [None]:
# clean and whitespace-tokenize the first two tweets
sent1, sent2 = [strip_punc(test[5][row]).split() for row in (0, 1)]
print(sent2)

In [None]:
# replace each token list with its embedding matrix
sent1 = get_embedding(sent1)
sent2 = get_embedding(sent2)
# put the two equally-shaped matrices along a new leading batch axis
overall = np.stack((sent1, sent2))
overall.shape

In [None]:
# cleaned (but not tokenized) text of the first two tweets
sent1, sent2 = [strip_punc(test[5][row]) for row in (0, 1)]

In [None]:
# Quick shape check of `batch_gen` on two sample tweets.
# NOTE(review): `batch_gen` is defined in a later cell, so on a fresh kernel this
# cell only works after that definition has been run.
check = [sent1, sent2]
check2 = batch_gen(check)
check2.shape

In [None]:
def batch_gen(text):
    """ Converts a collection of raw tweets into one array of embeddings.
    
    Each tweet is passed through `strip_punc`, split on whitespace and handed
    to `get_embedding`; the per-tweet results are combined with `np.array`.
    
    Parameters
    ----------
    text : Iterable[str]
        The raw tweets of the batch.
    
    Returns
    -------
    numpy.ndarray
        The `get_embedding` outputs in input order, stacked along a new leading axis.
    """
    embedded = [get_embedding(strip_punc(tweet).split()) for tweet in text]
    return np.array(embedded)

In [None]:
# `sent1` was re-assigned to a cleaned string in an earlier cell, and `str` has no
# `.shape`; embed its tokens first and inspect the shape of that matrix instead
get_embedding(sent1.split()).shape

In [None]:
# how many tweets contain the substring 'http'
sum(1 for tweet in test.values[:, 5] if 'http' in tweet)

In [None]:
# largest number of single-space-separated pieces in any raw tweet
max(len(tweet.split(' ')) for tweet in test[5])

In [None]:
def accuracy(pred, truth):
    """ Computes the fraction of predictions that match the true sentiment.
    
    Scores are rounded with `np.round` before they are compared to `truth`.
    
    Parameters
    ----------
    pred: Union[numpy.ndarray, mygrad.Tensor]
        The predicted sentiment score of each tweet (a float from 0 to 1)
    
    truth: numpy.ndarray
        The true sentiment of each tweet (0 or 1)
    
    Returns
    -------
    float
        The mean of the element-wise matches between rounded scores and `truth`
    """
    # <COGINST>
    # work on the raw array when handed a tensor
    scores = pred.data if isinstance(pred, mg.Tensor) else pred
    matches = np.round(scores) == truth
    return np.mean(matches)
    # </COGINST>

In [None]:
def binary_cross_entropy(y_pred, y_truth):
    """ Computes the mean binary cross-entropy between predictions and labels.
    
    A small constant (1e-08) is added inside each logarithm.
    
    Parameters
    ----------
    y_pred: mg.Tensor
        The predicted scores output from the model
        (the training loop in this notebook passes shape (N,) - TODO confirm for other callers)
    
    y_truth: Union[mg.Tensor, numpy.ndarray]
        A constant Tensor or a NumPy array holding the truth value for each prediction
    
    Returns
    -------
    mg.Tensor, shape=()
        A zero-dimensional tensor that is the loss
    """
    eps = 1e-08
    positive_term = y_truth * mg.log(y_pred + eps)
    negative_term = (1 - y_truth) * mg.log(1 - y_pred + eps)
    return -mg.mean(positive_term + negative_term) # <COGLINE>

In [None]:
# shuffle the rows once, then use the first 4/5 for training and the rest for testing
values = test.values
np.random.shuffle(values)
slice_length = 4 * len(values) // 5

# column 0 holds the polarity label; re-map label 4 to 1
polarity = values[:, 0].astype('int16')
polarity[polarity == 4] = 1

# column 5 holds the tweet text
text = values[:, 5]

pol_train, pol_test = polarity[:slice_length], polarity[slice_length:]
text_train, text_test = text[:slice_length], text[slice_length:]

In [None]:
# display the label array built in the split cell
polarity

In [None]:
# display the first column (the polarity labels) of the data frame's values
vals = test.values
vals[:,0]

In [None]:
# fresh model, an Adam optimizer over all of its parameters, and a live plot
# that tracks the "loss" and "accuracy" metrics
model = Model()
optim = Adam(model.parameters, learning_rate = 1e-4)
plotter, fig, ax = create_plot(metrics=["loss", "accuracy"])

In [None]:
# smoke-test the forward pass on two training tweets; `embeddings` is not defined
# until the training loop below, so build a small batch here instead
model(batch_gen(text_train[:2]))

In [None]:
batch_size = 100

for epoch_cnt in range(2):
    # visit the training tweets in a fresh random order every epoch
    idxs = np.arange(len(text_train))
    np.random.shuffle(idxs)
       
    # any leftover tweets that do not fill a whole batch are skipped
    for batch_cnt in range(len(text_train)//batch_size):
        # shuffled indices of this batch; kept so the matching labels can be fetched below
        batch_indices = idxs[batch_cnt * batch_size : (batch_cnt + 1) * batch_size]
        batch = text_train[batch_indices]  # random batch of our training data
        
        # retrieve glove embeddings for batch
        embeddings = batch_gen(batch)
        
        # pass model through batch and perform gradient descent
        # <COGINST>
        pred = model(embeddings)
        truth = pol_train[batch_indices]
        
        # the model outputs shape (N, 1); drop the trailing axis to line up with `truth`
        loss = binary_cross_entropy(pred[:,0], truth)
        acc = accuracy(pred[:,0], truth)
        loss.backward()

        optim.step()
        loss.null_gradients()
        
        # </COGINST>
        
        # pass loss and accuracy to noggin for plotting
        plotter.set_train_batch({"loss" : loss.item(),
                                 "accuracy" : acc},
                                 batch_size=batch_size)
    
    
    # compute test statistics (order does not matter here, so batches are taken sequentially)
    for batch_cnt in range(0, len(text_test) // batch_size):
        batch_indices = slice(batch_cnt * batch_size, (batch_cnt + 1) * batch_size)
        batch = text_test[batch_indices]
        
        test_embeddings = batch_gen(batch)
        
        # perform forward pass and find accuracy but DO NOT backprop
        # <COGINST>
        pred = model(test_embeddings)
        truth = pol_test[batch_indices]
        acc = accuracy(pred[:,0], truth)
        # </COGINST>

        # log the test-accuracy in noggin
        plotter.set_test_batch({"accuracy" : acc},
                                 batch_size=batch_size)
   
    # plot the epoch-level train/test statistics
    plotter.set_train_epoch()
    plotter.set_test_epoch()

In [None]:
# `batch_gen` iterates over its argument, so a single tweet must be wrapped in a list;
# passing the bare string would treat every character as a separate tweet
print(text_test[5], pol_test[5], model(batch_gen([text_test[5]])))

In [None]:
 def save_model(model, path):
     """Writes the model's parameter arrays to a file with `np.savez`.

     Parameters
     ----------
     model
         Any object whose `parameters` attribute is an iterable of objects
         exposing their array as `.data`.

     path
         Path to the file where the model parameters will be saved.
     """
     with open(path, "wb") as out_file:
         # positional arrays are stored in iteration order
         arrays = [param.data for param in model.parameters]
         np.savez(out_file, *arrays)

In [None]:
# NOTE(review): `save_model` writes with `np.savez`, i.e. an .npz archive, although this
# filename ends in .npy - confirm what any loading code expects before renaming.
save_model(model, 'sentiment model.npy')