In [1]:
import numpy as np
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)

import pandas as pd

from IPython.display import Image
from IPython.core.display import HTML 

PATH = "/home/jma/Desktop/march_embedding/"

import keras
from keras import *
from keras.layers import Embedding, Input,concatenate, Flatten,dot, Dense,LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score,accuracy_score





import torch
import torch.utils.data as utils_data
from torch.autograd import Variable
from torch import optim, nn
from torch.utils.data import Dataset 
import torch.nn.functional as F
from torch.nn.init import xavier_normal, xavier_uniform
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


use_gpu=True


  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#common parameters
MAX_WORDS=10000 #keep top words
MAX_SEQ=200  #max length of sequence 
LSTM_SIZE = 128 #output size of LSTM
EMBEDDING_SIZE=128 #size of embedding 
DENSE_LAYER = 25 #fc1 output size
BATCH_SIZE=64 
EPOCHS=2


<h2>Load and prepare the data</h2>

In [3]:
train=pd.read_csv('/media/jma/DATA/amazon_review_full_csv/train.csv',header=0,nrows=250000,names=['rating','short','fullreview'])
y=train.rating.values

y=np.where(y>3,1,0)

tokenizer=Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train.fullreview)
X=tokenizer.texts_to_sequences(train.fullreview)
X=pad_sequences(X,maxlen=MAX_SEQ,padding="pre") #padding on the left, this seems to matter to pytorch 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)


<h2>KERAS</h2>

In [7]:

embedding_layer = Embedding(output_dim=EMBEDDING_SIZE, input_dim=MAX_WORDS,input_length=MAX_SEQ, mask_zero=True)

x = Input(shape=[MAX_SEQ])
embedding = embedding_layer(x)
lstm_out=LSTM(LSTM_SIZE)(embedding)
fc1=Dense(DENSE_LAYER,activation='relu')(lstm_out)
out=Dense(1,activation="sigmoid")(fc1)

model = Model(inputs=x, outputs=out)

print(model.summary())
model.compile(optimizer=Adam(), loss='binary_crossentropy')

model.fit(x=X_train, y=y_train, batch_size=BATCH_SIZE,epochs=EPOCHS, shuffle=True)







_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 128)          1280000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 25)                3225      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 26        
Total params: 1,414,835
Trainable params: 1,414,835
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7effc19add68>

In [8]:
pred=model.predict(X_test)

print(roc_auc_score(y_test, pred))
accuracy_score(y_test, np.where(pred>0.5,1,0))

0.9207524993337503


0.84844

<h2>Pytorch</h2>
<br/>
https://docs.google.com/presentation/d/17VUX7YXhMkJrqO5gNGh6EE5gzBpY-BF9IrfVKcFIb3A/edit#slide=id.g27e9c2914b_0_460
<br/>
<br/>
https://github.com/hunkim/PyTorchZeroToAll/blob/master/13_1_rnn_classification_basics.py
<br/>
<br/>
https://github.com/yuchenlin/lstm_sentence_classifier/blob/master/LSTM_sentence_classifier_minibatch.py

In [5]:
training_samples = utils_data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
data_loader = utils_data.DataLoader(training_samples, batch_size=64)


In [6]:
class Model(nn.Module):
    def __init__(self, batch_size, hidden_dim, num_embeddings,embedding_dim, dense_layer_dim, use_gpu):
        super(Model, self).__init__()
        
        
        self.hidden_dim = hidden_dim #(output size of the LSTM)
        self.batch_size = batch_size
        self.num_embeddings = num_embeddings
        self.embedding_dim=embedding_dim
        self.dense_layer_dim=dense_layer_dim
        self.use_gpu =use_gpu
        
                
        self.embeddings = nn.Embedding(num_embeddings=self.num_embeddings,embedding_dim=self.embedding_dim)
        
        #hidden size is the output size of the LSTM
        #batch_first – If True, then the input and output tensors are provided as (batch, seq, feature)
        self.lstm  =nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim ,batch_first=True )
        self.fc1   = nn.Linear(in_features=self.hidden_dim, out_features=self.dense_layer_dim)
        self.fc2   = nn.Linear(in_features=self.dense_layer_dim,out_features=1)
         

    def forward(self, x):
        batch_size = x.size(0)

        # Make a hidden
        hidden = self.init_hidden(batch_size)
        
        x=self.embeddings(x)
        
        lstm_out, hidden = self.lstm(x, hidden) #hidden is a tuple
   
        #I think this is correct and is getting all the batches, the last output and the full length of the hidden dim
        x  = F.relu(self.fc1(lstm_out[:,-1,:]))
        out = F.sigmoid(self.fc2(x))
       
        return out
    

        
        # Initialize hidden and cell states
    def init_hidden(self, batch_size):
        # the first is the hidden h
        # the second is the cell  c
        
        #both are size (num_layers * num_directions, batch, hidden_size)
        
        h=Variable(torch.zeros(1, batch_size, self.hidden_dim))
        c=Variable(torch.zeros(1, batch_size, self.hidden_dim))
        
        if self.use_gpu:
            h,c = h.cuda(), c.cuda() 
        
        return (h,c)
        


In [7]:
model=Model(batch_size=BATCH_SIZE, 
        hidden_dim= LSTM_SIZE, 
        num_embeddings = MAX_WORDS,
        embedding_dim=EMBEDDING_SIZE, 
        dense_layer_dim=DENSE_LAYER,
        use_gpu =use_gpu
        )

if use_gpu:
    model.cuda()
print(model)


optimizer = optim.Adam(model.parameters(),lr=1e-3,betas=(0.9, 0.999))
# create a loss function (mse)
loss = nn.BCELoss()

# run the main training loop
hold_loss=[]

for epoch in range(EPOCHS):
    cum_loss=0.
    for batch_idx, (data, target) in enumerate(data_loader):
        #EMBEDDING NEEDS INPUT THAT IS A LONG 
        tr_x, tr_y = Variable(data.long()), Variable(target.float().view(target.shape[0],))
        if use_gpu:
            tr_x, tr_y = tr_x.cuda(), tr_y.cuda() 
           
        # Reset gradient
        optimizer.zero_grad()

        # Forward pass
        fx = model(tr_x)
        
        output = loss(fx, tr_y) #loss for this batch
        cum_loss += output.data[0] #accumulate the loss
        
        # Backward 
        output.backward()
        
        # Update parameters based on backprop
        optimizer.step()
        
    hold_loss.append(cum_loss/len(data_loader))    
    #print(epoch+1, cum_loss) #


Model(
  (embeddings): Embedding(10000, 128)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=25, bias=True)
  (fc2): Linear(in_features=25, out_features=1, bias=True)
)


  "Please ensure they have the same size.".format(target.size(), input.size()))


In [8]:
test_samples = utils_data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
data_loader_test = utils_data.DataLoader(test_samples, batch_size=64)

for batch_idx, (data, target) in enumerate(data_loader_test):
    #EMBEDDING NEEDS INPUT THAT IS A LONG 
    tst_x, tst_y = Variable(data.long()), Variable(target.float().view(target.shape[0],))
    if use_gpu:
        tst_x, tst_y = tst_x.cuda(), tst_y.cuda() 
    
    pred=model(tst_x)
    
    if use_gpu:
        pred = pred.cpu().data.numpy()
        tst_y = tst_y.cpu().data.numpy().reshape(tst_y.shape[0],1)

    if batch_idx ==0:
        hold_pred=pred
        hold_actual=tst_y
    
    else:
        hold_pred =np.row_stack([hold_pred,pred])
        hold_actual =np.row_stack([hold_actual,tst_y])

print(roc_auc_score(hold_actual, hold_pred))
accuracy_score(hold_actual, np.where(hold_pred>0.5,1,0))

0.9171270064199741


0.84334

<h2>Pytorch again, this time use packing to mask zero like Keras</h2>
<br/>
<br/>
https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983
<br/>
https://github.com/hunkim/PyTorchZeroToAll/blob/master/13_4_pack_pad.py


In [9]:
#will need to have zero padded like before BUT right padded

train=pd.read_csv('/media/jma/DATA/amazon_review_full_csv/train.csv',header=0,nrows=250000,names=['rating','short','fullreview'])
y=train.rating.values

y=np.where(y>3,1,0)

tokenizer=Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train.fullreview)
X=tokenizer.texts_to_sequences(train.fullreview)
X=pad_sequences(X,maxlen=MAX_SEQ,padding="pre") #padding on the left, this seems to matter to pytorch 

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)


#errors in the sortseq happen with all zero rows
zero_x_train = ~np.all(X_train == 0, axis=1)
zero_x_test = ~np.all(X_test == 0, axis=1)

X_train=X_train[zero_x_train]
y_train=y_train[zero_x_train]

X_test=X_test[zero_x_test]
y_test=y_test[zero_x_test]

In [10]:
X_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [11]:
training_samples = utils_data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
data_loader = utils_data.DataLoader(training_samples, batch_size=64)

In [12]:
def sortseq(x,y):
  #input is (batch,seq_length (padded by zero))  
  
    input_lengths = torch.LongTensor([torch.max(x[i, :].data.nonzero()) + 1 for i in range(x.size()[0])])
    #A tuple of (sorted_tensor, sorted_indices) is returned, where the sorted_indices are the indices of the elements in the original input tensor.
    input_lengths, perm_idx = input_lengths.sort(0, descending=True)
    x = x[perm_idx]
    y= y[perm_idx]

    return (x,y,input_lengths.cpu().numpy())


#sort the batch by decreasing sequence length
#x=np.array([[1, 2,0], [0, 0, 0], [3, 4, 5]])
#x=Variable(torch.from_numpy(x)).long()
#sortseq(x)

#x = x[(x != 0).any()]

#input_lengths=torch.LongTensor([torch.max(x[i,:].data.nonzero()) +1 for i in range((x.size()[0]))]) #loops through each instance in x and records the index 
#input_lengths, perm_idx = input_lengths.sort(0, descending=True)

#g=x[perm_idx]
#g


#h=nn.Embedding(10,2)
#h=h(g)
#pack=pack_padded_sequence(h, input_lengths.cpu().numpy())
#pack

In [13]:
class Model(nn.Module):
    def __init__(self, batch_size, hidden_dim, num_embeddings,embedding_dim, dense_layer_dim, use_gpu):
        super(Model, self).__init__()
        
        
        self.hidden_dim = hidden_dim #(output size of the LSTM)
        self.batch_size = batch_size
        self.num_embeddings = num_embeddings
        self.embedding_dim=embedding_dim
        self.dense_layer_dim=dense_layer_dim
        self.use_gpu =use_gpu
        
       
                
        self.embeddings = nn.Embedding(num_embeddings=self.num_embeddings,embedding_dim=self.embedding_dim)
        
        #hidden size is the output size of the LSTM
        #batch_first – If True, then the input and output tensors are provided as (batch, seq, feature)
        self.lstm  =nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim ,batch_first=True )
        self.fc1   = nn.Linear(in_features=self.hidden_dim, out_features=self.dense_layer_dim)
        self.fc2   = nn.Linear(in_features=self.dense_layer_dim,out_features=1)
         

    def forward(self, x, input_lengths): #THIS IS NEW FOR MASKIN
        batch_size = x.size(0)

        # Make a hidden
        hidden = self.init_hidden(batch_size)
        
                
        x=self.embeddings(x)
        
        #THIS IS NEW FOR MASKING#####################################################
        #Pack the zero padded and sorted embedding
        x = pack_padded_sequence(x, input_lengths.data.cpu().numpy(), batch_first=True)
        
        #################################################################
        
        
        
        lstm_out, hidden = self.lstm(x, hidden) #hidden is a tuple
   

        #THIS IS NEW FOR MASKING#####################################################
        lstm_out, _ = pad_packed_sequence(lstm_out,batch_first=True)
        
        
        
        #################################################################
        
    
    
    
        #I think this is correct and is getting all the batches, the last output and the full length of the hidden dim
        x  = F.relu(self.fc1(lstm_out[:,-1,:]))
        out = F.sigmoid(self.fc2(x))
        
        return out
    

        
        # Initialize hidden and cell states
    def init_hidden(self, batch_size):
        # the first is the hidden h
        # the second is the cell  c
        
        #both are size (num_layers * num_directions, batch, hidden_size)
        
        h=Variable(torch.zeros(1, batch_size, self.hidden_dim))
        c=Variable(torch.zeros(1, batch_size, self.hidden_dim))
        
        if self.use_gpu:
            h,c = h.cuda(), c.cuda() 
        
        return (h,c)
        

In [14]:
model=Model(batch_size=BATCH_SIZE, 
        hidden_dim= LSTM_SIZE, 
        num_embeddings = MAX_WORDS,
        embedding_dim=EMBEDDING_SIZE, 
        dense_layer_dim=DENSE_LAYER,
        use_gpu =use_gpu
        )

if use_gpu:
    model.cuda()
print(model)


optimizer = optim.Adam(model.parameters(),lr=1e-3,betas=(0.9, 0.999))
# create a loss function (mse)
loss = nn.BCELoss()

# run the main training loop
hold_loss=[]

for epoch in range(EPOCHS):
    cum_loss=0.
    for batch_idx, (data, target) in enumerate(data_loader):
        #EMBEDDING NEEDS INPUT THAT IS A LONG 
        tr_x, tr_y = Variable(data.long()), Variable(target.float().view(target.shape[0],))
        
        #THIS IS NEW FOR MASKING#####################################################
        #pack
        tr_x, tr_y,input_lengths =sortseq(tr_x,tr_y)

        input_lengths=Variable(torch.from_numpy(input_lengths))
        ################################################################
        
        if use_gpu:
            tr_x, tr_y = tr_x.cuda(), tr_y.cuda() 
           
        # Reset gradient
        optimizer.zero_grad()
       
        # Forward pass
        fx = model(tr_x,input_lengths) #THIS IS NEW FOR MASKING
        
        output = loss(fx, tr_y) #loss for this batch
        cum_loss += output.data[0] #accumulate the loss
        
        # Backward 
        output.backward()
        
        # Update parameters based on backprop
        optimizer.step()
          
    hold_loss.append(cum_loss/len(data_loader))    
    #print(epoch+1, cum_loss) #

Model(
  (embeddings): Embedding(10000, 128)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=25, bias=True)
  (fc2): Linear(in_features=25, out_features=1, bias=True)
)


  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


In [15]:
#to score test do we need to pack / unpack?

In [16]:
test_samples = utils_data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
data_loader_test = utils_data.DataLoader(test_samples, batch_size=64)

for batch_idx, (data, target) in enumerate(data_loader_test):
    #EMBEDDING NEEDS INPUT THAT IS A LONG 
    tst_x, tst_y = Variable(data.long()), Variable(target.float().view(target.shape[0],))
    
    #THIS IS NEW FOR MASKING#####################################################
    #pack
    tst_x, tst_y,input_lengths =sortseq(tst_x,tst_y)

    input_lengths=Variable(torch.from_numpy(input_lengths))
    ################################################################
    
    
    if use_gpu:
        tst_x, tst_y = tst_x.cuda(), tst_y.cuda() 
    
    pred=model(tst_x, input_lengths)#THIS IS NEW FOR MASKING#####################################################
    
    if use_gpu:
        pred = pred.cpu().data.numpy()
        tst_y = tst_y.cpu().data.numpy().reshape(tst_y.shape[0],1)

    if batch_idx ==0:
        hold_pred=pred
        hold_actual=tst_y
    
    else:
        hold_pred =np.row_stack([hold_pred,pred])
        hold_actual =np.row_stack([hold_actual,tst_y])

print(roc_auc_score(hold_actual, hold_pred))
accuracy_score(hold_actual, np.where(hold_pred>0.5,1,0))

0.9147818740967897


0.83544

In [10]:
jeff=np.random.randint(1,20,(25,6,10))
jeff

array([[[18, 15, 16, ..., 12,  4, 15],
        [15, 19, 19, ..., 14,  2, 17],
        [ 1,  1, 12, ...,  6,  9,  7],
        [13,  2,  2, ...,  9,  4, 16],
        [ 7,  4, 16, ..., 16, 17,  5],
        [13,  1,  3, ..., 11, 17, 15]],

       [[ 8,  4, 14, ..., 19, 14,  7],
        [ 9,  8, 19, ...,  8, 19,  9],
        [ 3, 19, 15, ..., 13,  6, 15],
        [ 6,  6, 14, ..., 14, 12,  7],
        [18,  2,  1, ...,  7, 16, 14],
        [ 7, 15, 11, ..., 16,  9, 12]],

       [[12, 11,  9, ...,  6,  8,  5],
        [ 2, 19,  8, ...,  6,  4,  4],
        [19, 19,  6, ..., 13,  6, 12],
        [16,  5, 14, ...,  4, 17, 11],
        [11,  1,  7, ..., 14, 10,  2],
        [ 4,  5,  2, ...,  2, 14,  7]],

       ...,

       [[15,  6, 13, ..., 17,  7,  2],
        [17,  3, 18, ..., 18,  8, 15],
        [ 3, 19, 17, ..., 14,  7,  1],
        [ 8, 13, 17, ...,  1, 15,  5],
        [19,  4, 12, ..., 11,  2,  4],
        [13, 12,  9, ...,  2, 14, 15]],

       [[ 6, 15, 14, ...,  9,  6, 10],
    

In [5]:
import torch.nn.utils.rnn
pad_sequences(jeff,padding='post')

array([[1, 2, 3],
       [4, 5, 6],
       [5, 3, 0]], dtype=int32)