In [1]:
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
from base import BaseDataLoader
import pandas as pd
import gzip
import numpy as np
import re
import pickle
import visdom
import random
from gensim.parsing.preprocessing import remove_stopwords



In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable

In [3]:
from skorch import NeuralNetClassifier
import torch.optim as optim

In [4]:
class AmznDataLoader():
        def __init__(self):
            df = self.getDF('./data/reviews_Amazon_Instant_Video_5.json.gz')
    #         display(self.df)
    #         print(self.df.loc[0][['overall','reviewText']])
            df = df[['reviewText', 'overall']]
            df['reviewText'] = df['reviewText'].apply(lambda x : self.title_parsing(x))
            self.X = df['reviewText']
            self.y = df['overall']-1
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
            
        def title_parsing(self, title):  
            # remove stop words and tokenization 
            title = re.sub('[^a-zA-Z]', ' ', str(title))
            title = title.lower()
            #title = remove_stopwords(title)  ## remove stop words, corpus size 52680            
            title = title.split()
            title = [word for word in title if len(word) >1 ]
            return title
        
        
        def parse(self, path):
            g = gzip.open(path, 'rb')
            for l in g:
                yield eval(l)
            
        def getDF(self, path):
            i = 0
            df = {}
            for d in self.parse(path):
                df[i] = d
                i += 1
            return pd.DataFrame.from_dict(df, orient='index')        

        def buildCorpus(self):
            '''
            return a dictionary with 'word' and its index in corpus as key and value respectively
            '''
            word2idx = {}
            idx2word = [] ## alternatively use if.. condition
            idx = 0 
            for row in self.X:
                for word in row:
                    if word not in word2idx:
                        idx2word.append(word)                
                        word2idx[word] = len(idx2word) - 1
#             pickle.dump(word2idx, open('./data/corpusDict.txt', 'wb'))
            return word2idx   
        
    
        def indicesMatrix(self):
            '''
            return matrix such that review text transformed to index
            '''
            word2idx = self.buildCorpus()
#             word2idx = pickle.load(open('./data/corpusDict.txt', 'rb'))
            ## 53008 words in corpus
    
            corpusSize = len(word2idx) 
        
            maxNumberWords = sorted(len(x) for x in self.X)[-1]
            print ("maximum", maxNumberWords)

            index_matrix = np.zeros((self.X.shape[0], maxNumberWords))          
            for i, row in enumerate(self.X):
                for j, word in enumerate(row):
#                     try:
#                         index_matrix[i,j] = word2idx[word]
#                         words_found += 1
#                     except KeyError:
#                         index_matrix[i,j] = corpusSize     

                    index_matrix[i,j] = word2idx[word]
            return index_matrix
        

        #             self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)            



In [5]:
data_loader = AmznDataLoader()

In [6]:
data_loader.indicesMatrix()

maximum 2937


array([[0.000e+00, 1.000e+00, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.600e+01, 2.700e+01, 1.400e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.400e+01, 4.200e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [7.500e+01, 4.900e+01, 1.067e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.280e+02, 4.200e+01, 3.630e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.030e+02, 6.100e+01, 4.870e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [7]:
def GloveMatrix():
    '''
    return matrix contains embedding for word in corpus/review text
    Note that the word cannot be found in the glove returns ?? as embedding
    '''
    
    ## load glove files into dictionary 
#     words = []
#     idx = 0
#     word2idx = {}
    # vectors = bcolz.carray(np.zeros(1), rootdir=f'./data/glove_6B_50.dat', mode='w')
#     vectors = []
    glove = {}
    
    with open(f'./data/glove.6B.50d.txt', 'rb') as f:
        for l in f:
            line = l.decode().split()           
            word = line[0]
#            words.append(word)
#             word2idx[word] = idx
#            idx += 1
            vect = np.array(line[1:]).astype(np.float)
#             vectors.append(vect)
#     vectors = np.reshape(vectors, (400000, 50))
            glove.update({word:vect})
#     glove = {word2idx[w]: vectors[word2idx[w]] for w in words}  # alternatively generate weights_matrix directly

    target_vocab = data_loader.buildCorpus()
    #except
    #exceptKey = list(set(list(glove.keys())).difference(list(target_vocab.keys())))  ## 
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, 50))
    words_found = 0
    words_not_found = 0
    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            words_not_found += 1
            weights_matrix[i] = np.random.normal(scale=0.6, size=(50,))
    print(words_not_found)
    return  weights_matrix

In [8]:
weight_matrix = GloveMatrix()
print(weight_matrix[0])
print(weight_matrix.shape)

9777
[ 0.60348  -0.52096   0.40851  -0.37217   0.36978   0.61082  -1.3228
  0.24375  -0.5942   -0.35708   0.39942   0.031911 -1.0643   -0.52327
  0.71453   0.063384 -0.46383  -0.34641  -0.72445  -0.13714  -0.19179
  0.72225   0.6295   -0.8086   -0.037694 -2.0355    0.10566  -0.038591
 -0.23201  -0.29627   3.3215    0.032443  0.085368 -0.40771   0.45341
 -0.099674  0.44704   0.5422    0.18185   0.17504  -0.33833   0.31697
 -0.025268  0.095795 -0.25071  -0.47564  -1.0407   -0.15138  -0.22057
 -0.59633 ]
(52982, 50)


In [9]:
pickle.dump(weight_matrix,  open('./data/GloveMatrix.npy', 'wb'))

In [10]:
data_loader.indicesMatrix().shape

maximum 2937


(37126, 2937)

In [17]:
# todo reset parameters for linear layer
class FirstModel(nn.Module):
    def __init__(self, max_length):
        super(FirstModel, self).__init__()
        weights_matrix = GloveMatrix()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(weights_matrix), freeze=False)
#        self.embedding = nn.Embedding(weights_matrix.shape[0],weights_matrix.shape[1])
#         self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        self.drop1 = nn.Dropout(p=0.5)
        self.conv1 = nn.Conv1d(in_channels=50, out_channels=200, kernel_size=4, padding=1).double()
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=200, kernel_size=5, padding=1).double()
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.drop2 = nn.Dropout(p=0.15)
        self.hx = Variable(torch.randn(1, 256, 100))
        self.rnn = nn.GRU(input_size=400, hidden_size=100, num_layers=1, batch_first=True)  
#        self.fc1 =nn.Linear(in_features=1501*100, out_features = 400)
        self.fc1 =nn.Linear(in_features=1534*100, out_features = 400)
        self.drop3 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(in_features=400, out_features=10)
        
    def forward(self, x):
        print("input tensor", x.size())
        x = self.embedding(x)
        print("after embedding", x.size())
        x = self.drop1(x)
#        print(x)

        x = x.view(256,50,-1)  # input(N,C_in,L) to conv1d
        x1 = F.relu(self.conv1(x))
        print("x1 shape", x1.size())
        x2 = F.relu(self.conv2(x))
        print("x2 shape ", x2.size())

        x1 = self.maxpool(x1)
        print("x1 shape", x1.size())        
        x2 = self.maxpool(x2)
        print("x2 shape", x2.size())        
        x = torch.cat((x1,x2), 2)
        x = self.drop2(x)
        #x = pack_padded_sequence(x, lengths, batch_first=True)
#        x = x.view(256,1501,400)
        x = x.view(256, -1, 400).float()#### change dimensionality
        output, hidden = self.rnn(x, self.hx)
#         for i in range(10):
#             self.hx = self.rnn(x[i], self.hx)
#             output.append(self.hx)
        x = output.contiguous().view(256,-1)
        print(x.size())
        x = F.relu(self.fc1(x))
        x = self.drop3(x)
        x = self.fc2(x)
        return x
    
    
#     def create_emb_layer(self, weights_matrix, non_trainable=True):
#         num_embeddings, embedding_dim = weights_matrix.shape
#         emb_layer = nn.Embedding(num_embeddings, embedding_dim)
#         emb_layer.load_state_dict({'weight': weights_matrix})
#         if non_trainable:
#             emb_layer.weight.requires_grad = False

#         return emb_layer, num_embeddings, embedding_dim
    

In [18]:
net = NeuralNetClassifier(FirstModel,
                          module__max_length=data_loader.indicesMatrix().shape[1],
                          max_epochs=5, 
                          lr=0.001, 
                          criterion = nn.CrossEntropyLoss, 
                          optimizer=optim.SGD,
                          optimizer__param_groups=[('momentum', 0.9)],
                          batch_size = 256
                          )

maximum 2937


In [None]:
inputs = torch.from_numpy(data_loader.indicesMatrix()).long()
labels = torch.tensor(data_loader.y)
net.fit(inputs, labels)

maximum 2937
9777
input tensor torch.Size([256, 2937])
after embedding torch.Size([256, 2937, 50])
x1 shape torch.Size([256, 200, 2936])
x2 shape  torch.Size([256, 200, 2935])
x1 shape torch.Size([256, 200, 1468])
x2 shape torch.Size([256, 200, 1467])


In [None]:
m = data_loader.indicesMatrix()

In [None]:
y = torch.tensor(data_loader.y.values)

In [None]:
y

In [None]:
w = torch.tensor(weights_matrix).double()

In [None]:
w

In [None]:
Variable(w.double())