In [93]:
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
from base import BaseDataLoader
import pandas as pd
import gzip
import numpy as np
import re
import pickle
import visdom
import random
from gensim.parsing.preprocessing import remove_stopwords

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable

In [3]:
from skorch import NeuralNetClassifier
import torch.optim as optim

In [199]:
class AmznDataLoader():
        def __init__(self):
            df = self.getDF('./data/reviews_Amazon_Instant_Video_5.json.gz')
    #         display(self.df)
    #         print(self.df.loc[0][['overall','reviewText']])
            df = df[['reviewText', 'overall']]
            df['reviewText'] = df['reviewText'].apply(lambda x : self.title_parsing(x))
            self.X = df['reviewText']
            self.y = df['overall']-1
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
            
        def title_parsing(self, title):  
            # remove stop words and tokenization 
            title = re.sub('[^a-zA-Z]', ' ', str(title))
            title = title.lower()
#            title = remove_stopwords(title)  ## remove stop words, corpus size 52680            
            title = title.split()
#            title = [word for word in title if len(word) >1 ]
            return title
        
        
        def parse(self, path):
            g = gzip.open(path, 'rb')
            for l in g:
                yield eval(l)
            
        def getDF(self, path):
            i = 0
            df = {}
            for d in self.parse(path):
                df[i] = d
                i += 1
            return pd.DataFrame.from_dict(df, orient='index')        

        def buildCorpus(self):
            '''
            return a dictionary with 'word' and its index in corpus as key and value respectively
            '''
            word2idx = {}
            idx2word = [] ## alternatively use if.. condition
            idx = 0 
            for row in self.X:
                for word in row:
                    if word not in word2idx:
                        idx2word.append(word)                
                        word2idx[word] = len(idx2word) - 1
#             pickle.dump(word2idx, open('./data/corpusDict.txt', 'wb'))
            return word2idx   
        
    
        def indicesMatrix(self):
            '''
            return matrix such that review text transformed to index
            '''
            word2idx = self.buildCorpus()
#             word2idx = pickle.load(open('./data/corpusDict.txt', 'rb'))
            ## 53008 words in corpus
    
            corpusSize = len(word2idx) 
        
            maxNumberWords = sorted(len(x) for x in self.X)[-1]
            print ("maximum", maxNumberWords)

            index_matrix = np.zeros((self.X.shape[0], maxNumberWords))          
            for i, row in enumerate(self.X):
                for j, word in enumerate(row):
#                     try:
#                         index_matrix[i,j] = word2idx[word]
#                         words_found += 1
#                     except KeyError:
#                         index_matrix[i,j] = corpusSize     

                    index_matrix[i,j] = word2idx[word]
            return index_matrix
        

        #             self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)            



In [200]:
data_loader = AmznDataLoader()

In [201]:
data_loader.indicesMatrix()

maximum 3068


array([[0.00e+00, 1.00e+00, 2.00e+00, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 2.80e+01, 2.90e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [1.50e+01, 4.50e+01, 1.70e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       ...,
       [0.00e+00, 1.09e+03, 7.90e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [1.32e+02, 4.50e+01, 3.67e+02, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 2.07e+02, 6.50e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00]])

In [202]:
def GloveMatrix():
    '''
    return matrix contains embedding for word in corpus/review text
    Note that the word cannot be found in the glove returns ?? as embedding
    '''
    
    ## load glove files into dictionary 
#     words = []
#     idx = 0
#     word2idx = {}
    # vectors = bcolz.carray(np.zeros(1), rootdir=f'./data/glove_6B_50.dat', mode='w')
#     vectors = []
    glove = {}
    
    with open(f'./data/glove.6B.50d.txt', 'rb') as f:
        for l in f:
            line = l.decode().split()           
            word = line[0]
#            words.append(word)
#             word2idx[word] = idx
#            idx += 1
            vect = np.array(line[1:]).astype(np.float)
#             vectors.append(vect)
#     vectors = np.reshape(vectors, (400000, 50))
            glove.update({word:vect})
#     glove = {word2idx[w]: vectors[word2idx[w]] for w in words}  # alternatively generate weights_matrix directly

    target_vocab = data_loader.buildCorpus()
    exceptKey = list(set(list(glove.keys())).difference(list(target_vocab.keys())))  ## 
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, 50))
    words_found = 0

    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = glove[random.choice(exceptKey)]
    
    return  weights_matrix

In [203]:
print(glove[random.choice(list(glove.keys()))])
print(len(glove))

[ 0.15761  -0.68234   0.92048  -0.3578    0.39582  -0.13965  -0.96575
  1.0044   -0.56677  -0.11254  -0.29893   0.57924  -0.56421  -0.98504
 -1.0391   -0.17326   0.60354   0.26138   0.24561  -0.22561  -0.75672
 -0.62936   0.10151   0.08579  -0.57522   0.55916   0.89052   1.0052
  0.11944  -0.6231   -0.77928  -0.23602   0.89458   0.59554   1.1331
  0.099033 -1.0596   -0.29685   1.1037   -0.74916   0.034701  0.21993
 -1.4328   -1.0163    0.071944 -1.237     0.96203   0.66187   0.46418
  0.87093 ]
400000


In [204]:
m = pickle.load(open('./data/GloveMatrix.npy', 'rb'))

In [205]:
weights_matrix = GloveMatrix()


In [206]:
torch.from_numpy(weights_matrix)

tensor([[ 0.1189,  0.1525, -0.0821,  ..., -0.5751, -0.2667,  0.9212],
        [ 0.6035, -0.5210,  0.4085,  ..., -0.1514, -0.2206, -0.5963],
        [-0.3240,  0.3079,  0.2859,  ..., -0.0910, -0.1337,  0.4730],
        ...,
        [ 0.1415, -0.3281, -0.1360,  ..., -0.2500, -0.3862, -0.8667],
        [ 0.1372, -0.2024, -0.8186,  ...,  0.2502, -0.2155, -0.4198],
        [ 0.0879, -0.2951, -0.0403,  ...,  0.4854, -0.0851, -0.4059]],
       dtype=torch.float64)

In [170]:
data_loader.indicesMatrix().shape

(37126, 1501)

In [286]:
# todo reset parameters for linear layer
class FirstModel(nn.Module):
    def __init__(self):
        super(FirstModel, self).__init__()
        weights_matrix = GloveMatrix()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(weights_matrix).double())
#        self.embedding = nn.Embedding(weights_matrix.shape[0],weights_matrix.shape[1])
#         self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        self.drop1 = nn.Dropout(p=0.5)
        self.conv1 = nn.Conv1d(in_channels=3068, out_channels=200, kernel_size=4, padding=2).double()
        self.conv2 = nn.Conv1d(in_channels=3068, out_channels=200, kernel_size=5, padding=2).double()
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.drop2 = nn.Dropout(p=0.15)
        self.hx = Variable(torch.randn(1, 256, 100))
        self.rnn = nn.GRU(input_size=400, hidden_size=100, num_layers=1, batch_first=True)  
#        self.fc1 =nn.Linear(in_features=1501*100, out_features = 400)
        self.fc1 =nn.Linear(in_features=1534*100, out_features = 400)
        self.drop3 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(in_features=400, out_features=10)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.drop1(x)
#        print(x)
        x1 = F.relu(self.conv1(x))
#       print(x1)
        x2 = F.relu(self.conv2(x))
        x1 = self.maxpool(x1)
        x2 = self.maxpool(x2)
        x = torch.cat((x1,x2), 1)
        x = self.drop2(x)
        #x = pack_padded_sequence(x, lengths, batch_first=True)
#        x = x.view(256,1501.400)
        x = x.view(256, -1, 400).float()   #### change dimensionality
        output, hidden = self.rnn(x, self.hx)
#         for i in range(10):
#             self.hx = self.rnn(x[i], self.hx)
#             output.append(self.hx)
        x = output.contiguous().view(256,-1)
        x = F.relu(self.fc1(x))
        x = self.drop3(x)
        x = self.fc2(x)
        
        return x
    
    
#     def create_emb_layer(self, weights_matrix, non_trainable=True):
#         num_embeddings, embedding_dim = weights_matrix.shape
#         emb_layer = nn.Embedding(num_embeddings, embedding_dim)
#         emb_layer.load_state_dict({'weight': weights_matrix})
#         if non_trainable:
#             emb_layer.weight.requires_grad = False

#         return emb_layer, num_embeddings, embedding_dim
    

In [287]:
net = NeuralNetClassifier(FirstModel, 
                          max_epochs=5, 
                          lr=0.001, 
                          criterion = nn.CrossEntropyLoss, 
                          optimizer=optim.SGD,
                          optimizer__param_groups=[('momentum', 0.9)],
                          batch_size = 256
                          )

In [288]:
inputs = torch.from_numpy(data_loader.indicesMatrix()).long()
labels = torch.tensor(data_loader.y.values)
net.fit(inputs, labels)

maximum 3068


RuntimeError: size mismatch, m1: [256 x 2500], m2: [153400 x 400] at /opt/conda/conda-bld/pytorch_1550848541656/work/aten/src/TH/generic/THTensorMath.cpp:940

In [194]:
m = data_loader.indicesMatrix()

maximum 1501


In [224]:
y = torch.tensor(data_loader.y.values)

In [225]:
y

tensor([1., 4., 0.,  ..., 2., 3., 2.], dtype=torch.float64)

In [277]:
w = torch.tensor(weights_matrix).double()

In [278]:
w

tensor([[ 0.1189,  0.1525, -0.0821,  ..., -0.5751, -0.2667,  0.9212],
        [ 0.6035, -0.5210,  0.4085,  ..., -0.1514, -0.2206, -0.5963],
        [-0.3240,  0.3079,  0.2859,  ..., -0.0910, -0.1337,  0.4730],
        ...,
        [ 0.1415, -0.3281, -0.1360,  ..., -0.2500, -0.3862, -0.8667],
        [ 0.1372, -0.2024, -0.8186,  ...,  0.2502, -0.2155, -0.4198],
        [ 0.0879, -0.2951, -0.0403,  ...,  0.4854, -0.0851, -0.4059]],
       dtype=torch.float64)

In [268]:
Variable(w.double())

tensor([[ 0.1189,  0.1525, -0.0821,  ..., -0.5751, -0.2667,  0.9212],
        [ 0.6035, -0.5210,  0.4085,  ..., -0.1514, -0.2206, -0.5963],
        [-0.3240,  0.3079,  0.2859,  ..., -0.0910, -0.1337,  0.4730],
        ...,
        [ 0.1415, -0.3281, -0.1360,  ..., -0.2500, -0.3862, -0.8667],
        [ 0.1372, -0.2024, -0.8186,  ...,  0.2502, -0.2155, -0.4198],
        [ 0.0879, -0.2951, -0.0403,  ...,  0.4854, -0.0851, -0.4059]],
       dtype=torch.float64)