In [1]:
from collections import Counter

from pyparsing import WordStart
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import requests
from io import StringIO
import math
import gensim.downloader as api
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec

class PreprocessData:
    def __init__(self,path,lang='english'): 
        self.dataset=pd.read_csv(path, sep=",")
        self.dataset=self.dataset[["content","score"]]
        self.stopwords=stopwords.words(lang)
        self.preprocess()
    
    @staticmethod
    def remove_punctuation(text):
        '''a function for removing punctuation'''
        # replacing the punctuations with no space, 
        # which in effect deletes the punctuation marks 
        translator = str.maketrans('', '', string.punctuation)
        # return the text stripped of punctuation marks
        return text.translate(translator)

    #A function to remove the stopwords
    def remove_stopwords(self,text):
        text = [word.lower() for word in text.split() if word.lower() not in self.stopwords]
        # joining the list of words with space separator
        return " ".join(text)

    def preprocess(self):
        self.dataset.iloc[:,0] = self.dataset.iloc[:,0].apply(self.remove_punctuation)
        self.dataset.iloc[:,0] = self.dataset.iloc[:,0].apply(self.remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
path="/content/reviews.csv"
dataset=PreprocessData(path)

In [3]:
dataset.dataset.head()

Unnamed: 0,content,score
0,cannot open app anymore,1
1,begging refund app month nobody replying,1
2,costly premium version approx indian rupees 91...,1
3,used keep organized 2020 updates made mess thi...,1
4,dan birthday oct 28,1


In [4]:
# wv = api.load('glove-wiki-gigaword-50')

In [5]:
# from gensim.scripts.glove2word2vec import glove2word2vec

In [6]:
# import gensim.models

# # This should take approximately 3 minutes
# model = gensim.models.Word2Vec(sentences=sentences,min_count=1)

In [7]:
# print(list(model.wv.vocab))

In [112]:
class WordRepresentation:
    def __init__(self,dataset):
        self.dataset=dataset
        self.glove={}
        self.word2vec={}
        self.compute_glove()
        self.compute_word2vec()
        self.w2v_final_data=[]
        self.glo_final_data=[]
        self.represent_all_sentences()
    
    #Compute Glove
    def compute_glove(self):
      wv = api.load('glove-wiki-gigaword-50')
      #self.glove["nan"]=(np.zeros(50,),0)
      #word_index=1
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        word_list=document.split(" ")
        for word in word_list:
          if word in wv.vocab:
            self.glove[word]=wv.word_vec(word)
            # self.glove[word]=(wv.word_vec(word),word_index)
            # word_index+=1
    
    @staticmethod
    def tokenize(data): 
      word = []
      for i in range(len(data)):
        new_doc = data[i].split()
        word.append(new_doc)
      return word
    
    def compute_word2vec(self):
      token = WordRepresentation.tokenize(dataset.dataset['content'])
      model = Word2Vec(sentences=token, workers = 1, size = 50, min_count = 1, window = 3)
      # self.word2vec["nan"]=(np.zeros(50,),0)
      # word_index=1
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        word_list=document.split(" ")
        for word in word_list:
          if word in model.wv.vocab:
            self.word2vec[word]=model.wv.word_vec(word)
            # self.word2vec[word]=(model.wv.word_vec(word),word_index)
            # word_index+=1
    
    def sentence_representation(self,sentence,method="word2vec",sentence_length=20,vector_length=50):
      the_sentence=sentence.split(' ')
      if method=="word2vec"or method=="glove":
        if method=="word2vec":
          words_dict=self.word2vec
        else:
          words_dict=self.glove
        
        matrix=np.zeros((sentence_length,vector_length))
        i =0
        for word in the_sentence:
          if word in words_dict:
            matrix[i]=words_dict[word]
            i+=1
          if i>=sentence_length:
            break
        return matrix
    
    # def represent_all_sentences(self,sentence_length=20,vector_length=50):
    #   for i in range(len(self.dataset)):
    #     document=self.dataset.iloc[i,0]
    #     y=self.dataset.iloc[i,1]
    #     x_w2v=self.sentence_representation(document,sentence_length=sentence_length,vector_length=vector_length)
    #     x_glo=self.sentence_representation(document,method="glove",sentence_length=sentence_length,vector_length=vector_length)
    #     self.w2v_final_data.append((x_w2v,y))
    #     self.glo_final_data.append((x_glo,y))
        
    def represent_all_sentences(self,sentence_length=20,vector_length=50):
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        y=torch.tensor([self.dataset.iloc[i,1]])
        x_w2v=self.sentence_representation(document,sentence_length=sentence_length,vector_length=vector_length)
        y_w2v=torch.tensor(x_w2v.reshape(1,-1))
        y_w2v=y_w2v.float()
        x_glo=self.sentence_representation(document,method="glove",sentence_length=sentence_length,vector_length=vector_length)
        y_glo=torch.tensor(x_glo.reshape(1,-1))
        y_glo=y_glo.float()
        self.w2v_final_data.append((y_w2v,y))
        self.glo_final_data.append((y_glo,y))
        
        


In [240]:
torch.tensor([2])

tensor([2])

In [194]:
new_data=WordRepresentation(dataset.dataset.iloc[:1500])

In [115]:
# new_data.represent_all_sentences()

In [244]:
new_data.glove["open"]

array([-0.062761,  0.81904 , -0.067769,  1.0728  , -0.48884 , -0.53659 ,
       -0.39512 ,  0.29684 , -0.24994 , -0.64616 , -0.38994 , -1.0181  ,
       -0.74659 ,  0.5638  ,  0.62756 ,  0.84303 ,  0.1374  , -0.54534 ,
       -0.16795 , -0.43048 ,  0.18814 ,  0.16537 , -0.1674  ,  0.59948 ,
       -0.16904 , -1.1376  ,  0.36256 ,  0.19763 ,  0.010075, -0.3838  ,
        3.5756  ,  0.045535, -0.52057 ,  0.32947 , -0.36942 , -0.37348 ,
        0.31756 ,  0.81132 , -0.044679, -0.79792 ,  0.22949 , -0.73993 ,
        0.90041 ,  0.46883 ,  0.4512  ,  0.6644  ,  0.24931 , -0.88062 ,
       -0.042584, -0.21827 ], dtype=float32)

In [243]:
new_data.glove["cannot"]

array([ 0.59751 , -0.43505 ,  0.50205 , -0.36017 ,  0.57239 ,  0.029651,
        0.45819 ,  0.49277 , -0.085655,  0.014515,  0.62682 ,  0.68755 ,
        0.019485, -0.24884 ,  0.80687 ,  1.2645  ,  0.57857 , -0.34103 ,
        0.68662 , -0.84583 , -0.2885  , -0.175   ,  0.53579 ,  0.33212 ,
        0.59338 , -1.8564  , -0.39061 , -0.34753 ,  0.75789 , -0.48621 ,
        3.1397  ,  0.37391 , -0.93761 , -0.83119 , -0.26351 , -0.28465 ,
        0.42143 , -0.13558 , -0.21163 , -0.22854 , -0.16832 , -0.39381 ,
        0.62153 ,  0.75879 , -0.16856 ,  0.022767, -0.50844 ,  0.33201 ,
       -0.02658 ,  0.068742], dtype=float32)

In [245]:
s=new_data.dataset.iloc[0,0]

In [246]:
s

'cannot open app anymore'

In [248]:
# new_data.sentence_representation(s)

In [241]:
new_data.glo_final_data[0]

(tensor([[ 0.5975, -0.4351,  0.5020, -0.3602,  0.5724,  0.0297,  0.4582,  0.4928,
          -0.0857,  0.0145,  0.6268,  0.6876,  0.0195, -0.2488,  0.8069,  1.2645,
           0.5786, -0.3410,  0.6866, -0.8458, -0.2885, -0.1750,  0.5358,  0.3321,
           0.5934, -1.8564, -0.3906, -0.3475,  0.7579, -0.4862,  3.1397,  0.3739,
          -0.9376, -0.8312, -0.2635, -0.2846,  0.4214, -0.1356, -0.2116, -0.2285,
          -0.1683, -0.3938,  0.6215,  0.7588, -0.1686,  0.0228, -0.5084,  0.3320,
          -0.0266,  0.0687, -0.0628,  0.8190, -0.0678,  1.0728, -0.4888, -0.5366,
          -0.3951,  0.2968, -0.2499, -0.6462, -0.3899, -1.0181, -0.7466,  0.5638,
           0.6276,  0.8430,  0.1374, -0.5453, -0.1680, -0.4305,  0.1881,  0.1654,
          -0.1674,  0.5995, -0.1690, -1.1376,  0.3626,  0.1976,  0.0101, -0.3838,
           3.5756,  0.0455, -0.5206,  0.3295, -0.3694, -0.3735,  0.3176,  0.8113,
          -0.0447, -0.7979,  0.2295, -0.7399,  0.9004,  0.4688,  0.4512,  0.6644,
           0.249

In [117]:
# new_data.glove

In [118]:
# new_data.word2vec

In [195]:
len(new_data.word2vec)

4022

In [196]:
len(new_data.glove)

3396

In [197]:
input_size=len(new_data.glove)*20

In [198]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [227]:


class FcNeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(FcNeuralNet, self).__init__()
        self.fc1= nn.Linear(input_dim, hidden_dim)
        self.fc2= nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
      """
      The forward pass of the fully connected layer
      """
      out = F.relu(self.fc1(x))
      out = self.fc2(out)
      out=F.relu(out)
      return out

In [228]:
x=new_data.glo_final_data[3]

In [229]:
model2=FcNeuralNet(20*50,20,5)

In [230]:
# model2.forward(x[0])

In [231]:
p=0.8
size=int(0.8*len(new_data.glo_final_data))
data_train=new_data.glo_final_data[:size]

In [232]:
# data_train[0]

In [233]:
data_test=new_data.glo_final_data[size:]

In [234]:
# data_test[0][0]

In [235]:
num_epochs=30

In [236]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model2.parameters(), lr=1e-4,momentum=0.9)

In [237]:
# Train the model
for epoch in range(num_epochs):
  for (sentences, labels) in data_train:

    # Move tensors to the configured device
    # sentences = sentences.to(device)
    # labels = labels.to(device)
        
    # Forward pass
    outputs = model2(sentences)
    loss = criterion(outputs, labels-1)    
    # Backprpagation and optimization
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
        
    print('| Epoch {:3d}| Step {:3d}| Loss: {:.4f}'.format(epoch+1, num_epochs,loss.item()))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| Epoch  26| Step  30| Loss: 0.7296
| Epoch  26| Step  30| Loss: 2.1203
| Epoch  26| Step  30| Loss: 2.0658
| Epoch  26| Step  30| Loss: 1.9524
| Epoch  26| Step  30| Loss: 1.9169
| Epoch  26| Step  30| Loss: 1.5265
| Epoch  26| Step  30| Loss: 1.2590
| Epoch  26| Step  30| Loss: 0.6975
| Epoch  26| Step  30| Loss: 0.2380
| Epoch  26| Step  30| Loss: 1.9634
| Epoch  26| Step  30| Loss: 1.8325
| Epoch  26| Step  30| Loss: 1.5802
| Epoch  26| Step  30| Loss: 1.0591
| Epoch  26| Step  30| Loss: 2.0860
| Epoch  26| Step  30| Loss: 2.0415
| Epoch  26| Step  30| Loss: 1.6397
| Epoch  26| Step  30| Loss: 1.9437
| Epoch  26| Step  30| Loss: 2.2702
| Epoch  26| Step  30| Loss: 0.6586
| Epoch  26| Step  30| Loss: 2.1431
| Epoch  26| Step  30| Loss: 1.9067
| Epoch  26| Step  30| Loss: 1.0613
| Epoch  26| Step  30| Loss: 0.8991
| Epoch  26| Step  30| Loss: 0.7496
| Epoch  26| Step  30| Loss: 0.5058
| Epoch  26| Step  30| Loss: 0.8937

In [239]:
with torch.no_grad():
    correct = 0
    total = 0
    for (sentences, labels) in data_test:
        outputs = model2(sentences)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted+1 == labels).sum().item()
        print(predicted,labels)
    
    print("="* 20)
    print('Accuracy test sentences: {} %'.format(100 * correct / total))
    print("="* 20)

tensor([3]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([1]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([3]) tensor([2])
tensor([1]) tensor([2])
tensor([1]) tensor([2])
tensor([3]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([1]) tensor([2])
tensor([1]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([0]) tensor([2])
tensor([4]) tens