In [2]:
from collections import Counter

from pyparsing import WordStart
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import requests
from io import StringIO
import math
import gensim.downloader as api
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec

class PreprocessData:
    def __init__(self,path,lang='english'): 
        self.dataset=pd.read_csv(path, sep=",")
        self.dataset=self.dataset[["content","score"]]
        self.stopwords=stopwords.words(lang)
        self.preprocess()
    
    @staticmethod
    def remove_punctuation(text):
        '''a function for removing punctuation'''
        # replacing the punctuations with no space, 
        # which in effect deletes the punctuation marks 
        translator = str.maketrans('', '', string.punctuation)
        # return the text stripped of punctuation marks
        return text.translate(translator)

    #A function to remove the stopwords
    def remove_stopwords(self,text):
        text = [word.lower() for word in text.split() if word.lower() not in self.stopwords]
        # joining the list of words with space separator
        return " ".join(text)

    def preprocess(self):
        self.dataset.iloc[:,0] = self.dataset.iloc[:,0].apply(self.remove_punctuation)
        self.dataset.iloc[:,0] = self.dataset.iloc[:,0].apply(self.remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
path="/content/reviews.csv"
dataset=PreprocessData(path)

In [4]:
dataset.dataset.head()

Unnamed: 0,content,score
0,cannot open app anymore,1
1,begging refund app month nobody replying,1
2,costly premium version approx indian rupees 91...,1
3,used keep organized 2020 updates made mess thi...,1
4,dan birthday oct 28,1


In [5]:
# wv = api.load('glove-wiki-gigaword-50')



In [20]:
# from gensim.scripts.glove2word2vec import glove2word2vec

In [89]:
# import gensim.models

# # This should take approximately 3 minutes
# model = gensim.models.Word2Vec(sentences=sentences,min_count=1)

In [90]:
# print(list(model.wv.vocab))

In [9]:
class WordRepresentation:
    def __init__(self,dataset):
        self.dataset=dataset
        self.glove={}
        self.word2vec={}
        self.compute_glove()
        self.compute_word2vec()
        self.w2v_final_data=[]
        self.glo_final_data=[]
        self.represent_all_sentences()
    
    #Compute Glove
    def compute_glove(self):
      wv = api.load('glove-wiki-gigaword-50')
      #self.glove["nan"]=(np.zeros(50,),0)
      #word_index=1
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        word_list=document.split(" ")
        for word in word_list:
          if word in wv.vocab:
            self.glove[word]=wv.word_vec(word)
            # self.glove[word]=(wv.word_vec(word),word_index)
            # word_index+=1
    
    @staticmethod
    def tokenize(data): 
      word = []
      for i in range(len(data)):
        new_doc = data[i].split()
        word.append(new_doc)
      return word
    
    def compute_word2vec(self):
      token = WordRepresentation.tokenize(dataset.dataset['content'])
      model = Word2Vec(sentences=token, workers = 1, size = 50, min_count = 1, window = 3)
      # self.word2vec["nan"]=(np.zeros(50,),0)
      # word_index=1
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        word_list=document.split(" ")
        for word in word_list:
          if word in model.wv.vocab:
            self.word2vec[word]=model.wv.word_vec(word)
            # self.word2vec[word]=(model.wv.word_vec(word),word_index)
            # word_index+=1
    
    def sentence_representation(self,sentence,method="word2vec",sentence_length=20,vector_length=50):
      the_sentence=sentence.split(' ')
      if method=="word2vec"or method=="glove":
        if method=="word2vec":
          words_dict=self.word2vec
        else:
          words_dict=self.glove
        
        matrix=np.zeros((sentence_length,vector_length))
        i =0
        for word in the_sentence:
          if word in words_dict:
            matrix[i]=words_dict[word]
            i+=1
          if i>=sentence_length:
            break
        return matrix
    
    def represent_all_sentences(self,sentence_length=20,vector_length=50):
      for i in range(len(self.dataset)):
        document=self.dataset.iloc[i,0]
        y=self.dataset.iloc[i,1]
        x_w2v=self.sentence_representation(document,sentence_length=sentence_length,vector_length=vector_length)
        x_glo=self.sentence_representation(document,method="glove",sentence_length=sentence_length,vector_length=vector_length)
        self.w2v_final_data.append((x_w2v,y))
        self.glo_final_data.append((x_glo,y))
        
        


In [10]:
new_data=WordRepresentation(dataset.dataset.iloc[:20])

In [11]:
# new_data.represent_all_sentences()

Ici :  240 cannot
Voila
Ici :  240 open
Voila
Ici :  240 app
Voila
Ici :  240 anymore
Voila
Ici :  231 cannot
Voila
Ici :  231 open
Voila
Ici :  231 app
Voila
Ici :  231 anymore
Voila
Ici :  240 begging
Voila
Ici :  240 refund
Voila
Ici :  240 app
Voila
Ici :  240 month
Voila
Ici :  240 nobody
Voila
Ici :  240 replying
Voila
Ici :  231 begging
Voila
Ici :  231 refund
Voila
Ici :  231 app
Voila
Ici :  231 month
Voila
Ici :  231 nobody
Voila
Ici :  231 replying
Voila
Ici :  240 costly
Voila
Ici :  240 premium
Voila
Ici :  240 version
Voila
Ici :  240 approx
Voila
Ici :  240 indian
Voila
Ici :  240 rupees
Voila
Ici :  240 910
Voila
Ici :  240 per
Voila
Ici :  240 year
Voila
Ici :  240 better
Voila
Ici :  240 download
Voila
Ici :  240 premium
Voila
Ici :  240 version
Voila
Ici :  240 app
Voila
Ici :  240 apkmos
Voila
Ici :  240 website
Voila
Ici :  240 use
Voila
Ici :  240 microsoft
Voila
Ici :  240 list
Voila
Ici :  240 app
Voila
Ici :  231 costly
Voila
Ici :  231 premium
Voila
Ici :  231

In [14]:
new_data.glo_final_data[0]

(array([[ 0.59750998, -0.43505001,  0.50204998, -0.36017001,  0.57239002,
          0.029651  ,  0.45818999,  0.49276999, -0.085655  ,  0.014515  ,
          0.62682003,  0.68755001,  0.019485  , -0.24884   ,  0.80686998,
          1.26450002,  0.57857001, -0.34103   ,  0.68662   , -0.84583002,
         -0.28850001, -0.175     ,  0.53579003,  0.33212   ,  0.59337997,
         -1.85640001, -0.39061001, -0.34753001,  0.75788999, -0.48620999,
          3.13969994,  0.37391001, -0.93760997, -0.83118999, -0.26350999,
         -0.28465   ,  0.42142999, -0.13558   , -0.21163   , -0.22854   ,
         -0.16832   , -0.39381   ,  0.62153   ,  0.75879002, -0.16856   ,
          0.022767  , -0.50844002,  0.33201   , -0.02658   ,  0.068742  ],
        [-0.062761  ,  0.81904   , -0.067769  ,  1.07280004, -0.48884001,
         -0.53658998, -0.39511999,  0.29684001, -0.24993999, -0.64616001,
         -0.38993999, -1.01810002, -0.74659002,  0.56379998,  0.62756002,
          0.84302998,  0.1374    , -0

In [88]:
# new_data.glove

In [69]:
# new_data.word2vec

In [66]:
len(new_data.word2vec)

241

In [67]:
len(new_data.glove)

232

In [27]:
input_size=len(new_data.glove)*20

In [26]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [220]:


class FcNeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(FcNeuralNet, self).__init__()
        self.fc1= nn.Linear(input_dim, hidden_dim)
        print("b"),
        self.fc2= nn.Linear(hidden_dim, num_classes)
        print("d")
    
    def forward(self, x):
      """
      The forward pass of the fully connected layer
      """
      print(x.shape)
      out = F.relu(self.fc1(x))
      #out=out
      print(out)
      out = self.fc2(out)
      print("C")
      out=F.relu(out)
      return out

In [140]:
model1=FcNeuralNet(10,1,5)

b
d


In [156]:
# model1.forward(y)

In [80]:
s1=dataset.dataset.iloc[0,0]

In [99]:
s2=dataset.dataset.iloc[3,0]

In [102]:
x=new_data.sentence_representation(words=new_data.glove,sentence=s2,method="glove")

Ici :  231 used
Voila
Ici :  231 keep
Voila
Ici :  231 organized
Voila
Ici :  231 2020
Voila
Ici :  231 updates
Voila
Ici :  231 made
Voila
Ici :  231 mess
Voila
Ici :  231 things
Voila
Ici :  231 cudnt
Ici :  231 u
Voila
Ici :  231 leave
Voila
Ici :  231 well
Voila
Ici :  231 enuf
Voila
Ici :  231 alone
Voila
Ici :  231 guess
Voila
Ici :  231 ur
Voila
Ici :  231 techies
Voila
Ici :  231 feel
Voila
Ici :  231 need
Voila
Ici :  231 keep
Voila
Ici :  231 making
Voila


In [154]:
x=x.float()
y=torch.tensor(x.reshape(1,-1))

  


In [221]:
model2=FcNeuralNet(20*50,20,5)

b
d


In [147]:
y.view(1000)
y.shape

torch.Size([20, 50])

In [149]:
y.shape

torch.Size([1, 1000])

In [222]:
model2.forward(y)

torch.Size([1, 1000])
tensor([[0.4295, 0.0018, 0.0000, 0.0000, 0.4015, 0.6580, 0.0000, 0.2353, 0.0786,
         0.6177, 0.3088, 0.0000, 0.0000, 0.0000, 0.5057, 0.0000, 0.3850, 0.6514,
         0.0000, 0.0000]], grad_fn=<ReluBackward0>)
C


tensor([[0.0000, 0.3021, 0.0000, 0.0000, 0.1566]], grad_fn=<ReluBackward0>)