In [1]:
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset,DataLoader

import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
tokenize = get_tokenizer('basic_english')

In [3]:
length = 60000
dataset = pd.read_csv("./reviews.csv")[:length]
dataset.drop(["Reply", "Total_thumbsup", "Time_submitted"], axis=1, inplace=True)
dataset.dropna(inplace=True)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 0 to 59999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  60000 non-null  object
 1   Rating  60000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


In [5]:
features = dataset.Review
labels = dataset.Rating

In [6]:
labels.replace([1,2,3], "Not Good", inplace= True)
labels.replace([4,5], "Good", inplace= True)

In [7]:
classes = labels.unique()

for i in classes:
    print(i, (labels==i).sum())

Good 29257
Not Good 30743


In [8]:
def yield_tokens(features):
    for feature in features:
        yield tokenize(feature)

In [9]:
vocab = build_vocab_from_iterator(yield_tokens(features), specials=["temp_token"])
vocab.set_default_index(vocab["temp_token"])
len(vocab)

29432

In [10]:
labels.replace("Not Good", 0, inplace=True)
labels.replace("Good", 1, inplace=True)

In [11]:
labels = torch.tensor(labels, requires_grad=False)
labels = F.one_hot(labels)
labels = torch.tensor(labels, dtype=torch.float32, requires_grad=False)

  labels = torch.tensor(labels, dtype=torch.float32, requires_grad=False)


In [12]:
labels[:10]

tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.]])

In [13]:
max_review = 60
t_fs = torch.zeros(length, max_review, requires_grad=False, dtype=torch.int)

for i in range(length):
    tokens = vocab(tokenize(features[i]))
    end = len(tokens) if len(tokens)<max_review else max_review
    
    for j in range(end):
        t_fs[i,j] = tokens[j]

t_fs[0]

tensor([  47,   12,  151,    8,    3,  308,   11,  624,  171,    7,    3,    9,
          11,  135,    4,   58,    1,   93,   63, 1126,    7,  679,  374,    1,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       dtype=torch.int32)

In [14]:
class RNN(torch.nn.Module):
    def __init__(self,num_embeds):
        super().__init__()
        self.embed = torch.nn.Embedding(len(vocab),num_embeds)
        self.block = torch.nn.Sequential(
            torch.nn.Linear(num_embeds,10),
            torch.nn.ReLU(),
            torch.nn.Linear(10,2),
            torch.nn.Softmax(-1)
        )

    def forward(self,features):
        preds = torch.zeros(len(features), 2, dtype=torch.float32, requires_grad=True)

        for i in range(len(features)):
            feature = self.embed(features[i])

            for token in feature:
                out = self.block(token)

            preds[i] = out
            


        return preds

In [15]:
model = RNN(30)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [16]:
train_test_split = 0.9

train_length = int(length*train_test_split)
test_length = length - train_length

train_dataset = TensorDataset(t_fs[:train_length],labels[:train_length])
test_dataset = TensorDataset(t_fs[train_length:],labels[train_length:])

dataloader = DataLoader(train_dataset, batch_size = 540, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = 500)

In [17]:
epochs = range(1,11)

for epoch in epochs:
    total_loss = 0
    
    for features,lables in dataloader:
        preds = model(features)
        loss = loss_fn(preds, lables)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        with torch.no_grad():
            total_loss += loss



    with torch.no_grad():
        print(epoch)
        print(total_loss/train_length)
        right = 0
        for features,lables in test_dataloader:
            preds = model(features)

            for i in range(len(preds)):
                pred_i = torch.where(preds[i] == preds[i].max())
                label_i = torch.where(labels[i] == 1)

                if pred_i == label_i:
                    right+=1
                
        acc = right/test_length
        print(round(acc*100,2),"%")

        

RuntimeError: a view of a leaf Variable that requires grad is being used in an in-place operation.