In [7]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

#Setting devices for further training cuda="Discrete GPU" and CPU

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [9]:
#Downloading and reading the data.

In [10]:
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=DATA_PATH,
    )

In [11]:
pd.read_csv(DATA_PATH).sample(5)

Unnamed: 0,review,label
30598,This must be one of the most overrated Spanish...,0
50271,I watched both Bourne Identity and Bourne Supr...,1
38667,"Blood Castle (aka Scream of the Demon Lover, A...",1
41633,The title of this film nearly put me off watch...,1
57645,"Josh Hartnett's dead eyes, the thick writing o...",0


#We initialize the dataset class and use verctorise from the sklearn library to make it a BagOfWords./
#Then use token2idk to generate the vocabulary.

In [12]:
class Sequences(Dataset):
    def __init__(self, path):
        df = pd.read_csv(path)
        self.vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
        self.sequences = self.vectorizer.fit_transform(df.review.tolist())
        self.labels = df.label.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i, :].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

#Set sequence path and set batch size for the data.

In [13]:
dataset = Sequences(DATA_PATH)
train_loader = DataLoader(dataset, batch_size=4096)

print(dataset[5][0].shape)

(1, 3028)


#Making the bag of words classifier. Using nn in pytorch.

In [14]:
class BagOfWordsClassifier(nn.Module):
    def __init__(self, vocab_size, hidden1, hidden2):
        super(BagOfWordsClassifier, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

Getting the vaules from the classifier that we made.

In [15]:
model = BagOfWordsClassifier(len(dataset.token2idx), 128, 64)
model

BagOfWordsClassifier(
  (fc1): Linear(in_features=3028, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [None]:
#Made a the criterion variable by applying the BCEWithLogitsLoss() for applying Binary Cross Entropy and Sigmoid Layer in one go.

In [16]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [None]:
#Actual Training Section

In [21]:
model.train()
train_losses = []
for epoch in range(100):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        model.zero_grad()

        output = model(inputs)
        loss = criterion(output.squeeze(), target.float())
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
     
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #1	Train Loss: 0.639


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #2	Train Loss: 0.573


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #3	Train Loss: 0.489


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #4	Train Loss: 0.416


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #5	Train Loss: 0.364


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #6	Train Loss: 0.331


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #7	Train Loss: 0.310


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #8	Train Loss: 0.295


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #9	Train Loss: 0.285


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #10	Train Loss: 0.277


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #11	Train Loss: 0.270


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #12	Train Loss: 0.264


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #13	Train Loss: 0.258


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #14	Train Loss: 0.253


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #15	Train Loss: 0.248


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #16	Train Loss: 0.242


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #17	Train Loss: 0.237


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #18	Train Loss: 0.230


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #19	Train Loss: 0.226


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #20	Train Loss: 0.219


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #21	Train Loss: 0.211


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #22	Train Loss: 0.204


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #23	Train Loss: 0.193


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #24	Train Loss: 0.185


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #25	Train Loss: 0.175


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #26	Train Loss: 0.166


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #27	Train Loss: 0.155


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #28	Train Loss: 0.145


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #29	Train Loss: 0.133


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #30	Train Loss: 0.123


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #31	Train Loss: 0.111


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #32	Train Loss: 0.100


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #33	Train Loss: 0.089


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #34	Train Loss: 0.078


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #35	Train Loss: 0.068


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #36	Train Loss: 0.060


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #37	Train Loss: 0.053


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #38	Train Loss: 0.046


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #39	Train Loss: 0.039


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #40	Train Loss: 0.032


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #41	Train Loss: 0.027


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #42	Train Loss: 0.023


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #43	Train Loss: 0.020


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #44	Train Loss: 0.017


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #45	Train Loss: 0.015


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #46	Train Loss: 0.013


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #47	Train Loss: 0.011


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #48	Train Loss: 0.010


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #49	Train Loss: 0.009


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #50	Train Loss: 0.008


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #51	Train Loss: 0.007


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #52	Train Loss: 0.006


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #53	Train Loss: 0.005


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #54	Train Loss: 0.005


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #55	Train Loss: 0.004


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #56	Train Loss: 0.004


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #57	Train Loss: 0.004


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #58	Train Loss: 0.003


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #59	Train Loss: 0.003


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #60	Train Loss: 0.003


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #61	Train Loss: 0.003


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #62	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #63	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #64	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #65	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #66	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #67	Train Loss: 0.002


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #68	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #69	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #70	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #71	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #72	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #73	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #74	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #75	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #76	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #77	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #78	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #79	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #80	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #81	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #82	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #83	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #84	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #85	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #86	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #87	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #88	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #89	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #90	Train Loss: 0.001


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #91	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #92	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #93	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #94	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #95	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #96	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #97	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #98	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #99	Train Loss: 0.000


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #100	Train Loss: 0.000


In [22]:
def predict_review(text):
    model.eval()
    with torch.no_grad():
        test_vector = torch.LongTensor(dataset.vectorizer.transform([text]).toarray())

        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{prediction:0.3}: Positive Review')
        else:
            print(f'{prediction:0.3}: Negative Review')

#Testing with the worst rated IMDB rated movie 

In [23]:
test_text = """
I am writing this in hopes that this gets put over the previous review of this "film". How anyone can find this slop entertaining is completely beyond me. First of all a spoof film entitled "Disaster Movie", should indeed be a spoof on disaster films. Now I have seen 1 (yes count them, 1) disaster film being spoofed, that being "Twister". How does Juno, Iron Man, Batman, The Hulk, Alvin and the Chipmunks, Amy Winehouse, or Hancock register as Disaster films? Selzterwater and Failburg once again have shown that they lack any sort of writing skill and humor. Having unfortunately been tortured with Date Movie and Epic Movie I know exactly what to expect from these two...no plot, no jokes just bad references and cheaply remade scenes from other films. Someone should have informed them that satire is more than just copy and paste from one film to another, though I shouldn't say that because some of these actually just seem to be taken from trailers.

There is nothing clever or witty or remotely smart about the way these two write, and I can't believe that some people still pay to see these travesties. It's an insult to the audience, though if they enjoy these films I doubt that they are smart enough to realize that.

Rating: Unfortunately there is not a number low enough (yes this includes negatives) to rate this. This deserves to be in the top 5 worst films of all time, right there with Date Movie, Epic Faliure...I mean movie, and Meet the Spartans. I would rather be forced into a 24 hour "Manos: The Hands of Fate" marathon than watch this slop."""
predict_review(test_text)

1.13e-06: Negative Review


#Testing with Shawshank Redemption (Highest Rated Movie)

In [24]:
test_text= """ Why do I want to write the 234th comment on The Shawshank Redemption? I am not sure - almost everything that could be possibly said about it has been said. But like so many other people who wrote comments, I was and am profoundly moved by this simple and eloquent depiction of hope and friendship and redemption.

The only other movie I have ever seen that effects me as strongly is To Kill a Mockingbird. Both movies leave me feeling cleaner for having watched them.

I didn't intend to see this movie at all: I do not like prison movies and I don't normally watch them. I work at a branch library and one day as I was checking The Shawshank Redemption out to one of our older patrons, she said to me, "Whenever I feel down or depressed, I check out this movie and watch it and it always makes me feel better." At the time, I thought that was very strange. One day there was nothing on TV except things I absolutely would not watch under any circumstance or things that I had seen too many times already. I remembered what she said, so I watched it. I have watched it many many times since then and it gets better with every showing.

No action, no special effects - just men in prison uniforms talking to each other.

The Shawshank Redemption and To Kill a Mockingbird are the best movies I have ever seen. I do not judge it by it's technical merits - I don't really care about that. I have read that Citizen Kane or The Godfather or this or that movie is the best movie ever made. They may have the best technique or be the most influential motion pictures ever made, but not the best. The best movies are ones that touch the soul. It takes a movie like The Shawshank Redemption to touch the soul."""
predict_review(test_text)

1.0: Positive Review
