In [1]:
import re,os,json

import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils import shuffle

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')

# finds the root of a word (any tense it may be in)
stemmer = PorterStemmer()
# to find the actual root word
lemmater = WordNetLemmatizer()
# stop words which carry no real meaning in a bag of words representation
stopper = stopwords.words('english') + ["he", "she", "his","it","is"]
# regular expression containg useless punctuations
punc = re.compile(r'''[",.<>/();:'-_=?!|0-9]''')

In [3]:
if_vocab = "imdb_vocab.json" in os.listdir()
if_trained = "imdb.pth" in os.listdir()

In [4]:
data = pd.read_csv("imdd.csv")
data = shuffle(data)

labels = np.array(data["sentiment"])
features = np.array(data["review"])

if not if_trained:

    for i in range(len(features)):
        # print(len(features[i]))
        # print(features[i][:155])
        for word in stopper:
            features[i] = features[i].replace(" "+word+" "," ")
            features[i] = features[i].replace("<br />"," ").replace("<br/>"," ")
        # print(len(features[i]))
        # print(features[i][:155])

        features[i] = punc.sub("",features[i])
        # print(len(features[i]))
        # print(features[i][:155])

        tokens = word_tokenize(features[i])
        # print(len(tokens))
        # print(tokens[:15])

        for j in range(len(tokens)):
            if "http" in tokens[j]:
                tokens[j] = ""
            tokens[j] = lemmater.lemmatize(tokens[j])
            # tokens[j] = stemmer.stem(tokens[j])
        
        features[i] = np.array(tokens)

In [5]:
if not if_vocab:
    vocab = {}

    for tokens in features:
        for token in tokens:
            try:
                vocab[token]+=1
            except:
                vocab[token]=1

    i = 0
    for word in list(vocab.keys()):
        if vocab[word] <= 52:
            vocab.pop(word)
        else:
            vocab[word] = i
            i+=1
    with open("imdb_vocab.json", "w") as file:
        json.dump(vocab,file)
else:
    with open("imdb_vocab.json", "r") as file:
        vocab = json.load(file)
        print("Vocabulary Loaded")


vocab_len = len(vocab)
vocab_len

Vocabulary Loaded


10059

In [6]:
if not if_trained:
    final_features = torch.zeros(50000,vocab_len, dtype=torch.float32, requires_grad=False)
    print(final_features.shape)
    for i in range(len(features)):
        for j in range(len(features[i])):
            index = vocab.get(features[i][j],None)
            if index:
                final_features[i][index] += 1

torch.Size([50000, 10059])


In [7]:
if not if_trained:
    final_labels = torch.zeros(50000,1, dtype=torch.float32, requires_grad=False)
    print(final_labels.shape)
    for i in range(len(labels)):
        if labels[i] == "positive":
            final_labels[i] = 1
        else:
            final_labels[i] = 0

torch.Size([50000, 1])


In [8]:
model = torch.nn.Sequential(
    torch.nn.Linear(vocab_len,1000),
    torch.nn.ReLU(),
    torch.nn.Linear(1000,10),
    torch.nn.ReLU(),
    torch.nn.Linear(10,1),
    torch.nn.Sigmoid()
)

In [9]:
if not if_trained:
    loss_fn = torch.nn.BCELoss()
    opt = torch.optim.Adam(model.parameters(), lr = 1e-2)
    print("Loss Function and Optimizer created")

Loss Function and Optimizer created


In [10]:
if not if_trained:
    BATCH_SIZE = 200

    dataset = TensorDataset(final_features[:45000], final_labels[:45000])
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    test_dataset = TensorDataset(final_features[45000:], final_labels[45000:])
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    print("Dataloaders Created")

Dataloaders Created


In [11]:
if not if_trained:
    epochs = 5
else:
    epochs = 0

for epoch in range(epochs):
    for f,ls in dataloader:
        preds = model(f)
        loss = loss_fn(preds,ls)

        loss.backward()
        opt.step()
        opt.zero_grad()

    with torch.no_grad():
        right = 0
        for f,ls in test_dataloader:
            preds = model(f)
            
            for i in range(len(preds)):
                if (preds[i] >= 0.5) == ls[i]:
                    right += 1
    
    # if (epoch+1)%(epochs//10) == 0:
    print(f"Epoch : {epoch+1}\nLoss : {loss}\nAccuracy : {round((right*100)/(BATCH_SIZE*len(test_dataloader)),5)}")
        

Epoch : 1
Loss : 0.2801854610443115
Accuracy : 93.9
Epoch : 2
Loss : 0.24062980711460114
Accuracy : 97.6
Epoch : 3
Loss : 0.08529292047023773
Accuracy : 99.45
Epoch : 4
Loss : 0.04041421413421631
Accuracy : 99.66
Epoch : 5
Loss : 0.00128135085105896
Accuracy : 99.74


In [12]:
if not if_trained:
    torch.save(model.state_dict(), "imdb.pth")
else:
    model.load_state_dict(torch.load("imdb.pth"))

In [13]:
def predict(string):
    for word in stopper:
        string = string.replace(" "+word+" "," ")
        string = punc.sub("",string)
    
    tokens = word_tokenize(string)
    
    if len(tokens) <= 10:
        raise Exception("Sentence too small to make prediction")
        
    for j in range(len(tokens)):
        tokens[j] = lemmater.lemmatize(tokens[j])
    print(tokens)
    pred_features = torch.zeros(1,vocab_len, dtype=torch.float32)

    for i in range(len(tokens)):
        index = vocab.get(tokens[i],None)
        if index:
            # print(tokens[i],end=" ")
            pred_features[0][index] += 1

    pred = model(pred_features)
    return round(float(pred[0])*100,2)       
        

In [14]:
predict('''
I went into this film with very high expectations. The film delivered plenty at many times, but others I found myself quite bored.

I was tempted to write out all the issues I had that took away the 3 stars, but this film is really just a fun, happy thing that everyone loves and isn't arguing about, and how often do we get that?

So I'll just say it's worth the watch and leave it at that.
''')

['went', 'film', 'high', 'expectation', 'film', 'delivered', 'plenty', 'many', 'time', 'others', 'found', 'quite', 'bored', 'tempted', 'write', 'issue', 'took', 'away', 'star', 'film', 'really', 'fun', 'happy', 'thing', 'everyone', 'love', 'isnt', 'arguing', 'often', 'get', 'that', 'o', 'say', 'worth', 'watch', 'leave', 'that']


95.39

In [15]:
predict('''
 When weird and whacky things click. You know you are in for a fun ride. No matter, if the humour is for you or not. There is a lot to like about this show, as if it has a certain appeal. Like the fact someone actually returned home from an isekai. How many can you name have accomplished this??? It’s new and creative.

Art and sound: 8

The uncle can be very hit or miss for me. It’s weird, but his situation is quite understandable. So no matter how you feel on him, or his choices and personality. One fact is. You can least understand his situation.

The show really is a wild bag. But it has an interesting premise to keep it ticking and more so. Give it a go.
Reviewer’s Rating: 8 
''')

['hen', 'weird', 'whacky', 'thing', 'click', 'ou', 'know', 'fun', 'ride', 'matter', 'humour', 'lot', 'like', 'show', 'certain', 'appeal', 'ike', 'fact', 'someone', 'actually', 'returned', 'home', 'isekai', 'ow', 'many', 'name', 'accomplished', 't', '’', 's', 'new', 'creative', 'rt', 'sound', 'he', 'uncle', 'hit', 'miss', 't', '’', 's', 'weird', 'situation', 'quite', 'understandable', 'matter', 'feel', 'choice', 'personality', 'ne', 'fact', 'ou', 'least', 'understand', 'situation', 'he', 'show', 'really', 'wild', 'bag', 'ut', 'interesting', 'premise', 'keep', 'ticking', 'ive', 'go', 'eviewer', '’', 's', 'ating']


100.0

In [16]:
predict('''
Go see this movie if you:

    are infatuated with Tom Cruise
    love US armed forces promos
    love, but don't know much about fighter jets, military air operations, or the military in general
    love romantic stories
    don't care about facts or logic

Otherwise, you will be disappointed, to say the least.
''')

['o', 'see', 'movie', 'you', 'infatuated', 'om', 'ruise', 'love', 'armed', 'force', 'promos', 'love', 'dont', 'know', 'much', 'fighter', 'jet', 'military', 'air', 'operation', 'military', 'general', 'love', 'romantic', 'story', 'dont', 'care', 'fact', 'logic', 'therwise', 'disappointed', 'say', 'least']


0.0

In [17]:
predict('''
The training is painfully uninteresting and cheesy. The smiling is fine because it's expected and creates a positive atmosphere for the viewer. It also makes the viewer more attached to the character when that character is in trouble later in the movie.

The actual mission is the last third and I have the most to say about that. I should point out that it probably was not a third in terms of the actual portion of the movie that it covered. More accurately, it's probably about a fifth of the movie or less. This was the best part of the movie, but even this was not great or even that good. As mentioned, it's a fairly smaller part of the movie. But beyond that, the scenes are completely predictable. Now, just because something is predictable does not necessarily make it bad. The feeling here is that this action-part of the movie feels like watching the trailer over and over for 20-30 minutes. There is nothing that unique about the entire scene. The only positive aspect of it is that it's not boring.

One last note, the storyline, it almost doesn't exist. The storyline is that there's a mission that has to be completed. That's it. In summation, the movie leaves a lot to be desired.
''')

['he', 'training', 'painfully', 'uninteresting', 'cheesy', 'smiling', 'fine', 'expected', 'creates', 'positive', 'atmosphere', 'viewer', 'also', 'make', 'viewer', 'attached', 'character', 'character', 'trouble', 'later', 'movie', 'he', 'actual', 'mission', 'last', 'third', 'say', 'point', 'probably', 'third', 'term', 'actual', 'portion', 'movie', 'covered', 'ore', 'accurately', 'probably', 'fifth', 'movie', 'le', 'best', 'part', 'movie', 'even', 'great', 'even', 'good', 'mentioned', 'fairly', 'smaller', 'part', 'movie', 'ut', 'beyond', 'scene', 'completely', 'predictable', 'ow', 'something', 'predictable', 'necessarily', 'make', 'bad', 'feeling', 'actionpart', 'movie', 'feel', 'like', 'watching', 'trailer', 'over', 'minute', 'nothing', 'unique', 'entire', 'scene', 'positive', 'aspect', 'boring', 'ne', 'last', 'note', 'storyline', 'almost', 'doesnt', 'exist', 'storyline', 'there', 'mission', 'completed', 'hat', 'n', 'summation', 'movie', 'leaf', 'lot', 'desired']


0.0

In [18]:
predict('''
to be honest i wasn't sure about watching this one, but after actually watching the first episode it completely changed my thoughts on this anime. As i continued streaming it, i found out how unique it was, the world building, the story line that was progressing smoothly, and all of the above the main charactrer that was not the actual main character, but was one of them.  the antigonist had a different and amazing storyline which was worth watiching. To conclude, it was a whole new experiance, i dont regret watching. 
''')

['to', 'honest', 'wasnt', 'sure', 'watching', 'one', 'actually', 'watching', 'first', 'episode', 'completely', 'changed', 'thought', 'anime', 'continued', 'streaming', 'found', 'unique', 'world', 'building', 'story', 'line', 'progressing', 'smoothly', 'main', 'charactrer', 'actual', 'main', 'character', 'one', 'antigonist', 'different', 'amazing', 'storyline', 'worth', 'watiching', 'conclude', 'whole', 'new', 'experiance', 'dont', 'regret', 'watching']


99.5

In [19]:
predict('''
poor cinematograpy, drunk actors, only body showing through out the movie, absolute piece of trash, please dont watch it in cinemas, maybe when you are high and have nothing to do you can occupy your mind by watching this
''')

['poor', 'cinematograpy', 'drunk', 'actor', 'body', 'showing', 'movie', 'absolute', 'piece', 'trash', 'please', 'dont', 'watch', 'cinema', 'maybe', 'high', 'nothing', 'occupy', 'mind', 'watching', 'this']


0.0