---

# Ioannou_Georgios


## Copyright © 2023 by Georgios Ioannou


---

<h1 align="center"> Natural Language Processing </h1>


---

### Evaluating Models


---

# LIBRARIES


In [1]:
# Import libraries.

import nltk
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn

from nltk import Counter
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zacharydesario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load the IMDB data set into a pandas data frame.

# Read the file IMDB Dataset.csv and load the data. GOOD, CLEAN, ORGANIZED DATASET.

df = pd.read_csv("archive.zip")

# Print/Display/Return the first 5 rows of the file IMDB Dataset.csv to make sure the file was loaded successfully.

df.head()

df = df.sample(frac=0.2)

In [10]:
X = df[
    "review"
].values  # Return a Numpy representation of the DataFrame to use in the train_test_split.

In [11]:
y = df[
    "sentiment"
].values  # Return a Numpy representation of the DataFrame to use in the train_test_split.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
def preprocess_string(s):
    # Remove all non-word characters (everything except numbers and letters).

    s = re.sub(r"[^\w\s]", "", s)

    # Replace all runs of whitespaces with no space.

    s = re.sub(r"\s+", "", s)

    # Replace digits with no space.

    s = re.sub(r"\d", "", s)

    return s

In [14]:
def tokenize(X_train, y_train, x_val, y_val):
    word_list = []

    stop_words = set(stopwords.words("english"))

    for sent in X_train:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word not in stop_words and word != "":
                word_list.append(word)

    # corpus: A collection of words.

    corpus = Counter(word_list)

    # Sorting on the basis of most common words.

    corpus_ = sorted(corpus, key=corpus.get, reverse=True)[:1000]

    # Create a dictionary.

    onehot_dict = {w: i + 1 for i, w in enumerate(corpus_)}

    # Tokenize.

    final_list_train, final_list_test = [], []

    for sent in X_train:
        final_list_train.append(
            [
                onehot_dict[preprocess_string(word)]
                for word in sent.lower().split()
                if preprocess_string(word) in onehot_dict.keys()
            ]
        )

    for sent in x_val:
        final_list_test.append(
            [
                onehot_dict[preprocess_string(word)]
                for word in sent.lower().split()
                if preprocess_string(word) in onehot_dict.keys()
            ]
        )

    encoded_train = [1 if label == "positive" else 0 for label in y_train]

    encoded_test = [1 if label == "positive" else 0 for label in y_val]

    return (
        np.array(final_list_train, dtype=object),
        np.array(encoded_train),
        np.array(final_list_test, dtype=object),
        np.array(encoded_test),
        onehot_dict,
    )

In [15]:
X_train, y_train, X_test, y_test, vocab = tokenize(X_train, y_train, X_test, y_test)

In [16]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review) :] = np.array(review)[:seq_len]
    return features

In [17]:
# Check if a CUDA-capable GPU is available, and set 'device' accordingly.

if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Print the selected device (either 'cuda' or 'cpu') to the console.

print(device)

# Check if CUDA (GPU support) is available.

if torch.cuda.is_available():
    # Print the number of available CUDA devices.

    print(torch.cuda.device_count())

    # Print the name of the first CUDA device.

    print(torch.cuda.get_device_name(0))
else:
    # If no GPU is available, print a message.

    print("No GPU available.")

cpu
No GPU available.


In [18]:
def predict_text(text, model):
    word_seq = np.array(
        [
            vocab[preprocess_string(word)]
            for word in text.split()
            if preprocess_string(word) in vocab.keys()
        ]
    )

    word_seq = np.expand_dims(word_seq, axis=0)

    pad = torch.from_numpy(padding_(word_seq, 500))

    inputs = pad.to(device)

    batch_size = 1

    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])

    output, h = model(inputs, h)

    return output.item()

In [20]:
# Model class definition.


class SentimentRNN(nn.Module):
    def __init__(
        self,
        no_layers,
        vocab_size,
        output_dim,
        hidden_dim,
        embedding_dim,
        drop_prob=0.5,
    ):
        super(SentimentRNN, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # Embedding layers.

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers.

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=no_layers,
            batch_first=True,
        )

        # Dropout layer.

        self.dropout = nn.Dropout(0.3)

        # Linear layer. (Fully Connected Layer)

        self.fc = nn.Linear(self.hidden_dim, output_dim)

        # Sigmoid layer.

        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)

        embeds = self.embedding(x)

        lstm_out, hidden = self.lstm(embeds, hidden)

        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)

        out = self.fc(out)

        sig_out = self.sig(out)

        sig_out = sig_out.view(batch_size, -1)

        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM

        h0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim)).to(device)

        hidden = (h0, c0)

        return hidden

In [21]:
# Model class parameters.

no_layers = 2
vocab_size = len(vocab) + 1  # Extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256

# Create an instance of the SentimentRNN class.

loaded_model = SentimentRNN(
    no_layers, vocab_size, output_dim, hidden_dim, embedding_dim, drop_prob=0.5
).to(device)

# Load the trained model's parameters.

loaded_model.load_state_dict(
    torch.load("my_model_1.pt", map_location=torch.device(device))
)
# loaded_model.load_state_dict(
#     torch.load("my_model_2.pt", map_location=torch.device(device))
# )

# Set the model to evaluation mode.

loaded_model.eval()

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [22]:
# 9/10 IMDB Review
# https://www.imdb.com/title/tt15398776/reviews?sort=curated&dir=asc&ratingFilter=9
# REVIEWS FROM Oppenheimer (2023) NOT IN THE DATASET.

In [23]:
new_positive_review = "You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return to proper acting after his decade or so of calling it in. The screenplay is dense and layered (I'd say it was a thick as a Bible), cinematography is quite stark and spare for the most part but imbued with rich, lucious colour in moments (especially scenes with Florence Pugh), the score is beautiful at times but mostly anxious and oppressive, adding to the relentless pacing. The 3 hour runtime flies by. All in all I found it an intense, taxing but highly rewarding watch. This is film making at it finest. A really great watch."

print("New POSITIVE review:\n", new_positive_review)

pro = predict_text(new_positive_review, loaded_model)
status = "positive" if pro > 0.5 else "negative"
pro = (1 - pro) if status == "negative" else pro

print(f"\nPredicted sentiment is {status.upper()} with a probability of {pro * 100}%")

New POSITIVE review:
 You'll have to have your wits about you and your brain fully switched on watching Oppenheimer as it could easily get away from a nonattentive viewer. This is intelligent filmmaking which shows it's audience great respect. It fires dialogue packed with information at a relentless pace and jumps to very different times in Oppenheimer's life continuously through it's 3 hour runtime. There are visual clues to guide the viewer through these times but again you'll have to get to grips with these quite quickly. This relentlessness helps to express the urgency with which the US attacked it's chase for the atomic bomb before Germany could do the same. An absolute career best performance from (the consistenly brilliant) Cillian Murphy anchors the film. This is a nailed on Oscar performance. In fact the whole cast are fantastic (apart maybe for the sometimes overwrought Emily Blunt performance). RDJ is also particularly brilliant in a return to proper acting after his decade

In [24]:
# 5/10 IMDB Review
# https://www.imdb.com/title/tt15398776/reviews?sort=curated&dir=asc&ratingFilter=5
# REVIEWS FROM Oppenheimer (2023) NOT IN THE DATASET.

In [25]:
new_average_review = 'This must be the most overrated film of the year.Like every other typical American biographical movie, it glorifies its subject. According to Nolan, Oppenheimer is the most important person who ever lived. Really?The movie contains at least 1.5 hours of uninteresting courtroom drama.There are far too many characters.Not a single compelling dialogue about the moral impact of the bomb.The continuous music propels the film, but as a result there is zero emotional impact when needed.Excellent acting by all, though, and occasional nice directorial effects.Most ridiculous moment of the film: While having sex with Oppenheimer, Florence Pughs character randomly selects a sentence in a book in Sanskrit which just happens to be the infamous "Now I Am Become Death, the Destroyer of Worlds" quote.'

print("New AVERAGE review:\n", new_average_review)

pro = predict_text(new_average_review, loaded_model)
status = "positive" if pro > 0.5 else "negative"
pro = (1 - pro) if status == "negative" else pro

print(f"\nPredicted sentiment is {status.upper()} with a probability of {pro * 100}%")

New AVERAGE review:
 This must be the most overrated film of the year.Like every other typical American biographical movie, it glorifies its subject. According to Nolan, Oppenheimer is the most important person who ever lived. Really?The movie contains at least 1.5 hours of uninteresting courtroom drama.There are far too many characters.Not a single compelling dialogue about the moral impact of the bomb.The continuous music propels the film, but as a result there is zero emotional impact when needed.Excellent acting by all, though, and occasional nice directorial effects.Most ridiculous moment of the film: While having sex with Oppenheimer, Florence Pughs character randomly selects a sentence in a book in Sanskrit which just happens to be the infamous "Now I Am Become Death, the Destroyer of Worlds" quote.

Predicted sentiment is POSITIVE with a probability of 50.37773847579956%


In [26]:
# 1/10 IMDB Review
# https://www.imdb.com/title/tt15398776/reviews?sort=curated&dir=asc&ratingFilter=1
# REVIEWS FROM Oppenheimer (2023) NOT IN THE DATASET.

In [19]:
new_negative_review = 'After you watch Oppenheimer and leave the movie theater, you ask yourself: "What was the point of it?" Unfortunately the answer is not clear at all. I strongly beleive that a movie about the scientist who played a key role in the creation of the atomic bomb should unquestionably emphasize on the consequences of this invention and send strong peaceful messages to the audience! Especially in the world we live in today! However, it looks like in the new movie Oppenheimer they almost ignore the tragic facts that hundreds of thousands of innocent people died from the first 2 bombs and that the whole world changed after Oppenheim\'s invention, which was obviously the biggest drama in this one human being\'s life. They emphasize and build the story in the movie around the fact that Oppenheimer was later accused of being a spy of the Soviets (who were allies to the Americans in WW2...).I feel sad that this will be an award winning movie and that most people in the audience will say (or feel they have to say) "A! Oh!! Great movie!", because the director, writers, cast are famous and the topic is important - all these superlatives about a somewhat boring movie, decorated with some nudity (not clear either why this was needed and related to the topic) and overall pointless movie... Or, at least, much more pointless than it should be!'

print("New NEGATIVE review:\n", new_negative_review)

pro = predict_text(new_negative_review, loaded_model)
status = "positive" if pro > 0.5 else "negative"
pro = (1 - pro) if status == "negative" else pro

print(f"\nPredicted sentiment is {status.upper()} with a probability of {pro * 100}%")

New NEGATIVE review:
 After you watch Oppenheimer and leave the movie theater, you ask yourself: "What was the point of it?" Unfortunately the answer is not clear at all. I strongly beleive that a movie about the scientist who played a key role in the creation of the atomic bomb should unquestionably emphasize on the consequences of this invention and send strong peaceful messages to the audience! Especially in the world we live in today! However, it looks like in the new movie Oppenheimer they almost ignore the tragic facts that hundreds of thousands of innocent people died from the first 2 bombs and that the whole world changed after Oppenheim's invention, which was obviously the biggest drama in this one human being's life. They emphasize and build the story in the movie around the fact that Oppenheimer was later accused of being a spy of the Soviets (who were allies to the Americans in WW2...).I feel sad that this will be an award winning movie and that most people in the audience 