# Assignment 2

## 1. Loading packages and data

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.stem import PorterStemmer
import nltk
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec

nltk.download('stopwords')
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords

pd.set_option('display.max_colwidth', None)  # Allows columns to display their full content so we can read the full reviews
pd.set_option('display.width', 200) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kkasp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\kkasp\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [19]:
file_path = r'IMDB Dataset.csv'
data = pd.read_csv(file_path)
print(data.head(20))


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [20]:
X = data['review']

def transform_bin(x):
  if x == "positive":
    return 1
  else:
    return 0
data['sentiment'] = data['sentiment'].apply(lambda x : transform_bin(x))

y = data['sentiment']
sentiment_counts = y.value_counts()
print("Class distribution in 'sentiment':\n", sentiment_counts)
print("50/50 class distribution")

Class distribution in 'sentiment':
 sentiment
1    25000
0    25000
Name: count, dtype: int64
50/50 class distribution


In [21]:
# Data split (80% training, 20% validation)
train_data, X_test, train_label, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.2, random_state=42)

X_train = X_train.to_frame()
X_test = X_test.to_frame()
X_val = X_val.to_frame()

# Display the sizes of the datasets
print("Training Data Size:", len(X_train))
print("Validation Data Size:", len(X_val))
print("Test Data Size:", len(X_test))

Training Data Size: 32000
Validation Data Size: 8000
Test Data Size: 10000


In [22]:
# Training set distribution
train_counts = y_train.value_counts()
print("Class Distribution in Training Set:\n", train_counts)

# Validation set distribution
val_counts = y_val.value_counts()
print("\nClass Distribution in Validation Set:\n", val_counts)

# Test set distribution
test_counts = y_test.value_counts()
print("\nClass Distribution in Test Set:\n", test_counts)

Class Distribution in Training Set:
 sentiment
0    16080
1    15920
Name: count, dtype: int64

Class Distribution in Validation Set:
 sentiment
1    4041
0    3959
Name: count, dtype: int64

Class Distribution in Test Set:
 sentiment
1    5039
0    4961
Name: count, dtype: int64


## 2. Text pre-processing 

In [23]:
def clean_text(text):
    # Remove unusual or non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text into words for processing
    words = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]


    return filtered_words

X_train['cleaned_review'] = X_train['review'].apply(clean_text)
X_val['cleaned_review']  = X_val['review'].apply(clean_text)
X_test['cleaned_review']  = X_test['review'].apply(clean_text)


# Display the first few cleaned reviews
print("Cleaned Reviews with Stopwords Removed:")
print(X_train['cleaned_review'].head(5))

Cleaned Reviews with Stopwords Removed:
11794                                                                                                                                                                                                                                                           [fault, actors, put, great, performances, overall, story, well, executed, movie, opens, great, zinger, crazy, old, guy, forces, young, Aborigine, girls, car, road, forced, endure, 40, minutes, character, development, entirely, new, group, characters, dont, know, 40, minutes, turns, ones, eventually, discover, girls, body, story, progresses, therebr, br, story, pick, point, really, goes, nowhere, 2, hours, asked, point, see, characters, struggle, accusations, racism, stupidity, handled, discovery, story, ultimately, unsatisfying, felt, unfinished, well, acted, theres, strong, enough, backbone, film, warrant, recommending]
24925                                                                      

## 3. Further preprocessing: Features and Embeddings

#### Features

Here we create: Number of Positive lexicon words, Number of Negative lexicon words, Number of nos, and Number of words. After that the values were standardized

In [24]:
# 1. Lexicon Creation
positive_lexicon = set(opinion_lexicon.positive()) #list of positive words from the NLTK package
negative_lexicon = set(opinion_lexicon.negative()) #list of negative words from the NLTK package

# 2. Defining Functions
def counts_lex(review, type = 'neg'):
    count_neg = 0
    count_pos = 0
    for word in review:
      if word.lower() in positive_lexicon:
        count_pos += 1
      if word.lower() in negative_lexicon:
        count_neg += 1
    if type == 'neg':
      return count_neg
    elif type == 'pos':
      return count_pos

# Ensure reviews are tokenized, we use the review column as 'no' and 'not' gets removed by stopwords
def count_nos(review):
    count = 0
    words = review.split()  # Tokenize if review column is a single string
    for word in words:
        if word.lower() == "no" or word.lower() == "not":
            count += 1
    return count

# 3. Feature Extraction for Training, Validation, and Test Sets
X_train['NrPos'] = X_train['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_train['NrNeg'] = X_train['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_train['NrWords'] = X_train['cleaned_review'].apply(lambda x: len(x))
X_train['Nos'] = X_train['review'].apply(lambda x : count_nos(x))

X_val['NrPos'] = X_val['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_val['NrNeg'] = X_val['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_val['NrWords'] = X_val['cleaned_review'].apply(lambda x: len(x))
X_val['Nos'] = X_val['review'].apply(lambda x : count_nos(x))

X_test['NrPos'] = X_test['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_test['NrNeg'] = X_test['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_test['NrWords'] = X_test['cleaned_review'].apply(lambda x: len(x))
X_test['Nos'] = X_test['review'].apply(lambda x : count_nos(x))

# 4. Standardizing Features
cols = ['NrPos', 'NrNeg', 'NrWords', 'Nos']
train_features_to_transform = X_train[cols]
val_features_to_transform = X_val[cols]
test_features_to_transform = X_test[cols]

scaler = StandardScaler()
X_train_std = scaler.fit_transform(train_features_to_transform)  # X is your feature matrix
X_val_std = scaler.transform(val_features_to_transform)
X_test_std = scaler.transform(test_features_to_transform)

X_train[cols] = X_train_std
X_val[cols] = X_val_std
X_test[cols] = X_test_std

# 5. Stemming Words
# Apply stemming
stemmer = PorterStemmer()

X_train['Stem Words'] = X_train['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])
X_val['Stem Words'] = X_val['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])
X_test['Stem Words'] = X_test['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])

# 6. Output for debugging
# Display the first few cleaned reviews
print("Training Data with cleaned reviews with Stemming and New Features:")
print(X_train['cleaned_review'].head(20))

Training Data with cleaned reviews with Stemming and New Features:
11794                                                                                                                                                                                                                                                           [fault, actors, put, great, performances, overall, story, well, executed, movie, opens, great, zinger, crazy, old, guy, forces, young, Aborigine, girls, car, road, forced, endure, 40, minutes, character, development, entirely, new, group, characters, dont, know, 40, minutes, turns, ones, eventually, discover, girls, body, story, progresses, therebr, br, story, pick, point, really, goes, nowhere, 2, hours, asked, point, see, characters, struggle, accusations, racism, stupidity, handled, discovery, story, ultimately, unsatisfying, felt, unfinished, well, acted, theres, strong, enough, backbone, film, warrant, recommending]
24925                                           

In [25]:
# Save the preprocessed data in a csv file to save time if we want to re-run only the following part
X_train.to_csv("X_train.csv", index=False)
X_val.to_csv("X_val.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
X_train

Unnamed: 0,review,cleaned_review,NrPos,NrNeg,NrWords,Nos,Stem Words
11794,"With no fault to the actors (they all put on great performances), the overall story was not very well executed. The movie opens with a great zinger: a crazy old guy forces a young Aborigine girl's car off the road. But then, we're forced to endure 40 minutes of character development with an entirely new group of characters ... and we don't know why until the 40 minutes are up. It turns out that they are the ones who eventually discover the girl's body ... and the story progresses from there.<br /><br />While the story does pick up at that point, it really goes nowhere. After 2 hours, I asked myself: was there a point to this, or was it just to see the characters struggle with accusations of racism and stupidity of how they handled the discovery? The story was ultimately unsatisfying and felt unfinished. While it is well acted, there's not a strong enough backbone in the film to warrant recommending it.","[fault, actors, put, great, performances, overall, story, well, executed, movie, opens, great, zinger, crazy, old, guy, forces, young, Aborigine, girls, car, road, forced, endure, 40, minutes, character, development, entirely, new, group, characters, dont, know, 40, minutes, turns, ones, eventually, discover, girls, body, story, progresses, therebr, br, story, pick, point, really, goes, nowhere, 2, hours, asked, point, see, characters, struggle, accusations, racism, stupidity, handled, discovery, story, ultimately, unsatisfying, felt, unfinished, well, acted, theres, strong, enough, backbone, film, warrant, recommending]",-0.349058,-0.255423,-0.481820,0.761786,"[fault, actor, put, great, perform, overal, stori, well, execut, movi, open, great, zinger, crazi, old, guy, forc, young, aborigin, girl, car, road, forc, endur, 40, minut, charact, develop, entir, new, group, charact, dont, know, 40, minut, turn, one, eventu, discov, girl, bodi, stori, progress, therebr, br, stori, pick, point, realli, goe, nowher, 2, hour, ask, point, see, charact, struggl, accus, racism, stupid, handl, discoveri, stori, ultim, unsatisfi, felt, unfinish, well, act, there, strong, enough, backbon, film, warrant, recommend]"
24925,"The first thing I thought when I saw this films was: It is not really a film, at least it is not what we imagine spontaneously when we hear the word ""film"". it is entirely symbolic, everything in it has a figurative meaning. So if you are not used to express thing in a symbolic way, you will find it strange, if you are not acquainted with philosophy, religion, spiritual life, you will think it's just a fairy-tale... and even a weird one, chaotic. For me ""The legend of Zu"" is perfectly transparent. And I do like it. It tells us in images the story about the fight between light and darkness, the fight that is as old as humanity, and every one who is in search of the sens in this life is confronted with it. The film is obviously made by Buddhists. I am not a Buddhist. My religion and the vision of the world and human is different. But as far as we are all humans and have the same human nature we necessarily have common experiences and can understand each other. It is a really beautiful film! And I which we had more films like this - films that have a meaning. There are too many empty stories which are good only to make time pass more quickly.","[first, thing, thought, saw, films, really, film, least, imagine, spontaneously, hear, word, film, entirely, symbolic, everything, figurative, meaning, used, express, thing, symbolic, way, find, strange, acquainted, philosophy, religion, spiritual, life, think, fairytale, even, weird, one, chaotic, legend, Zu, perfectly, transparent, like, tells, us, images, story, fight, light, darkness, fight, old, humanity, every, one, search, sens, life, confronted, film, obviously, made, Buddhists, Buddhist, religion, vision, world, human, different, far, humans, human, nature, necessarily, common, experiences, understand, really, beautiful, film, films, like, films, meaning, many, empty, stories, good, make, time, pass, quickly]",-0.349058,-0.623733,-0.351669,1.839063,"[first, thing, thought, saw, film, realli, film, least, imagin, spontan, hear, word, film, entir, symbol, everyth, figur, mean, use, express, thing, symbol, way, find, strang, acquaint, philosophi, religion, spiritu, life, think, fairytal, even, weird, one, chaotic, legend, zu, perfectli, transpar, like, tell, us, imag, stori, fight, light, dark, fight, old, human, everi, one, search, sen, life, confront, film, obvious, made, buddhist, buddhist, religion, vision, world, human, differ, far, human, human, natur, necessarili, common, experi, understand, realli, beauti, film, film, like, film, mean, mani, empti, stori, good, make, time, pass, quickli]"
28578,"Post-feminist depiction of cruelty and sadism.<br /><br />Spoiler alert! <br /><br />This underrated gem of a film tells the story of Flavia, a Fifteenth Century girl of Noble birth walled up in a convent after defining her father and indeed the whole of Medieval Christian society by viewing a fallen Islamic warrior as a human rather than demonic figure.<br /><br />Unable to accept the patriarchal rule of the convent (explicitly stated in a scene where the Bishop arrives flanked by soldiers and monks) Flavia begins to explicitly question the society in which she finds herself and, through butting up against a whole system of subjugation, repression and violence, inevitably brings a tragic end not only to herself but all those around her.<br /><br />Billed as a piece of nunsploitation this is far from the truth. This is a film depiction the consequences of violence, the effects of patriarchal dominance, the nature of rebellion and the corruption of the human spirit.<br /><br />I described it in the title of this piece as 'post-feminist' and in the end Flavia's triumphs must always be corrupted, compromised and perverted by men. Even Flavia's gruesome end is perpetrated by men for men (the women turn away and only the monks look on without horror.<br /><br />As to the much discussed violence: this is a depiction of the effects of violence and the horrors of a world driven mad by religious excess. To have shied away from the violence would have limited the film's impact, would have cheapened the film and allowed it to be assimilated within the Patriarchal discourse it is exposing. In addition it is a realistic portrait of medieval society.<br /><br />Beautifully filmed, brilliantly acted (notably by Florinda Bolkin and Maria Casares), containing a wonderful score by piovani and still challenging after all these years Flavia is a classic of European Cinema.","[Postfeminist, depiction, cruelty, sadismbr, br, Spoiler, alert, br, br, underrated, gem, film, tells, story, Flavia, Fifteenth, Century, girl, Noble, birth, walled, convent, defining, father, indeed, whole, Medieval, Christian, society, viewing, fallen, Islamic, warrior, human, rather, demonic, figurebr, br, Unable, accept, patriarchal, rule, convent, explicitly, stated, scene, Bishop, arrives, flanked, soldiers, monks, Flavia, begins, explicitly, question, society, finds, butting, whole, system, subjugation, repression, violence, inevitably, brings, tragic, end, around, herbr, br, Billed, piece, nunsploitation, far, truth, film, depiction, consequences, violence, effects, patriarchal, dominance, nature, rebellion, corruption, human, spiritbr, br, described, title, piece, postfeminist, end, Flavias, triumphs, must, always, corrupted, compromised, perverted, ...]",-0.216966,0.726737,0.516006,-0.315491,"[postfeminist, depict, cruelti, sadismbr, br, spoiler, alert, br, br, underr, gem, film, tell, stori, flavia, fifteenth, centuri, girl, nobl, birth, wall, convent, defin, father, inde, whole, mediev, christian, societi, view, fallen, islam, warrior, human, rather, demon, figurebr, br, unabl, accept, patriarch, rule, convent, explicitli, state, scene, bishop, arriv, flank, soldier, monk, flavia, begin, explicitli, question, societi, find, but, whole, system, subjug, repress, violenc, inevit, bring, tragic, end, around, herbr, br, bill, piec, nunsploit, far, truth, film, depict, consequ, violenc, effect, patriarch, domin, natur, rebellion, corrupt, human, spiritbr, br, describ, titl, piec, postfeminist, end, flavia, triumph, must, alway, corrupt, compromis, pervert, ...]"
13987,"OMG this is one of the worst films iv ever seen and iv seen a lot I'm a Film student. I don't understand why Angelina Jolie would be in this movie? Did she need the money that badly? I love AJ and have seen almost everything shes ever been in so i watched this 2 tick another one off. It was SOO bad! not even good bad, just bad bad. It had 1 or 2 funny little moments but all in all it was bad n a waste of 101 minutes. I cant even say AJ looked good in it because well she didn't. The plot is predictable unless you r expecting a re-telling of Romeo and Juliet then its not. All round disappointing. Maybe if your 12 this could be a good film otherwise I really don't recommend it.","[OMG, one, worst, films, iv, ever, seen, iv, seen, lot, Im, Film, student, dont, understand, Angelina, Jolie, would, movie, need, money, badly, love, AJ, seen, almost, everything, shes, ever, watched, 2, tick, another, one, SOO, bad, even, good, bad, bad, bad, 1, 2, funny, little, moments, bad, n, waste, 101, minutes, cant, even, say, AJ, looked, good, well, didnt, plot, predictable, unless, r, expecting, retelling, Romeo, Juliet, round, disappointing, Maybe, 12, could, good, film, otherwise, really, dont, recommend]",-0.481151,0.235657,-0.481820,-0.315491,"[omg, one, worst, film, iv, ever, seen, iv, seen, lot, im, film, student, dont, understand, angelina, joli, would, movi, need, money, badli, love, aj, seen, almost, everyth, she, ever, watch, 2, tick, anoth, one, soo, bad, even, good, bad, bad, bad, 1, 2, funni, littl, moment, bad, n, wast, 101, minut, cant, even, say, aj, look, good, well, didnt, plot, predict, unless, r, expect, retel, romeo, juliet, round, disappoint, mayb, 12, could, good, film, otherwis, realli, dont, recommend]"
7693,"The Box is a film with great potential, but the makers totally misused that potential. The film seemed to take for ever, because of the boring family dinners and scenes about school and job-dialogs between the action. Those scenes could and must be deleted in my opinion to keep up the tensity and thrill. The philosophy of human free will has potential and seems to referring to the philosophy of Thomas Hobbes (1588-1679), but we find ourselves regretfully struck with magic and nosebleeds, were even Harry Potter would flunked his class with!<br /><br />Probably the best part was that moment when Norma Lewis (Cameron Diaz)has been shot to death, by her loving and caring husband as an act of human free will. I wonder how Hobbes would react if he could...","[Box, film, great, potential, makers, totally, misused, potential, film, seemed, take, ever, boring, family, dinners, scenes, school, jobdialogs, action, scenes, could, must, deleted, opinion, keep, tensity, thrill, philosophy, human, free, potential, seems, referring, philosophy, Thomas, Hobbes, 15881679, find, regretfully, struck, magic, nosebleeds, even, Harry, Potter, would, flunked, class, withbr, br, Probably, best, part, moment, Norma, Lewis, Cameron, Diazhas, shot, death, loving, caring, husband, act, human, free, wonder, Hobbes, would, react, could]",-0.216966,-0.623733,-0.557742,-0.854129,"[box, film, great, potenti, maker, total, misus, potenti, film, seem, take, ever, bore, famili, dinner, scene, school, jobdialog, action, scene, could, must, delet, opinion, keep, tensiti, thrill, philosophi, human, free, potenti, seem, refer, philosophi, thoma, hobb, 15881679, find, regret, struck, magic, noseble, even, harri, potter, would, flunk, class, withbr, br, probabl, best, part, moment, norma, lewi, cameron, diazha, shot, death, love, care, husband, act, human, free, wonder, hobb, would, react, could]"
...,...,...,...,...,...,...,...
27517,"This is one creepy underrated Gem with chilling performances and a fantastic finale!. All the characters are great, and the story was awesome, plus i thought the ending was really cool!. The plot was great, and it never bored me, plus while the child actors were bad, they gave me the creeps!. This happened to be on the space channel a while ago, so i decided to check it out and tape it, i read some good reviews from fellow horror fans, i must say i agree with them, it's very creepy, and suspenseful, plus Strother Martin, was fantastic in his role, as the Satan worshiper. It has tons of creepy atmosphere, and it keeps you guessing throughout, plus all the characters were very likable, and you really start to root for Ben and his family!. It has plenty of disturbing moments, and the film really shocked me at times, plus, it's extremely well made on a low budget!. This is one creepy underrated gem, with chilling performances and a fantastic finale!, i highly recommend this one!. The Direction is very good!. Bernard McEveety does a very good job here, with great camera work, creating a lot of creepy atmosphere, and keeping the film at a very fast pace!. Ther is a little bit of blood and gore. We get a severed leg,lots of bloody corpses,bloody slit throat, slicing and dicing,decapitation, and an impaling. The Acting is excellent!. Strother Martin is fantastic here! as the Satan worshiper, he is extremely creepy, very convincing, was quite chilling, was extremely intense, seemed to be enjoying himself, and just did a fantastic job overall!. Charles Bateman is great as the Dad, he was very caring, very likable, and gave a good show!, i liked him lots. L.Q. Jones is awesome as the Sheriff, he was funny, on top of things, looked very young, had a cool character, and just did an awesome job overall!. Ahna Capri is good as the girlfriend and did what she had to do pretty well. Charles Robinson overacted to the extreme as the Priest and didn't convince me one bit!, and that laugh of his was especially bad. Geri Reischl is actually decent as the daughter, she was somewhat likable, and only got on my nerves a couple times, i rather liked her. Alvy Moore was goofy, but very likable in his role as Tobey i dug him!. Rest of the cast do good. Overall i highly recommend it!. ***1/2 out of 5","[one, creepy, underrated, Gem, chilling, performances, fantastic, finale, characters, great, story, awesome, plus, thought, ending, really, cool, plot, great, never, bored, plus, child, actors, bad, gave, creeps, happened, space, channel, ago, decided, check, tape, read, good, reviews, fellow, horror, fans, must, say, agree, creepy, suspenseful, plus, Strother, Martin, fantastic, role, Satan, worshiper, tons, creepy, atmosphere, keeps, guessing, throughout, plus, characters, likable, really, start, root, Ben, family, plenty, disturbing, moments, film, really, shocked, times, plus, extremely, well, made, low, budget, one, creepy, underrated, gem, chilling, performances, fantastic, finale, highly, recommend, one, Direction, good, Bernard, McEveety, good, job, great, camera, work, creating, ...]",4.142082,1.095047,0.971536,-0.854129,"[one, creepi, underr, gem, chill, perform, fantast, final, charact, great, stori, awesom, plu, thought, end, realli, cool, plot, great, never, bore, plu, child, actor, bad, gave, creep, happen, space, channel, ago, decid, check, tape, read, good, review, fellow, horror, fan, must, say, agre, creepi, suspens, plu, strother, martin, fantast, role, satan, worship, ton, creepi, atmospher, keep, guess, throughout, plu, charact, likabl, realli, start, root, ben, famili, plenti, disturb, moment, film, realli, shock, time, plu, extrem, well, made, low, budget, one, creepi, underr, gem, chill, perform, fantast, final, highli, recommend, one, direct, good, bernard, mceveeti, good, job, great, camera, work, creat, ...]"
28392,"The final chapter in the Hanzo the Razor trilogy provides fitting closure for this entertaining series of samuraisploitation. Inoue replaces Yasuzu Masumura (Blind Beast, Red Angel, Manji) in the director's chair, but the style is pretty much the same, perhaps due to Shintaro Katsu serving as the producer, apart from the titular antihero.<br /><br />Hanzo uncovers a female ghost who is guarding treasure hidden in the bottom of the lake. Of course, Hanzo being Hanzo, he's not put off by the fact she's a ghost, so he proceeds to rape... ahem, interrogate her, using the now familiar revolving net device. The plot takes through a series of blind monks who also doubletime as loansharks, corrupt officials, promiscuous wives and the necessary hack and slash. Hanzo's superior officer, Onishi, and his two servants, provide the typical comedic notes, and generally, it's business as usual.<br /><br />Significantly less convoluted and easier to follow than the first (which is all over the place and a bit of a mess), less stylish, dramatic and bloody than the second (arguably the finest in the Hanzo series), but still entertaining and worthwhile on its own merits. Complete with trademark training sequences, the obligatory rape, swordfights, and a mystery Hanzo is called upon to investigate, this will ultimately satisfy the fans.","[final, chapter, Hanzo, Razor, trilogy, provides, fitting, closure, entertaining, series, samuraisploitation, Inoue, replaces, Yasuzu, Masumura, Blind, Beast, Red, Angel, Manji, directors, chair, style, pretty, much, perhaps, due, Shintaro, Katsu, serving, producer, apart, titular, antiherobr, br, Hanzo, uncovers, female, ghost, guarding, treasure, hidden, bottom, lake, course, Hanzo, Hanzo, hes, put, fact, shes, ghost, proceeds, rape, ahem, interrogate, using, familiar, revolving, net, device, plot, takes, series, blind, monks, also, doubletime, loansharks, corrupt, officials, promiscuous, wives, necessary, hack, slash, Hanzos, superior, officer, Onishi, two, servants, provide, typical, comedic, notes, generally, business, usualbr, br, Significantly, less, convoluted, easier, follow, first, place, bit, mess, less, ...]",0.311404,0.235657,0.049631,-0.315491,"[final, chapter, hanzo, razor, trilog, provid, fit, closur, entertain, seri, samuraisploit, inou, replac, yasuzu, masumura, blind, beast, red, angel, manji, director, chair, style, pretti, much, perhap, due, shintaro, katsu, serv, produc, apart, titular, antiherobr, br, hanzo, uncov, femal, ghost, guard, treasur, hidden, bottom, lake, cours, hanzo, hanzo, he, put, fact, she, ghost, proce, rape, ahem, interrog, use, familiar, revolv, net, devic, plot, take, seri, blind, monk, also, doubletim, loanshark, corrupt, offici, promiscu, wive, necessari, hack, slash, hanzo, superior, offic, onishi, two, servant, provid, typic, comed, note, gener, busi, usualbr, br, significantli, less, convolut, easier, follow, first, place, bit, mess, less, ...]"
5776,"I just saw this movie and all I can say is, where are the drive in's these days. This seems like it would have been a great 2nd feature at a drive in in 1977 (maybe playing with one of those Joan Collins movies), but it's only worth watching now if you're feeling nostalgic for the 70's. Silly plot that is full of holes, but it does remind one of the era it was made in. Interesting to see Melanie Griffith so young and Anne Lockhart is quite attractive, though not much of an actress. In fact, there is not much acting going on in this movie at all. It's sort of a Dukes of Hazzard adventure without a twang or a 1969 Dodge charger jumping over stuff in the Woods. But there is a Mecrury Comet jumping over a garbage dump in this one!","[saw, movie, say, drive, ins, days, seems, like, would, great, 2nd, feature, drive, 1977, maybe, playing, one, Joan, Collins, movies, worth, watching, youre, feeling, nostalgic, 70s, Silly, plot, full, holes, remind, one, era, made, Interesting, see, Melanie, Griffith, young, Anne, Lockhart, quite, attractive, though, much, actress, fact, much, acting, going, movie, sort, Dukes, Hazzard, adventure, without, twang, 1969, Dodge, charger, jumping, stuff, Woods, Mecrury, Comet, jumping, garbage, dump, one]",-0.613243,-0.623733,-0.579433,0.223148,"[saw, movi, say, drive, in, day, seem, like, would, great, 2nd, featur, drive, 1977, mayb, play, one, joan, collin, movi, worth, watch, your, feel, nostalg, 70, silli, plot, full, hole, remind, one, era, made, interest, see, melani, griffith, young, ann, lockhart, quit, attract, though, much, actress, fact, much, act, go, movi, sort, duke, hazzard, adventur, without, twang, 1969, dodg, charger, jump, stuff, wood, mecruri, comet, jump, garbag, dump, one]"
24864,"Cameron Diaz is a woman who is married to a judge, played by Harvey Keitel, whose life is fine until an ex shows up and things get a little complicated.. While I was watching this movie there were several times i asked myself why I was doing so..because the movie is so ridiculous and blah and poorly scripted without any believability. Nor does the audience really car what happens..Even the lovely Cameron can't save this one on a scale of one to ten..2","[Cameron, Diaz, woman, married, judge, played, Harvey, Keitel, whose, life, fine, ex, shows, things, get, little, complicated, watching, movie, several, times, asked, sobecause, movie, ridiculous, blah, poorly, scripted, without, believability, audience, really, car, happensEven, lovely, Cameron, cant, save, one, scale, one, ten2]",-1.009520,-0.623733,-0.872274,-0.854129,"[cameron, diaz, woman, marri, judg, play, harvey, keitel, whose, life, fine, ex, show, thing, get, littl, complic, watch, movi, sever, time, ask, sobecaus, movi, ridicul, blah, poorli, script, without, believ, audienc, realli, car, happenseven, love, cameron, cant, save, one, scale, one, ten2]"


Making Embeddings

When making the embeddings, we choose values of 4 for window and 1 for sg. 1 was chosen for sg to use SkipGram. The data has 40000 reviews, and even though this is sufficient to train a Word2Vec model, rare or domain-specific sentiment words may not be well captured by a CBOW. The window chosen was 4 because the sentiment is often localized in very few words.

In [26]:
reviews = list(X_train['Stem Words'])
model = Word2Vec(
    sentences=reviews,  # Tokenized data
    vector_size=100,             # Embedding dimensionality
    window=4,                    # Context window size
    min_count=2,                 # Minimum frequency for words to be included
    sg=1,                        # Use CBOW (set to 1 for Skip-Gram)
    workers=4,                   # Number of threads for faster training
    epochs=10                    # Number of passes over the data
)

# Save the model for future use
model.save("word2vec_cbow.model")

In [27]:
model.wv['film']

array([-0.0009694 ,  0.19847532, -0.08400735, -0.13118428,  0.40814745,
       -0.17674711,  0.01580076,  0.34279191, -0.2369714 , -0.24268194,
       -0.17975253, -0.53774995, -0.0188964 ,  0.32524678,  0.0413796 ,
       -0.057226  ,  0.13068002,  0.2682956 , -0.20714618, -0.24078478,
        0.15226355, -0.11574178,  0.16071515, -0.17458622,  0.03153089,
       -0.14883302, -0.33633927,  0.2563716 , -0.40459186,  0.08784578,
        0.31675297,  0.04309266, -0.13166523, -0.22508906, -0.12119685,
        0.46627486,  0.39056936, -0.0638577 ,  0.06832714,  0.02203305,
        0.29075542, -0.1774595 , -0.1122286 ,  0.01611961, -0.04348327,
        0.22308953, -0.01685513, -0.41038564,  0.0225801 , -0.09462918,
        0.18233944, -0.05162655, -0.02161762, -0.03642436, -0.09965254,
        0.15291524,  0.21992181, -0.2658345 , -0.18769139,  0.2091243 ,
       -0.01474115, -0.00814192,  0.15608463,  0.10830636, -0.23578224,
        0.21923047,  0.15001787,  0.18081728, -0.20508568,  0.20

In [28]:
model.wv.most_similar('film')

[('movi', 0.8962374329566956),
 ('shortfilm', 0.7833355665206909),
 ('telemovi', 0.760982871055603),
 ('filmit', 0.7597492933273315),
 ('towelhead', 0.7572776079177856),
 ('cassavettess', 0.7563760280609131),
 ('filmbr', 0.7530413269996643),
 ('generalis', 0.748876690864563),
 ('directtodvd', 0.7481429576873779),
 ('swordsandsorceri', 0.7469770908355713)]

'film' and 'movi' (movie) have similar vectors indicating this embedding seems to have worked

In [29]:
def transform_review_to_vector(review, model): #Create 1 vector of the text based on the Word2Vec vectors of the words in the sentence
    word_vectors = []
    for word in review:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    if len(word_vectors) == 0:  # Handle case where no words are in vocabulary
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

## 4. Creating our FNN

Developping models: Embeddings, Features, Embeddings + Features. The architecture chosen starts at a high number of neurons, and then goes down gradually over the layers, so that it extracts more and more high level features. The data is balanced (same ratio for positives and negatives), so we use accuracy for model selection. The number of neurons for the model with only features was reduced (8 -> 4 -> 2)

In [30]:
X_train.shape

(32000, 7)

In [31]:
def prepare_data(choice, data, cols, model):
    if choice["Embedding"] == 1:
      data["Embedded Data"] = data["Stem Words"].apply(lambda x : transform_review_to_vector(x, model))
      df_embeddings = pd.DataFrame(data['Embedded Data'].tolist(), index=data.index)
    else:
      df_embeddings = pd.DataFrame()

    if choice["Features"] == 1:
      df_features = data[cols]
    else:
      df_features = pd.DataFrame()

    final_df = pd.concat([df_embeddings, df_features], axis=1)
    return final_df.to_numpy()
#Feedforward Neural Network (FNN)
choice_params = [{"Embedding" : 1, "Features" : 0}, {"Embedding" : 0, "Features" : 1}, {"Embedding" : 1, "Features" : 1}]
network_sizes = {(0, 1): (8, 4, 2)} #if we only use the features (and not the embedding), we have a smaller model
results = {}
for option in choice_params:
  X_train_opt = prepare_data(option, X_train, cols, model)
  X_val_opt = prepare_data(option, X_val, cols, model)
  X_test_opt = prepare_data(option, X_test, cols, model)


  # Input size depends on your feature representation (e.g., TF-IDF size or embedding size)
  input_dim = X_train_opt.shape[1]
  tuple_option = (option["Embedding"], option["Features"])
  if tuple_option in network_sizes.keys():
      # Define the FNN
    fnn = Sequential([
      Dense(network_sizes[tuple_option][0], activation='relu', input_shape=(input_dim,)),  # Input layer
      Dropout(0.5),  # Dropout for regularization
      Dense(network_sizes[tuple_option][1], activation='relu'),  # Hidden layer
      Dropout(0.4),
      Dense(network_sizes[tuple_option][2], activation = 'relu'),
      Dropout(0.3),
      Dense(1, activation='sigmoid')  # Output layer for binary classification
      ])
  else:
    fnn = Sequential([
      Dense(128, activation='relu', input_shape=(input_dim,)),  # Input layer
      Dropout(0.5),  # Dropout for regularization
      Dense(64, activation='relu'),  # Hidden layer
      Dropout(0.4),
      Dense(32, activation = 'relu'),
      Dropout(0.3),
      Dense(1, activation='sigmoid')  # Output layer for binary classification
      ])



      # Compile the model
  fnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
  history = fnn.fit(X_train_opt, y_train, validation_data=(X_val_opt, y_val), batch_size = 64, epochs=50, callbacks=[early_stopping])
  last_val_accuracy = history.history['val_accuracy'][-1]
  results[tuple_option] = last_val_accuracy

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.6860 - loss: 0.5611 - val_accuracy: 0.8666 - val_loss: 0.3264
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8457 - loss: 0.3737 - val_accuracy: 0.8677 - val_loss: 0.3172
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8534 - loss: 0.3549 - val_accuracy: 0.8730 - val_loss: 0.3065
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8562 - loss: 0.3483 - val_accuracy: 0.8687 - val_loss: 0.3097
Epoch 5/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8560 - loss: 0.3459 - val_accuracy: 0.8717 - val_loss: 0.3039
Epoch 6/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8575 - loss: 0.3409 - val_accuracy: 0.8749 - val_loss: 0.3036
Epoch 7/50
[1m500/500[0m [32m━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5674 - loss: 0.6787 - val_accuracy: 0.7286 - val_loss: 0.6136
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6005 - loss: 0.6510 - val_accuracy: 0.7344 - val_loss: 0.5927
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6184 - loss: 0.6355 - val_accuracy: 0.7350 - val_loss: 0.5870
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6173 - loss: 0.6340 - val_accuracy: 0.7352 - val_loss: 0.5784
Epoch 5/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6271 - loss: 0.6262 - val_accuracy: 0.7383 - val_loss: 0.5747
Epoch 6/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6322 - loss: 0.6218 - val_accuracy: 0.7356 - val_loss: 0.5728
Epoch 7/50
[1m500/500[0m [32m━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7109 - loss: 0.5527 - val_accuracy: 0.8504 - val_loss: 0.3483
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8375 - loss: 0.3813 - val_accuracy: 0.8687 - val_loss: 0.3190
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8482 - loss: 0.3601 - val_accuracy: 0.8668 - val_loss: 0.3109
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8576 - loss: 0.3459 - val_accuracy: 0.8733 - val_loss: 0.3079
Epoch 5/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8642 - loss: 0.3314 - val_accuracy: 0.8735 - val_loss: 0.3038
Epoch 6/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8589 - loss: 0.3350 - val_accuracy: 0.8742 - val_loss: 0.3036
Epoch 7/50
[1m500/500[0m 

In [32]:
results

{(1, 0): 0.8733749985694885,
 (0, 1): 0.7394999861717224,
 (1, 1): 0.8766250014305115}

(1, 0): The model is trained using only embeddings and achieves an accuracy of approximately 87.34%.

(0, 1): The model uses only the predefined feature columns and achieves an accuracy of approximately 73.95%.

(1, 1): The model combines both embeddings and feature columns and achieves an accuracy of approximately 87.66%.

The difference between the model with only embeddings and the model with embeddings and feature columns is rather small. However, the latter model performs best.

## 5. Results

In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [34]:
y_pred_probs = fnn.predict(X_test_opt)  # Predicted probabilities
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert probabilities to binary labels (0 or 1)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Precision: 0.8743728677503512
Recall: 0.8646556856519151
F1 Score: 0.869487128317701


We achieve a precision of 87.44%, a recall score of 86.47% and an F1 score of 86.95%. These high scores indicate our model is quite strong in predicting the sentiment of reviews based on the review text.