### Assignment 2

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.stem import PorterStemmer
import nltk
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec



nltk.download('stopwords')
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [70]:
file_path = r'IMDB Dataset.csv'
data = pd.read_csv(file_path)
print(data.head(20))


                                               review sentiment
0   One of the other reviewers has mentioned that ...  positive
1   A wonderful little production. <br /><br />The...  positive
2   I thought this was a wonderful way to spend ti...  positive
3   Basically there's a family where a little boy ...  negative
4   Petter Mattei's "Love in the Time of Money" is...  positive
5   Probably my all-time favorite movie, a story o...  positive
6   I sure would like to see a resurrection of a u...  positive
7   This show was an amazing, fresh & innovative i...  negative
8   Encouraged by the positive comments about this...  negative
9   If you like original gut wrenching laughter yo...  positive
10  Phil the Alien is one of those quirky films wh...  negative
11  I saw this movie when I was about 12 when it c...  negative
12  So im not a big fan of Boll's work but then ag...  negative
13  The cast played Shakespeare.<br /><br />Shakes...  negative
14  This a fantastic movie of three pris

In [71]:
X = data['review']

def transform_bin(x):
  if x == "positive":
    return 1
  else:
    return 0
data['sentiment'] = data['sentiment'].apply(lambda x : transform_bin(x))

y = data['sentiment']


In [72]:
# Data split (80% training, 20% validation)
train_data, X_test, train_label, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.2, random_state=42)

X_train = X_train.to_frame()
X_test = X_test.to_frame()
X_val = X_val.to_frame()

# Display the sizes of the datasets
print("Training Data Size:", len(X_train))
print("Validation Data Size:", len(X_val))
print("Test Data Size:", len(X_test))

Training Data Size: 32000
Validation Data Size: 8000
Test Data Size: 10000


##### Text pre-processing

In [61]:
X_train['review']

Unnamed: 0,review
11794,With no fault to the actors (they all put on g...
24925,The first thing I thought when I saw this film...
28578,Post-feminist depiction of cruelty and sadism....
13987,OMG this is one of the worst films iv ever see...
7693,"The Box is a film with great potential, but th..."
...,...
27517,This is one creepy underrated Gem with chillin...
28392,The final chapter in the Hanzo the Razor trilo...
5776,"I just saw this movie and all I can say is, wh..."
24864,Cameron Diaz is a woman who is married to a ju...


In [73]:
def clean_text(text):
    # Remove unusual or non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text into words for processing
    words = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]


    return filtered_words

X_train['cleaned_review'] = X_train['review'].apply(clean_text)
X_val['cleaned_review']  = X_val['review'].apply(clean_text)
X_test['cleaned_review']  = X_test['review'].apply(clean_text)


# Display the first few cleaned reviews
print("Cleaned Reviews with Stopwords Removed:")
print(X_train['cleaned_review'].head(20))

Cleaned Reviews with Stopwords Removed:
11794    [fault, actors, put, great, performances, over...
24925    [first, thing, thought, saw, films, really, fi...
28578    [Postfeminist, depiction, cruelty, sadismbr, b...
13987    [OMG, one, worst, films, iv, ever, seen, iv, s...
7693     [Box, film, great, potential, makers, totally,...
8561     [fortunate, year, seen, several, films, univer...
35645    [premise, anime, series, bread, things, base, ...
17976    [movie, probably, overall, meditative, quality...
47966    [rule, try, find, much, films, possibly, enjoy...
24453    [Based, little, seen, show, dont, think, ever,...
48518    [Possibly, worst, movie, ever, seen, Pathetic,...
40233    [Ashley, Judd, early, role, think, first, star...
8428     [handful, alternative, titles, English, Sexorc...
41242    [Though, Cher, Cage, focal, points, story, Gar...
10521    [Divorced, single, mom, picturesque, seaside, ...
35760    [opening, scenes, FIERCE, PEOPLE, interplay, t...
8486     [movie,

Further preprocessing: Features and Embeddings

Features

Here we create: Number of Positive lexicon words, Number of Negative lexicon words, Number of nos, and Number of words. After that the values were standardized

In [74]:
positive_lexicon = set(opinion_lexicon.positive())
negative_lexicon = set(opinion_lexicon.negative())

def counts_lex(review, type = 'neg'):
    count_neg = 0
    count_pos = 0
    for word in review:
      if word.lower() in positive_lexicon:
        count_pos += 1
      if word.lower() in negative_lexicon:
        count_neg += 1
    if type == 'neg':
      return count_neg
    elif type == 'pos':
      return count_pos

def count_nos(review):
    count = 0
    for word in review:
        if word == "no" and word == "not":
          count += 1
    return count

X_train['NrPos'] = X_train['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_train['NrNeg'] = X_train['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_train['NrWords'] = X_train['cleaned_review'].apply(lambda x: len(x))
X_train['Nos'] = X_train['cleaned_review'].apply(lambda x : count_nos(x))

X_val['NrPos'] = X_val['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_val['NrNeg'] = X_val['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_val['NrWords'] = X_val['cleaned_review'].apply(lambda x: len(x))
X_val['Nos'] = X_val['cleaned_review'].apply(lambda x : count_nos(x))

X_test['NrPos'] = X_test['cleaned_review'].apply(lambda x : counts_lex(x, type = "pos"))
X_test['NrNeg'] = X_test['cleaned_review'].apply(lambda x : counts_lex(x, type = "neg"))
X_test['NrWords'] = X_test['cleaned_review'].apply(lambda x: len(x))
X_test['Nos'] = X_test['cleaned_review'].apply(lambda x : count_nos(x))

cols = ['NrPos', 'NrNeg', 'NrWords', 'Nos']
train_features_to_transform = X_train[cols]
val_features_to_transform = X_val[cols]
test_features_to_transform = X_test[cols]

scaler = StandardScaler()
X_train_std = scaler.fit_transform(train_features_to_transform)  # X is your feature matrix
X_val_std = scaler.transform(val_features_to_transform)
X_test_std = scaler.transform(test_features_to_transform)

X_train[cols] = X_train_std
X_val[cols] = X_val_std
X_test[cols] = X_test_std

# Apply stemming
stemmer = PorterStemmer()

X_train['Stem Words'] = X_train['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])
X_val['Stem Words'] = X_val['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])
X_test['Stem Words'] = X_test['cleaned_review'].apply(lambda x : [stemmer.stem(word.lower()) for word in x])


# Display the first few cleaned reviews
print("Training Data with cleaned reviews with Stemming and New Features:")
print(X_train['cleaned_review'].head(20))

Training Data with cleaned reviews with Stemming and New Features:
11794    [fault, actors, put, great, performances, over...
24925    [first, thing, thought, saw, films, really, fi...
28578    [Postfeminist, depiction, cruelty, sadismbr, b...
13987    [OMG, one, worst, films, iv, ever, seen, iv, s...
7693     [Box, film, great, potential, makers, totally,...
8561     [fortunate, year, seen, several, films, univer...
35645    [premise, anime, series, bread, things, base, ...
17976    [movie, probably, overall, meditative, quality...
47966    [rule, try, find, much, films, possibly, enjoy...
24453    [Based, little, seen, show, dont, think, ever,...
48518    [Possibly, worst, movie, ever, seen, Pathetic,...
40233    [Ashley, Judd, early, role, think, first, star...
8428     [handful, alternative, titles, English, Sexorc...
41242    [Though, Cher, Cage, focal, points, story, Gar...
10521    [Divorced, single, mom, picturesque, seaside, ...
35760    [opening, scenes, FIERCE, PEOPLE, inter

In [75]:
X_train

Unnamed: 0,review,cleaned_review,NrPos,NrNeg,NrWords,Nos,Stem Words
11794,With no fault to the actors (they all put on g...,"[fault, actors, put, great, performances, over...",-0.349058,-0.255423,-0.481820,0.0,"[fault, actor, put, great, perform, overal, st..."
24925,The first thing I thought when I saw this film...,"[first, thing, thought, saw, films, really, fi...",-0.349058,-0.623733,-0.351669,0.0,"[first, thing, thought, saw, film, realli, fil..."
28578,Post-feminist depiction of cruelty and sadism....,"[Postfeminist, depiction, cruelty, sadismbr, b...",-0.216966,0.726737,0.516006,0.0,"[postfeminist, depict, cruelti, sadismbr, br, ..."
13987,OMG this is one of the worst films iv ever see...,"[OMG, one, worst, films, iv, ever, seen, iv, s...",-0.481151,0.235657,-0.481820,0.0,"[omg, one, worst, film, iv, ever, seen, iv, se..."
7693,"The Box is a film with great potential, but th...","[Box, film, great, potential, makers, totally,...",-0.216966,-0.623733,-0.557742,0.0,"[box, film, great, potenti, maker, total, misu..."
...,...,...,...,...,...,...,...
27517,This is one creepy underrated Gem with chillin...,"[one, creepy, underrated, Gem, chilling, perfo...",4.142082,1.095047,0.971536,0.0,"[one, creepi, underr, gem, chill, perform, fan..."
28392,The final chapter in the Hanzo the Razor trilo...,"[final, chapter, Hanzo, Razor, trilogy, provid...",0.311404,0.235657,0.049631,0.0,"[final, chapter, hanzo, razor, trilog, provid,..."
5776,"I just saw this movie and all I can say is, wh...","[saw, movie, say, drive, ins, days, seems, lik...",-0.613243,-0.623733,-0.579433,0.0,"[saw, movi, say, drive, in, day, seem, like, w..."
24864,Cameron Diaz is a woman who is married to a ju...,"[Cameron, Diaz, woman, married, judge, played,...",-1.009520,-0.623733,-0.872274,0.0,"[cameron, diaz, woman, marri, judg, play, harv..."


Making Embeddings

When making the embeddings, we choose values of 4 for window and 1 for sg. 1 was chosen for sg to use SkipGram. The data has 40000 reviews, and even though this is sufficient to train a Word2Vec model, rare or domain-specific sentiment words may not be well captured by a CBOW. The window chosen was 4 because the sentiment is often localized in very few words.

In [76]:
reviews = list(X_train['Stem Words'])
model = Word2Vec(
    sentences=reviews,  # Tokenized data
    vector_size=100,             # Embedding dimensionality
    window=4,                    # Context window size
    min_count=2,                 # Minimum frequency for words to be included
    sg=1,                        # Use CBOW (set to 1 for Skip-Gram)
    workers=4,                   # Number of threads for faster training
    epochs=10                    # Number of passes over the data
)

# Save the model for future use
model.save("word2vec_cbow.model")


In [66]:
def transform_review_to_vector(review, model):
    word_vectors = []
    for word in review:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    if len(word_vectors) == 0:  # Handle case where no words are in vocabulary
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

Developping models: Embeddings, Features, Embeddings + Features. The architecture chosen starts at a high number of neurons, and then goes down gradually over the layers, so that it extracts more and more high level features. The data is balanced (same ratio for positives and negatives), so we use accuracy for model selection. The number of neurons for the model with only features was reduced (8 -> 4 -> 2)

In [67]:
X_train

Unnamed: 0,review,cleaned_review,NrPos,NrNeg,NrWords,Nos,Stem Words
11794,With no fault to the actors (they all put on g...,"[fault, actors, put, great, performances, over...",-0.349058,-0.255423,-0.481820,0.0,"[fault, actor, put, great, perform, overal, st..."
24925,The first thing I thought when I saw this film...,"[first, thing, thought, saw, films, really, fi...",-0.349058,-0.623733,-0.351669,0.0,"[first, thing, thought, saw, film, realli, fil..."
28578,Post-feminist depiction of cruelty and sadism....,"[Postfeminist, depiction, cruelty, sadismbr, b...",-0.216966,0.726737,0.516006,0.0,"[postfeminist, depict, cruelti, sadismbr, br, ..."
13987,OMG this is one of the worst films iv ever see...,"[OMG, one, worst, films, iv, ever, seen, iv, s...",-0.481151,0.235657,-0.481820,0.0,"[omg, one, worst, film, iv, ever, seen, iv, se..."
7693,"The Box is a film with great potential, but th...","[Box, film, great, potential, makers, totally,...",-0.216966,-0.623733,-0.557742,0.0,"[box, film, great, potenti, maker, total, misu..."
...,...,...,...,...,...,...,...
27517,This is one creepy underrated Gem with chillin...,"[one, creepy, underrated, Gem, chilling, perfo...",4.142082,1.095047,0.971536,0.0,"[one, creepi, underr, gem, chill, perform, fan..."
28392,The final chapter in the Hanzo the Razor trilo...,"[final, chapter, Hanzo, Razor, trilogy, provid...",0.311404,0.235657,0.049631,0.0,"[final, chapter, hanzo, razor, trilog, provid,..."
5776,"I just saw this movie and all I can say is, wh...","[saw, movie, say, drive, ins, days, seems, lik...",-0.613243,-0.623733,-0.579433,0.0,"[saw, movi, say, drive, in, day, seem, like, w..."
24864,Cameron Diaz is a woman who is married to a ju...,"[Cameron, Diaz, woman, married, judge, played,...",-1.009520,-0.623733,-0.872274,0.0,"[cameron, diaz, woman, marri, judg, play, harv..."


In [77]:
X_train.shape

(32000, 7)

In [83]:
def prepare_data(choice, data, cols, model):
    if choice["Embedding"] == 1:
      data["Embedded Data"] = data["Stem Words"].apply(lambda x : transform_review_to_vector(x, model))
      df_embeddings = pd.DataFrame(data['Embedded Data'].tolist(), index=data.index)
    else:
      df_embeddings = pd.DataFrame()

    if choice["Features"] == 1:
      df_features = data[cols]
    else:
      df_features = pd.DataFrame()

    final_df = pd.concat([df_embeddings, df_features], axis=1)
    return final_df.to_numpy()

choice_params = [{"Embedding" : 1, "Features" : 0}, {"Embedding" : 0, "Features" : 1}, {"Embedding" : 1, "Features" : 1}]
network_sizes = {(0, 1): (8, 4, 2)}
results = {}
for option in choice_params:
  X_train_opt = prepare_data(option, X_train, cols, model)
  X_val_opt = prepare_data(option, X_val, cols, model)
  X_test_opt = prepare_data(option, X_test, cols, model)


  # Input size depends on your feature representation (e.g., TF-IDF size or embedding size)
  input_dim = X_train_opt.shape[1]
  tuple_option = (option["Embedding"], option["Features"])
  if tuple_option in network_sizes.keys():
      # Define the FNN
    fnn = Sequential([
      Dense(network_sizes[tuple_option][0], activation='relu', input_shape=(input_dim,)),  # Input layer
      Dropout(0.5),  # Dropout for regularization
      Dense(network_sizes[tuple_option][1], activation='relu'),  # Hidden layer
      Dropout(0.4),
      Dense(network_sizes[tuple_option][2], activation = 'relu'),
      Dropout(0.3),
      Dense(1, activation='sigmoid')  # Output layer for binary classification
      ])
  else:
    fnn = Sequential([
      Dense(128, activation='relu', input_shape=(input_dim,)),  # Input layer
      Dropout(0.5),  # Dropout for regularization
      Dense(64, activation='relu'),  # Hidden layer
      Dropout(0.4),
      Dense(32, activation = 'relu'),
      Dropout(0.3),
      Dense(1, activation='sigmoid')  # Output layer for binary classification
      ])



      # Compile the model
  fnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
  history = fnn.fit(X_train_opt, y_train, validation_data=(X_val_opt, y_val), batch_size = 64, epochs=50, callbacks=[early_stopping])
  last_val_accuracy = history.history['val_accuracy'][-1]
  results[tuple_option] = last_val_accuracy

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6880 - loss: 0.5669 - val_accuracy: 0.8676 - val_loss: 0.3228
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8452 - loss: 0.3722 - val_accuracy: 0.8711 - val_loss: 0.3100
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8516 - loss: 0.3544 - val_accuracy: 0.8745 - val_loss: 0.3079
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8523 - loss: 0.3513 - val_accuracy: 0.8734 - val_loss: 0.3039
Epoch 5/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8600 - loss: 0.3409 - val_accuracy: 0.8716 - val_loss: 0.3106
Epoch 6/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8583 - loss: 0.3389 - val_accuracy: 0.8745 - val_loss: 0.2995
Epoch 7/50
[1m500/500[0m [32m━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4819 - loss: 0.7447 - val_accuracy: 0.5052 - val_loss: 0.6931
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4960 - loss: 0.6961 - val_accuracy: 0.5051 - val_loss: 0.6931
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4918 - loss: 0.6940 - val_accuracy: 0.4949 - val_loss: 0.6932
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5023 - loss: 0.6934 - val_accuracy: 0.4949 - val_loss: 0.6932
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7124 - loss: 0.5518 - val_accuracy: 0.8534 - val_loss: 0.3436
Epoch 2/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8379 - loss: 0.3843 - val_accuracy: 0.8681 - val_loss: 0.3130
Epoch 3/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8524 - loss: 0.3530 - val_accuracy: 0.8670 - val_loss: 0.3121
Epoch 4/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8533 - loss: 0.3485 - val_accuracy: 0.8719 - val_loss: 0.3062
Epoch 5/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8571 - loss: 0.3424 - val_accuracy: 0.8721 - val_loss: 0.3006
Epoch 6/50
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8630 - loss: 0.3315 - val_accuracy: 0.8754 - val_loss: 0.2981
Epoch 7/50
[1m500/500[0m [32m━━━━━━━

In [85]:
results

{(1, 0): 0.8679999709129333,
 (0, 1): 0.4948750138282776,
 (1, 1): 0.8711249828338623}

In [87]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [88]:
y_pred_probs = fnn.predict(X_test_opt)  # Predicted probabilities
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert probabilities to binary labels (0 or 1)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Precision: 0.8583058732312464
Recall: 0.8787457828934312
F1 Score: 0.8684055697195529
