# Bag of words approach

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Load the datasets
X_test = pd.read_csv("data/Train_Test_splits/X_test_50proc_orig.csv")
X_train = pd.read_csv("data/Train_Test_splits/X_train_50proc_orig.csv")
y_test = pd.read_csv("data/Train_Test_splits/y_test_50proc.csv")
y_train = pd.read_csv("data/Train_Test_splits/y_train_50proc.csv")

y_test['sentiment'] = y_test['sentiment'].apply(lambda x: 1 if x == 'LABEL_1' else 0)
y_train['sentiment'] = y_train['sentiment'].apply(lambda x: 1 if x == 'LABEL_1' else 0)

In [23]:
len(X_train), len(X_test), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [36]:
X_train['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's  ' Love in the Time of Money '...
                               ...                        
24995    This movie was a real torture fest to sit thro...
24996    John Wayne & Albert Dekker compete for oil rig...
24997    Tarantino once remarked on a melodrama from th...
24998    Aah yes the workout show was a great. Not only...
24999    This film should have never been made. Honestl...
Name: review, Length: 25000, dtype: object

unigram bag of words approach:

In [37]:
# Create a tokenizer with a fixed vocabulary size
vocab_size = 10000  # Limit vocabulary size to 10,000 most common words
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")  # <OOV> for out-of-vocabulary words
tokenizer.fit_on_texts(X_train['review'])

In [42]:
# Convert reviews to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train['review'])
X_test_seq = tokenizer.texts_to_sequences(X_test['review'])

In [43]:
len(X_train_seq), len(X_test_seq)

(25000, 25000)

In [47]:
# Pad sequences to the same length (max length = 1000 for this dataset)
max_length = 1000
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [48]:
len(X_train_pad), len(X_test_pad)

(25000, 25000)

## RNN

In [49]:
# Define the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')  # Binary classification: output is 0 or 1
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

In [50]:
# Train the model
history = model.fit(
    X_train_pad, y_train['sentiment'],
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test['sentiment'])
)


Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 1s/step - accuracy: 0.4976 - loss: 0.6936 - val_accuracy: 0.5010 - val_loss: 0.6932
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 1s/step - accuracy: 0.5045 - loss: 0.6932 - val_accuracy: 0.5010 - val_loss: 0.6957
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 1s/step - accuracy: 0.5051 - loss: 0.6934 - val_accuracy: 0.4991 - val_loss: 0.6933
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 1s/step - accuracy: 0.5040 - loss: 0.6917 - val_accuracy: 0.4990 - val_loss: 0.6943
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 1s/step - accuracy: 0.5023 - loss: 0.6906 - val_accuracy: 0.4990 - val_loss: 0.6947


In [51]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 166ms/step - accuracy: 0.4924 - loss: 0.6950
Test Accuracy: 0.50


In [52]:
model.save('model.h5')

