In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Convert sentiment labels to binary values
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [5]:
# Split the dataset into training and testing sets
train_df = df[:40000]
test_df = df[40000:]

In [6]:
train_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [41]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_features = vectorizer.fit_transform(train_df['review'])
test_features = vectorizer.transform(test_df['review'])

In [42]:
# Train a Naive Bayes classifier on the training data
nb_classifier = MultinomialNB()
nb_classifier.fit(train_features, train_df['sentiment'])

In [43]:
# Evaluate the classifier on the testing data
predictions = nb_classifier.predict(test_features)
accuracy = accuracy_score(test_df['sentiment'], predictions)
print('Accuracy:', accuracy)

Accuracy: 0.8502


In [44]:
# Take a new input review as a string
new_review = "This movie was amazing! The acting was superb."

In [45]:
# Vectorize the new review using the trained vectorizer
new_features = vectorizer.transform([new_review])

In [46]:
# Predict the sentiment of the new review using the trained classifier
new_prediction = nb_classifier.predict(new_features)[0]

In [47]:
# Print the predicted sentiment (1 for positive and 0 for negative)
print('Predicted sentiment:', new_prediction)

Predicted sentiment: 1


Logistic Regression 

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [49]:
# Fit logistic regression model
lr = LogisticRegression()
lr.fit(train_features, train_df['sentiment'])

In [50]:
# Predict on test set
preds = lr.predict(test_features)

In [51]:
# Evaluate the model
acc = accuracy_score(test_df['sentiment'], preds)
print("Accuracy:", acc)

Accuracy: 0.8858


In [52]:
# Take a new input review as a string
new_review = "This movie was bad."

In [53]:
# Vectorize the new review using the trained vectorizer
new_features = vectorizer.transform([new_review])

In [54]:
# Predict the sentiment of the new review using the trained classifier
new_prediction = nb_classifier.predict(new_features)[0]

In [55]:
# Print the predicted sentiment (1 for positive and 0 for negative)
print('Predicted sentiment:', new_prediction)

Predicted sentiment: 0


Decision Trees

In [56]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
# Fit decision tree model
dt = DecisionTreeClassifier()
dt.fit(train_features, train_df['sentiment'])

In [58]:
# Predict on test set
preds = dt.predict(test_features)

In [59]:
# Evaluate the model
acc = accuracy_score(test_df['sentiment'], preds)
print("Accuracy:", acc)

Accuracy: 0.7206


In [60]:
# Take a new input review as a string
new_review = "This movie was perfect."

In [61]:
# Vectorize the new review using the trained vectorizer
new_features = vectorizer.transform([new_review])

In [62]:
# Predict the sentiment of the new review using the trained classifier
new_prediction = nb_classifier.predict(new_features)[0]

In [63]:
# Print the predicted sentiment (1 for positive and 0 for negative)
print('Predicted sentiment:', new_prediction)

Predicted sentiment: 1


Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
# Fit random forest model
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_features, train_df['sentiment'])

In [66]:
# Predict on test set
preds = rf.predict(test_features)

In [67]:
# Evaluate the model
acc = accuracy_score(test_df['sentiment'], preds)
print("Accuracy:", acc)

Accuracy: 0.8484


SVM

In [68]:
from sklearn.svm import SVC

In [69]:
# Train SVM model
svm = SVC(kernel='linear', C=1)
svm.fit(train_features, train_df['sentiment'])

In [70]:
# Evaluate SVM model
y_pred = svm.predict(test_features)
acc = accuracy_score(test_df['sentiment'], y_pred)
print("Accuracy:", acc)

Accuracy: 0.8818


RNN

In [71]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [72]:
# Set the hyperparameters
vocab_size = 10000
embedding_dim = 32
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
num_epochs = 10

In [73]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_df['review'])

In [74]:
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['review'])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [75]:
test_sequences = tokenizer.texts_to_sequences(test_df['review'])
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [76]:
# Define the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [77]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [78]:
# Train the model
history = model.fit(train_padded, train_df['sentiment'], epochs=num_epochs, 
                    validation_data=(test_padded, test_df['sentiment']), verbose=2)

Epoch 1/10
1250/1250 - 50s - loss: 0.4360 - accuracy: 0.7909 - val_loss: 0.3703 - val_accuracy: 0.8365 - 50s/epoch - 40ms/step
Epoch 2/10
1250/1250 - 17s - loss: 0.3132 - accuracy: 0.8695 - val_loss: 0.3941 - val_accuracy: 0.8237 - 17s/epoch - 13ms/step
Epoch 3/10
1250/1250 - 14s - loss: 0.2652 - accuracy: 0.8914 - val_loss: 0.3857 - val_accuracy: 0.8372 - 14s/epoch - 11ms/step
Epoch 4/10
1250/1250 - 14s - loss: 0.2101 - accuracy: 0.9172 - val_loss: 0.4104 - val_accuracy: 0.8292 - 14s/epoch - 11ms/step
Epoch 5/10
1250/1250 - 13s - loss: 0.1542 - accuracy: 0.9427 - val_loss: 0.5042 - val_accuracy: 0.8246 - 13s/epoch - 10ms/step
Epoch 6/10
1250/1250 - 15s - loss: 0.1077 - accuracy: 0.9617 - val_loss: 0.5604 - val_accuracy: 0.8190 - 15s/epoch - 12ms/step
Epoch 7/10
1250/1250 - 13s - loss: 0.0695 - accuracy: 0.9758 - val_loss: 0.6611 - val_accuracy: 0.8152 - 13s/epoch - 11ms/step
Epoch 8/10
1250/1250 - 13s - loss: 0.0486 - accuracy: 0.9841 - val_loss: 0.7291 - val_accuracy: 0.8091 - 13s/ep

In [79]:
# Evaluate the model on test data
test_loss, test_acc = model.evaluate(test_padded, test_df['sentiment'], verbose=2)
print('Test Accuracy:', test_acc)

313/313 - 1s - loss: 0.9364 - accuracy: 0.8243 - 1s/epoch - 4ms/step
Test Accuracy: 0.8242999911308289


In [80]:
# Make predictions on a new review
new_review = "I really enjoyed this movie! The acting was great."

In [81]:
# Tokenize the review
new_review_seq = tokenizer.texts_to_sequences([new_review])
new_review_data = pad_sequences(new_review_seq, maxlen=max_length)

In [82]:
# Predict the sentiment of the new review
prediction = model.predict(new_review_data)
if prediction > 0.5:
    print("Positive review")
else:
    print("Negative review")

Positive review


BERT

In [7]:
import tensorflow as tf
import numpy as np
import transformers

In [8]:
from tqdm import tqdm

In [9]:
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [10]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
df=pd.read_csv("IMDB Dataset.csv")

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [16]:
# Convert sentiment labels to binary values
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [18]:
train = df[:45000]
test = df[45000:]

In [19]:
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [20]:
# But first see BERT tokenizer exmaples and other required stuff!

example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
tokens=tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

['in', 'this', 'ka', '##ggle', 'notebook', ',', 'i', 'will', 'do', 'sentiment', 'analysis', 'using', 'bert', 'with', 'hugging', '##face']
[1999, 2023, 10556, 24679, 14960, 1010, 1045, 2097, 2079, 15792, 4106, 2478, 14324, 2007, 17662, 12172]


In [21]:
def convert_data_to_examples(train, test, review, sentiment): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1)
  
    return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  test, 'review',  'sentiment')

In [23]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,    # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label) )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

In [24]:
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

100%|██████████| 45000/45000 [03:23<00:00, 221.26it/s]


In [25]:
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

100%|██████████| 5000/5000 [00:21<00:00, 229.99it/s]


In [26]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f23416993c0>

In [27]:
pred_sentences = ['worst movie of my life, will never watch movies from this series', 'Wow, blew my mind, what a movie by Marvel, animation and story is amazing']


In [28]:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')   # we are tokenizing before sending into our trained model
tf_outputs = model(tf_batch)                                  
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)       # axis=-1, this means that the index that will be returned by argmax will be taken from the *last* axis.
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
    print(pred_sentences[i], ": ", labels[label[i]])


worst movie of my life, will never watch movies from this series :  Negative
Wow, blew my mind, what a movie by Marvel, animation and story is amazing :  Positive


In [29]:
# Evaluate the model on the validation set
loss, accuracy = model.evaluate(validation_data)
print('Validation accuracy:', accuracy)

Validation accuracy: 0.90420001745224
