In [1]:
!pip install nltk transformers scikit-learn pandas --quiet

In [2]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [3]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [5]:
df  = pd.read_csv("/content/IMDB_Dataset.csv")

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.count()

Unnamed: 0,0
review,50000
sentiment,50000


In [8]:
analyzer = SentimentIntensityAnalyzer()

In [9]:
def get_vader_score(text):
    vs = analyzer.polarity_scores(text)
    return vs['compound']

In [10]:
def classify_vader_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [11]:
df_small = df.head(2000).copy()

In [12]:
df_small['vader_score'] = df_small['review'].apply(get_vader_score)
df_small['vader_sentiment'] = df_small['vader_score'].apply(classify_vader_sentiment)

In [13]:
print("\nDataFrame with VADER Sentiment Scores and Classifications:")
print(df_small[['review', 'sentiment', 'vader_score', 'vader_sentiment']].head())


DataFrame with VADER Sentiment Scores and Classifications:
                                              review sentiment  vader_score  \
0  One of the other reviewers has mentioned that ...  positive      -0.9951   
1  A wonderful little production. <br /><br />The...  positive       0.9641   
2  I thought this was a wonderful way to spend ti...  positive       0.9605   
3  Basically there's a family where a little boy ...  negative      -0.9213   
4  Petter Mattei's "Love in the Time of Money" is...  positive       0.9744   

  vader_sentiment  
0        negative  
1        positive  
2        positive  
3        negative  
4        positive  


In [14]:
df_small['vader_match'] = (df_small['sentiment'] == df_small['vader_sentiment'])
vader_accuracy = df_small['vader_match'].mean()
print(f"\nPercentage of VADER sentiments matching original sentiments: {vader_accuracy:.2f}%")


Percentage of VADER sentiments matching original sentiments: 0.71%


In [15]:
df_bert = df.head(2000)

In [16]:
df_bert.count()

Unnamed: 0,0
review,2000
sentiment,2000


In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [18]:
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().flatten()

In [19]:
print("Generating BERT embeddings for reviews...")
df_small['bert_embeddings'] = df_small['review'].apply(get_bert_embeddings)
print("BERT embeddings generated.")

Generating BERT embeddings for reviews...
BERT embeddings generated.


In [20]:
print("\nSample BERT Embeddings (shape of the first 5 reviews' embeddings):")
for i, embedding in enumerate(df_small['bert_embeddings'].head()):
    print(f"Review {i+1} embedding shape: {embedding.shape}")


Sample BERT Embeddings (shape of the first 5 reviews' embeddings):
Review 1 embedding shape: (768,)
Review 2 embedding shape: (768,)
Review 3 embedding shape: (768,)
Review 4 embedding shape: (768,)
Review 5 embedding shape: (768,)


In [21]:
X = np.array(df_small['bert_embeddings'].tolist())
# Encode sentiment: 'positive' as 1, 'negative' as 0
y = df_small['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
print("\nTraining Logistic Regression model...")
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
print("Logistic Regression model trained.")



Training Logistic Regression model...
Logistic Regression model trained.


In [24]:
y_pred = logistic_model.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"\nLogistic Regression Accuracy: {accuracy:.2f}")
print("\nLogistic Regression Classification Report:")
print(class_report)


Logistic Regression Accuracy: 0.84

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       195
           1       0.84      0.86      0.85       205

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400



In [26]:
df_small['logistic_regression_sentiment'] = ['positive' if pred == 1 else 'negative' for pred in logistic_model.predict(X)]

In [27]:
print("\n--- Comparison of Sentiment Analysis Results ---")
final_results_df = df_small[['review', 'sentiment', 'vader_sentiment', 'logistic_regression_sentiment']].copy()
print(final_results_df.head())


--- Comparison of Sentiment Analysis Results ---
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

  vader_sentiment logistic_regression_sentiment  
0        negative                      positive  
1        positive                      positive  
2        positive                      positive  
3        negative                      negative  
4        positive                      positive  
