# Simple Sentiment Analyser

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [3]:
nltk.download('movie_reviews')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load IMDB Movie Reviews dataset from NLTK
movie_reviews_data = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

# Extract texts and labels
texts, labels = zip(*movie_reviews_data)

# Convert labels to binary (0 for negative, 1 for positive)
labels = [0 if label == 'neg' else 1 for label in labels] # Please complete this

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [4]:
labels[0:15]
len(labels)
labels[-15:-1]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [5]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        doc = nlp(text)
        # Average word vectors to get document vector
        doc_vector = np.mean([token.vector for token in doc], axis=0) # Please complete this
        embeddings.append(doc_vector)
    return np.array(embeddings)

# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [03:57<00:00,  6.75it/s]
100%|██████████| 400/400 [00:57<00:00,  6.99it/s]


In [6]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
model = make_pipeline(TruncatedSVD(n_components=73), SVC())

# Fit the model on training data
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7


#More accurate one

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [19]:
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        # Tokenize the text
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Get BERT model output
        with torch.no_grad():
            model_output = bert_model(**tokens)

        # Use mean pooling to get sentence embeddings
        # doc_vector = # Please complete this
        # embeddings.append(doc_vector)
        # doc_vector = torch.mean(model_output.last_hidden_state, dim=1)
        # embeddings.append(doc_vector.numpy())
        doc_vector = model_output.last_hidden_state.mean(dim=1).squeeze()
        embeddings.append(doc_vector)
    return np.array(embeddings)


# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [53:37<00:00,  2.01s/it]
100%|██████████| 400/400 [12:53<00:00,  1.93s/it]


In [18]:
X_train_embeddings

array([], dtype=float64)

In [20]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
model = make_pipeline(TruncatedSVD(n_components= 77, random_state = 42), SVC())

# Fit the model on training data
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.795


In [21]:
# in a loop with step to 100 check the accuracy of the model for different n_components values
for n in range(100, 760, 10):
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())
    model.fit(X_train_embeddings, y_train)
    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(f"Accuracy for n_components={n}: {accuracy}")

Accuracy for n_components=100: 0.8025
Accuracy for n_components=110: 0.795
Accuracy for n_components=120: 0.7975
Accuracy for n_components=130: 0.8
Accuracy for n_components=140: 0.795
Accuracy for n_components=150: 0.7925
Accuracy for n_components=160: 0.7975
Accuracy for n_components=170: 0.7925
Accuracy for n_components=180: 0.7975
Accuracy for n_components=190: 0.795
Accuracy for n_components=200: 0.7925
Accuracy for n_components=210: 0.7975
Accuracy for n_components=220: 0.795
Accuracy for n_components=230: 0.7925
Accuracy for n_components=240: 0.795
Accuracy for n_components=250: 0.7925
Accuracy for n_components=260: 0.7975
Accuracy for n_components=270: 0.7975
Accuracy for n_components=280: 0.7975
Accuracy for n_components=290: 0.7975
Accuracy for n_components=300: 0.8
Accuracy for n_components=310: 0.8
Accuracy for n_components=320: 0.8
Accuracy for n_components=330: 0.8
Accuracy for n_components=340: 0.8
Accuracy for n_components=350: 0.8
Accuracy for n_components=360: 0.8025
