# Simple Sentiment Analyser

In [5]:
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
from sklearn.decomposition import TruncatedSVD # function for dimensionality reduction
from sklearn.feature_extraction.text import CountVectorizer # function for converting text data to vectors
from sklearn.pipeline import make_pipeline # function for creating a pipeline
from sklearn.svm import SVC # function for Support Vector Machine (SVM) model
from sklearn.metrics import accuracy_score # function for model evaluation
from nltk.corpus import movie_reviews # sample text data
from tqdm import tqdm # function for showing progress bar
import nltk # NLP toolbox
import spacy # NLP toolbox

In [6]:
nltk.download('movie_reviews') # Download the IMDB Movie Reviews dataset

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load IMDB Movie Reviews dataset from NLTK
movie_reviews_data = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories()
                        for fileid in movie_reviews.fileids(category)] # List of tuples (text, label)

# Extract texts and labels
texts, labels = zip(*movie_reviews_data) # Unzip the list of tuples into two lists

# Convert labels to binary (0 for negative, 1 for positive)
labels = np.array([0 if label == 'neg' else 1 for label in labels]) # Convert labels to binary

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
labels[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
len(labels)

2000

In [None]:
labels[-11:-1]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
doc[0].dtype

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'dtype'

In [None]:
# nlp = spacy.load("en_core_web_sm")


text = X_train[0]
doc = nlp(text)
print("doc[0] is:", doc[0])
print("doc[0].has_vector is:", doc[0].has_vector)
print("doc[0].vector is:", doc[0].vector)

doc[0] is: while
doc[0].has_vector is: True
doc[0].vector is: [-1.8903863e-01 -1.0919725e+00  5.9464866e-01 -1.3465793e+00
 -6.4634943e-01 -8.8092428e-01  8.1322885e-01 -2.3720935e-01
  1.6152266e-01  5.5341655e-01  3.9680359e-01 -1.4818543e-01
 -1.3492306e+00 -3.4411907e-01 -2.5211111e-01 -8.5277212e-01
 -9.6737391e-01  1.4454012e+00  1.1829404e+00 -5.7727218e-01
  1.3045775e+00  8.2723874e-01  1.2261688e+00 -8.7188470e-01
  3.6476345e+00  6.2565506e-01  1.8027409e+00  3.5740709e-01
  2.4463528e-01  1.8441626e-01  2.8450093e-01 -6.8652952e-01
  6.7287606e-01 -1.1027163e-01 -3.7379611e-01 -1.5272062e+00
 -1.8876302e-01  7.3501773e-02  1.7454055e-01  6.9636816e-01
  3.3539245e-01  1.2451193e-01 -5.5614018e-01 -9.4686216e-01
  1.7567501e+00 -1.1334559e-01 -4.5120013e-01 -1.8191037e+00
  3.8229850e-01  1.3238903e+00 -7.8471780e-01  6.7317271e-01
  3.9045537e-01 -1.6134107e+00 -4.9348125e-01 -1.6859490e-01
  9.2342418e-01  3.4502125e-01  2.4862409e+00  6.8899655e-01
 -7.4190170e-01  7.3966

In [None]:
text = X_train[0]
print("X_train[0] text is: ", text)
print()
doc = nlp(text)
print("doc is: ", doc)
print()
doc_vector = np.mean([word.vector for word in doc if word.has_vector], axis=0)
print("doc_vector is: ", doc_vector)
print()

X_train[0] text is:  while watching loser , it occurred to me that amy heckerling's true genius as a film-maker is casting . 
in fast times at ridgemont high , she gave us sean penn's jeff spicoli ; in look who's talking , she turned bruce willis into a wise-cracking baby and provided john travolta with is first career revival ; in clueless , she found a star vehicle for the adorableness that is ( or was ) alicia silverstone . 
she seems to understand instinctively how to find performers the audience will like in spite of their flaws . 
unfortunately , she may also be starting to understand that she understands . 
giving appealing actors an appealing script creates likeable movies . 
giving appealing actors a script in which their appeal _is_ the movie makes for unexpectedly awful films like loser . 
naturally , heckerling makes her protagonist an all-around swell guy . 
paul tannek ( jason biggs ) is a small-town boy who gets a scholarship to nyu , then instantly finds himself an isla

In [None]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        doc = nlp(text) # Process text with spaCy nlp pipeline to get a Doc object
        # Average word vectors to get document vector
        doc_vector = np.mean([word.vector for word in doc if word.has_vector], axis=0)
        embeddings.append(doc_vector) # Append document vector to list
    return np.array(embeddings) # Return document vectors as numpy array

# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [04:38<00:00,  5.74it/s]
100%|██████████| 400/400 [00:59<00:00,  6.72it/s]


In [None]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
model = make_pipeline(TruncatedSVD(n_components = 77, random_state = 42), SVC()) # 96 -> n_components
# In linear algebra, the singular value decomposition (SVD) is a factorization of a real or complex
# matrix into a rotation, followed by a rescaling followed by another rotation. It generalizes the
# eigendecomposition of a square normal matrix with an orthonormal eigenbasis to any. matrix.

# Fit the model on training data
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6975


In [None]:
print("y_test[:10]:")
print(y_test[:10])
print()
print("predictions[:10]:")
print(predictions[:10])

y_test[:10]:
[1 0 1 0 1 1 0 1 0 1]

predictions[:10]:
[0 1 1 1 0 1 0 0 0 1]


# More accurate one

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD # TruncatedSVD is a dimensionality reduction technique
from sklearn.pipeline import make_pipeline # make_pipeline is a utility function to create a pipeline
from sklearn.svm import SVC # SVC is the support vector classifier
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews # Import the IMDB Movie Reviews dataset
from transformers import AutoTokenizer, AutoModel # Import the AutoTokenizer and AutoModel classes from the transformers library
import torch

In [2]:
# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        # Tokenize the text
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Get BERT model output
        with torch.no_grad():
            model_output = bert_model(**tokens)

        # Use mean pooling to get sentence embeddings
        # doc_vector = # Please complete this
        doc_vector = model_output.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(doc_vector)

    return np.array(embeddings)


# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [42:31<00:00,  1.59s/it]
100%|██████████| 400/400 [10:01<00:00,  1.50s/it]


In [8]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
model = make_pipeline(TruncatedSVD(n_components= 77, random_state = 42), SVC())

# Fit the model on training data
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.795


In [10]:
# in a loop with step to 100 check the accuracy of the model for different n_components values
accuracies = []
for n in range(1, 100):
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())
    model.fit(X_train_embeddings, y_train)
    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(f"Accuracy for n_components={n}: {accuracy}")

Accuracy for n_components=1: 0.5275
Accuracy for n_components=2: 0.6025
Accuracy for n_components=3: 0.6075
Accuracy for n_components=4: 0.6725
Accuracy for n_components=5: 0.6575
Accuracy for n_components=6: 0.6675
Accuracy for n_components=7: 0.6675
Accuracy for n_components=8: 0.6725
Accuracy for n_components=9: 0.6525
Accuracy for n_components=10: 0.6475
Accuracy for n_components=11: 0.64
Accuracy for n_components=12: 0.65
Accuracy for n_components=13: 0.6475
Accuracy for n_components=14: 0.6425
Accuracy for n_components=15: 0.6525
Accuracy for n_components=16: 0.665
Accuracy for n_components=17: 0.665
Accuracy for n_components=18: 0.6775
Accuracy for n_components=19: 0.675
Accuracy for n_components=20: 0.6975
Accuracy for n_components=21: 0.7125
Accuracy for n_components=22: 0.7275
Accuracy for n_components=23: 0.725
Accuracy for n_components=24: 0.7325
Accuracy for n_components=25: 0.7375
Accuracy for n_components=26: 0.7475
Accuracy for n_components=27: 0.74
Accuracy for n_compo

In [11]:
X_test_embeddings.shape

(400, 768)

In [None]:
# in a loop with step to 100 check the accuracy of the model for different n_components values
for n in range(100, 760, 10):
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())
    model.fit(X_train_embeddings, y_train)
    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(f"Accuracy for n_components={n}: {accuracy}")

Accuracy for n_components=100: 0.8025
Accuracy for n_components=110: 0.795
Accuracy for n_components=120: 0.7975
Accuracy for n_components=130: 0.8
Accuracy for n_components=140: 0.795
Accuracy for n_components=150: 0.7925
Accuracy for n_components=160: 0.7975
Accuracy for n_components=170: 0.7925
Accuracy for n_components=180: 0.7975
Accuracy for n_components=190: 0.795
Accuracy for n_components=200: 0.7925
Accuracy for n_components=210: 0.7975
Accuracy for n_components=220: 0.795
Accuracy for n_components=230: 0.7925
Accuracy for n_components=240: 0.795
Accuracy for n_components=250: 0.7925
Accuracy for n_components=260: 0.7975
Accuracy for n_components=270: 0.7975
Accuracy for n_components=280: 0.7975
Accuracy for n_components=290: 0.7975
Accuracy for n_components=300: 0.8
Accuracy for n_components=310: 0.8
Accuracy for n_components=320: 0.8
Accuracy for n_components=330: 0.8
Accuracy for n_components=340: 0.8
Accuracy for n_components=350: 0.8
Accuracy for n_components=360: 0.8025


In [13]:
# in a loop with step to 100 check the accuracy of the model for different n_components values
for n in range(100, 760, 10):
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())
    model.fit(X_train_embeddings, y_train)
    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(f"Accuracy for n_components={n}: {accuracy}")

Accuracy for n_components=100: 0.8025
Accuracy for n_components=110: 0.795
Accuracy for n_components=120: 0.7975
Accuracy for n_components=130: 0.8
Accuracy for n_components=140: 0.795
Accuracy for n_components=150: 0.7925
Accuracy for n_components=160: 0.7975
Accuracy for n_components=170: 0.7925
Accuracy for n_components=180: 0.7975
Accuracy for n_components=190: 0.795
Accuracy for n_components=200: 0.7925
Accuracy for n_components=210: 0.7975
Accuracy for n_components=220: 0.795
Accuracy for n_components=230: 0.7925
Accuracy for n_components=240: 0.795
Accuracy for n_components=250: 0.7925
Accuracy for n_components=260: 0.7975
Accuracy for n_components=270: 0.7975
Accuracy for n_components=280: 0.7975
Accuracy for n_components=290: 0.7975
Accuracy for n_components=300: 0.8
Accuracy for n_components=310: 0.8
Accuracy for n_components=320: 0.8
Accuracy for n_components=330: 0.8
Accuracy for n_components=340: 0.8
Accuracy for n_components=350: 0.8
Accuracy for n_components=360: 0.8025


In [14]:
# in a loop with step to 100 check the accuracy of the model for different n_components values
for n in range(360, 401, 1):
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())
    model.fit(X_train_embeddings, y_train)
    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)
    print(f"Accuracy for n_components={n}: {accuracy}")

Accuracy for n_components=360: 0.8025
Accuracy for n_components=361: 0.8
Accuracy for n_components=362: 0.8
Accuracy for n_components=363: 0.8
Accuracy for n_components=364: 0.8
Accuracy for n_components=365: 0.8025
Accuracy for n_components=366: 0.8
Accuracy for n_components=367: 0.8
Accuracy for n_components=368: 0.8025
Accuracy for n_components=369: 0.8025
Accuracy for n_components=370: 0.8
Accuracy for n_components=371: 0.8
Accuracy for n_components=372: 0.8025
Accuracy for n_components=373: 0.8
Accuracy for n_components=374: 0.8025
Accuracy for n_components=375: 0.8
Accuracy for n_components=376: 0.8025
Accuracy for n_components=377: 0.8025
Accuracy for n_components=378: 0.8
Accuracy for n_components=379: 0.8
Accuracy for n_components=380: 0.8025
Accuracy for n_components=381: 0.8025
Accuracy for n_components=382: 0.8025
Accuracy for n_components=383: 0.8025
Accuracy for n_components=384: 0.8025
Accuracy for n_components=385: 0.8025
Accuracy for n_components=386: 0.8
Accuracy for 

In [19]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
model = make_pipeline(TruncatedSVD(n_components= 91, random_state = 42), SVC())

# Fit the model on training data
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Best Accuracy found: {accuracy}")

Best Accuracy found: 0.8075
