## Data Collection

In [None]:
import nltk
from nltk.corpus import movie_reviews

In [None]:
# Corpus, sentence/word tokenizer models and the stop-word list, fetched in
# the same order as before.
for _resource in ('movie_reviews', 'punkt', 'stopwords'):
    nltk.download(_resource)

# WordNet is needed for lemmatization. It stays the final expression so the
# cell still displays this call's return value.
nltk.download('wordnet')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load movie reviews dataset
file_ids = movie_reviews.fileids()  # Get all document IDs

# Number of reviews sampled from each sentiment class.
N_REVIEWS_PER_CLASS = 5

# Pick the file IDs *before* reading any text, so only the sampled reviews
# are loaded from disk (previously every review was read and then discarded
# by the slice).
positive_ids = [file_id for file_id in file_ids if file_id.startswith('pos')][:N_REVIEWS_PER_CLASS]
negative_ids = [file_id for file_id in file_ids if file_id.startswith('neg')][:N_REVIEWS_PER_CLASS]

positive_reviews = [movie_reviews.raw(file_id) for file_id in positive_ids]
negative_reviews = [movie_reviews.raw(file_id) for file_id in negative_ids]

# Combine them into a dataset (positives first, then negatives)
dataset = positive_reviews + negative_reviews

# Derive the label counts from the lists themselves so `labels` always
# lines up with `dataset`, even if the sample size changes.
labels = ['positive'] * len(positive_reviews) + ['negative'] * len(negative_reviews)


In [None]:
# Sanity check: show the label list built for the sampled reviews
# (positives first, then negatives).
print(labels)

['positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative']


In [None]:
# Preview the opening 500 characters of the first review under a heading.
# A single print with a newline separator emits exactly the same text as
# printing the heading and the excerpt separately.
print("Raw Text Sample:\n", dataset[0][:500], sep="\n")


Raw Text Sample:

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject


## 1. Basic Preprocessing

In [None]:
import nltk
# 'punkt' was already requested in the Data Collection section, so this call
# only re-checks that the package is present.
nltk.download('punkt')
# NOTE(review): presumably the installed NLTK version's tokenizers also need
# the 'punkt_tab' resource — confirm against the NLTK version in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### 1.1. Tokenization

In [None]:
import nltk
import spacy
from transformers import AutoTokenizer

# spaCy's small English pipeline; loaded here and reused by later cells.
nlp = spacy.load("en_core_web_sm")

# Tokenizer shipped with the pretrained "bert-base-uncased" checkpoint.
# The variable is called `bpe_tokenizer`, but the subword step below treats
# it as WordPiece.
bpe_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# All demos run on a short excerpt: the first 500 characters of review 0.
sample_text = dataset[0][:500]

# 1. Word-level tokens via NLTK (show the first 20)
word_tokens = nltk.word_tokenize(sample_text)
print(f" Word Tokenization:\n {word_tokens[:20]}")

# 2. Sentence-level tokens via NLTK (show the first 3)
sent_tokens = nltk.sent_tokenize(sample_text)
print(f"\n Sentence Tokenization:\n {sent_tokens[:3]}")

# 3. Subword tokens via the BERT tokenizer (show the first 20)
subword_tokens = bpe_tokenizer.tokenize(sample_text)
print(f"\n Subword Tokenization:\n {subword_tokens[:20]}")


 Word Tokenization:
 ['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'re", 'about', 'superheroes', '(', 'batman', ',', 'superman']

 Sentence Tokenization:
 ["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .", "for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .", 'to say moore and campbell thoroughly researched the subject']

 Subword Tokenization:
 ['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superhero', '##es', '(', 'batman']


In [None]:
# Show the full 500-character excerpt that the tokenizers were run on.
print(sample_text)

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject


### 1.2. Stemming vs. Lemmatization

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# One Porter stemmer and one WordNet lemmatizer, reused for every token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Word tokens of the excerpt; `tokens` is reused by later cells.
tokens = nltk.word_tokenize(sample_text)

# Stemming: rule-based suffix stripping, applied token by token.
stemmed_words = list(map(stemmer.stem, tokens))
print(f"\n Stemmed Words: {stemmed_words[:20]}")  # first 20 stems

# Lemmatization: called without a part-of-speech argument, so each token is
# looked up with the lemmatizer's default POS.
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print(f"\n Lemmatized Words: {lemmatized_words[:20]}")  # first 20 lemmas



 Stemmed Words: ['film', 'adapt', 'from', 'comic', 'book', 'have', 'had', 'plenti', 'of', 'success', ',', 'whether', 'they', "'re", 'about', 'superhero', '(', 'batman', ',', 'superman']

 Lemmatized Words: ['film', 'adapted', 'from', 'comic', 'book', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'re", 'about', 'superheroes', '(', 'batman', ',', 'superman']


### 1.3. Removal of stop words, special characters, and punctuation

In [None]:
from nltk.corpus import stopwords

# English stop-word list as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not a stop word. Punctuation
# tokens are not in the list and therefore survive this step.
filtered_tokens = list(filter(lambda token: token.lower() not in stop_words, tokens))
print(f"\n Filtered Tokens (Stopwords removed): {filtered_tokens[:20]}")  # first 20 kept tokens



 Filtered Tokens (Stopwords removed): ['films', 'adapted', 'comic', 'books', 'plenty', 'success', ',', 'whether', "'re", 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'geared', 'toward']


In [None]:
import re

# Drop every character that is neither a word character nor whitespace.
# Apostrophes are removed too, so contractions collapse into one word.
clean_text = re.compile(r'[^\w\s]').sub('', sample_text)
print(f"\n Cleaned Text (No Punctuation): {clean_text[:100]}")  # first 100 characters



 Cleaned Text (No Punctuation): films adapted from comic books have had plenty of success  whether theyre about superheroes  batman 


In [None]:
# Lowercase the punctuation-free text, collapse each run of whitespace into
# a single space, and trim the ends, all in one expression.
cleaned_text = re.sub(r'\s+', ' ', clean_text.lower()).strip()
print(f"\n Cleaned Text (Lowercased and Whitespace Normalized): {cleaned_text[:100]}")  # first 100 characters



 Cleaned Text (Lowercased and Whitespace Normalized): films adapted from comic books have had plenty of success whether theyre about superheroes batman su


## 2. Context-Aware Preprocessing:

### 2.1 Sentence segmentation using nltk.sent_tokenize()

In [None]:
# Split the excerpt into sentences with NLTK's sentence tokenizer.
sent_tokens = nltk.sent_tokenize(sample_text)
print(f"\n Sentence Segmentation: {sent_tokens[:3]}")  # first 3 sentences



 Sentence Segmentation: ["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .", "for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .", 'to say moore and campbell thoroughly researched the subject']


### 2.2 Handling out-of-vocabulary (OOV) words with subword tokenization

In [None]:
# Break the excerpt into subword pieces with the BERT tokenizer loaded
# earlier. Words missing from its vocabulary are split into smaller known
# pieces, marked with a '##' prefix in the recorded output.
subword_tokens = bpe_tokenizer.tokenize(sample_text)
print(f"\n Subword Tokenization (WordPiece): {subword_tokens[:20]}")  # first 20 subwords



 Subword Tokenization (WordPiece): ['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superhero', '##es', '(', 'batman']


### 2.3 Named Entity Removal/Replacement

In [None]:
# Named Entity Recognition using spaCy
doc = nlp(sample_text)

# Extract named entities as (surface text, label) pairs
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("\n Named Entities:", entities)

# Replace each detected entity with a placeholder, using the entity's
# character offsets rather than str.replace(). A plain text replace would
# rewrite *every* occurrence of the entity string, including unrelated
# substrings (e.g. a CARDINAL such as '12' inside another number), and would
# redact text spaCy never tagged.
# Walking the entities right-to-left keeps the offsets of the not-yet-
# processed (earlier) entities valid while the string changes length.
anonymized_text = sample_text
for ent in reversed(doc.ents):
    anonymized_text = (
        anonymized_text[:ent.start_char] + '[REDACTED]' + anonymized_text[ent.end_char:]
    )

print("\n Anonymized Text (Named Entities Removed):", anonymized_text[:100])  # Print first 100 characters



 Named Entities: [('alan moore', 'PERSON'), ('eddie campbell', 'PERSON'), ("the mid '80s", 'DATE'), ('12', 'CARDINAL')]

 Anonymized Text (Named Entities Removed): films adapted from comic books have had plenty of success , whether they're about superheroes ( batm


## 3. Text Representation Techniques:

### 3.1. One-hot encoding vs. TF-IDF vectorization.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers are fit on a one-document corpus (the excerpt only), so
# each matrix has a single row.

# Binary bag-of-words: 1 for every vocabulary term present in the document.
one_hot_vectorizer = CountVectorizer(binary=True)
one_hot_matrix = one_hot_vectorizer.fit_transform([sample_text])
print("\n One-Hot Encoding Matrix:", one_hot_matrix.toarray(), sep="\n ")

# TF-IDF weights with the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([sample_text])
print("\n TF-IDF Encoding Matrix:", tfidf_matrix.toarray(), sep="\n ")



 One-Hot Encoding Matrix:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]

 TF-IDF Encoding Matrix:
 [[0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.18814417
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.18814417 0.09407209
  0.18814417 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.18814417 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.18814417 0.09407209 0.09407209 0.09407209 0.18814417
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.47036043 0.09407209 0.09407209 0.09407209 0.18814417
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209]]


In [None]:
from gensim.models import Word2Vec

# Tokenize the text into words for Word2Vec
tokens = nltk.word_tokenize(sample_text)

# Train a Word2Vec model on a one-"sentence" corpus (demonstration only;
# far too little data for meaningful embeddings).
# seed + workers=1 make the run repeatable: with several worker threads the
# training order depends on OS thread scheduling, so the printed vector
# changed on every run. (Full determinism across interpreter launches also
# requires a fixed PYTHONHASHSEED.)
model = Word2Vec([tokens], vector_size=50, window=3, min_count=1, workers=1, seed=42)

# Get the vector for a word that occurs in the training tokens
word_vector = model.wv['films']
print("\n Word2Vec Embedding for 'films':", word_vector)



 Word2Vec Embedding for 'films': [ 1.07737314e-02  1.96202751e-02 -1.41233625e-02 -1.15341647e-03
 -1.01341652e-02  1.28713986e-02  1.93710309e-02 -6.50823175e-04
  1.02639096e-02  1.30786821e-02  8.82290676e-03 -1.60772242e-02
  6.22067926e-03 -2.96727149e-03 -4.34686057e-03  1.08161494e-02
 -2.60245777e-03  1.98630840e-02 -1.26340566e-03  5.00693568e-04
 -4.53380123e-03 -1.70178674e-02 -1.95871275e-02  9.91012901e-03
 -1.32186934e-02  5.93876373e-03  5.54669602e-03 -1.39798019e-02
 -6.28188392e-03 -1.16151609e-02 -1.71332825e-02  7.85340017e-05
  1.95950866e-02 -1.33755216e-02 -8.73577408e-03  1.66702867e-02
  1.86194703e-02  1.96353644e-02 -1.97834000e-02  1.01065747e-02
 -1.32795414e-02  1.37526048e-02  1.23024751e-02 -6.75472431e-03
  1.38038322e-02 -1.57125741e-02  9.08832345e-03  9.13464371e-03
  1.18486714e-02  1.17653031e-02]


In [None]:
from gensim.models import FastText

# Train a skip-gram (sg=1) FastText model on the same tokenized text.
# seed + workers=1 keep the printed vector repeatable between runs; the
# default multi-threaded training makes results depend on thread scheduling.
fasttext_model = FastText(
    sentences=[tokens], vector_size=50, window=3, min_count=1, sg=1, workers=1, seed=42
)

# 'test' does not occur in the excerpt, so this lookup demonstrates
# FastText's out-of-vocabulary handling: the vector is assembled from
# character n-grams instead of raising a KeyError.
fasttext_word_vector = fasttext_model.wv['test']

# Output the FastText word vector for 'test'
print("\n FastText Embedding for 'test':", fasttext_word_vector)



 FastText Embedding for 'test': [ 0.0026845   0.00407108 -0.00730213 -0.00144425  0.00205896  0.00120375
  0.00349664 -0.00345134  0.00588176 -0.00809247 -0.00343208 -0.00291846
  0.00538609  0.00085953  0.00343121  0.00281992  0.00482496  0.00203102
  0.00488304 -0.00669601 -0.00048246 -0.0008616  -0.00328265  0.00180765
 -0.00228827  0.00096822  0.00112713 -0.00454078 -0.00231561  0.00171805
 -0.00447071  0.00194817  0.00053104  0.00274572  0.00465155  0.00819038
  0.00328887 -0.00212494  0.00147701 -0.00250585 -0.00107412 -0.00719092
 -0.00241617 -0.00178923 -0.00010647  0.00439559  0.00297278 -0.00907217
  0.0088592   0.00524733]


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-04-18 18:17:09--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-18 18:17:09--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-18 18:17:09--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Path to the extracted GloVe file. Relative to the working directory, which
# is where the download cell unzips the archive; the previous absolute
# '/content/...' path only worked when the notebook ran from /content.
glove_input_file = 'glove.6B.50d.txt'
word2vec_output_file = 'glove.6B.50d.w2vformat.txt'

# Convert GloVe to Word2Vec format (adds the "<count> <dim>" header line).
# NOTE(review): the recorded output echoes this call as a warning,
# presumably gensim's deprecation notice for glove2word2vec — confirm.
# Recent gensim can read GloVe text directly with
# KeyedVectors.load_word2vec_format(..., binary=False, no_header=True).
glove2word2vec(glove_input_file, word2vec_output_file)


  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 50)

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# Read the converted GloVe vectors (plain-text word2vec format, produced by
# the conversion cell) into a gensim KeyedVectors object.
glove_model = KeyedVectors.load_word2vec_format(
    'glove.6B.50d.w2vformat.txt',
    binary=False,
)

# Look up the pretrained 50-dimensional vector for the word 'test'.
glove_word_vector = glove_model['test']

# Show the vector.
print("\n GloVe Embedding for 'test':", glove_word_vector)



 GloVe Embedding for 'test': [ 0.13175  -0.25517  -0.067915  0.26193  -0.26155   0.23569   0.13077
 -0.011801  1.7659    0.20781   0.26198  -0.16428  -0.84642   0.020094
  0.070176  0.39778   0.15278  -0.20213  -1.6184   -0.54327  -0.17856
  0.53894   0.49868  -0.10171   0.66265  -1.7051    0.057193 -0.32405
 -0.66835   0.26654   2.842     0.26844  -0.59537  -0.5004    1.5199
  0.039641  1.6659    0.99758  -0.5597   -0.70493  -0.0309   -0.28302
 -0.13564   0.6429    0.41491   1.2362    0.76587   0.97798   0.58507
 -0.30176 ]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Re-take the 500-character excerpt from the first review.
sample_text = dataset[0][:500]

# Three-sentence toy corpus with one sentiment label per sentence.
# NOTE: this rebinds `labels`, which previously held the movie-review labels.
texts = [
    'I love this product!',
    'This is the worst purchase I have ever made.',
    'It is okay, not bad, but not great either.',
]
labels = ['positive', 'negative', 'neutral']

# TF-IDF with English stop words removed and a vocabulary cap of 1000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Hold out 30% of the toy corpus for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)

# Learn the vocabulary/IDF on the training split only, then reuse that fit
# to encode the held-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
from nltk.corpus import movie_reviews

# Switch from the toy corpus to the complete movie-review corpus: one raw
# text and one label per file ID. `texts` and `labels` are rebound here.
file_ids = movie_reviews.fileids()
texts = list(map(movie_reviews.raw, file_ids))

# A file ID starting with 'pos' marks a positive review; everything else is
# labelled negative.
labels = ['negative' if not file_id.startswith('pos') else 'positive' for file_id in file_ids]


## Classification

In [None]:
# These names were used below but never imported anywhere in the notebook,
# so the cell raised NameError on a fresh kernel (Restart & Run All).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

y = labels

# Split the *raw* texts first. Same test_size/random_state and the same
# number of samples as before, so the train/test membership is unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.3, random_state=42
)

# Fit TF-IDF on the training split only. Fitting on the whole corpus before
# splitting lets test-set vocabulary and document frequencies leak into the
# features, which inflates the reported scores.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Kept for any later cell that expects the full-corpus matrix `X`; encoded
# with the train-fitted vectorizer so no test information is learned.
X = vectorizer.transform(texts)

# Train and evaluate classifiers. random_state pins the estimators that use
# randomness internally so the reports are repeatable between runs.
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred))


 Classification Report for Naive Bayes:

              precision    recall  f1-score   support

    negative       0.79      0.83      0.81       302
    positive       0.82      0.78      0.80       298

    accuracy                           0.81       600
   macro avg       0.81      0.80      0.80       600
weighted avg       0.81      0.81      0.80       600


 Classification Report for SVM:

              precision    recall  f1-score   support

    negative       0.82      0.80      0.81       302
    positive       0.81      0.82      0.81       298

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.81       600
weighted avg       0.81      0.81      0.81       600


 Classification Report for Logistic Regression:

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82       302
    positive       0.82      0.82      0.82       298

    accuracy                           0.82       600
  