<a href="https://colab.research.google.com/github/AliMadany/Deep-Learning-In-Computer-Vision-Models/blob/main/Na%C3%AFve_Bayes_Classifier_for_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing


In [None]:
import os
import glob
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer data that NLTK's word_tokenize relies on
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download the IMDB review archive and unpack it into the working directory;
# later cells read the reviews from aclImdb/train and aclImdb/test.
!wget -q --show-progress https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz




# Load IMDB dataset from the specified directory.
    
* Returns lists of review texts and corresponding labels (1 = Positive, 0 = Negative).


In [None]:
def load_imdb_data(directory):
    """Read every review under ``directory/pos`` and ``directory/neg``.

    Args:
        directory: Folder that contains ``pos`` and ``neg`` sub-folders of
            UTF-8 encoded ``*.txt`` review files.

    Returns:
        A ``(data, labels)`` tuple of parallel lists: the review texts and
        their labels (1 = positive, 0 = negative). Positive reviews come
        first, then negative ones; within a class, files are read in sorted
        path order so the result is the same on every run.
    """
    data = []
    labels = []

    for label_name, label_value in (("pos", 1), ("neg", 0)):
        pattern = os.path.join(directory, label_name, "*.txt")
        # glob yields paths in filesystem-dependent order; sort for reproducibility.
        for file_path in sorted(glob.glob(pattern)):
            with open(file_path, "r", encoding="utf-8") as f:
                data.append(f.read())
            labels.append(label_value)

    return data, labels


## Load training and test datasets

In [None]:
# Read the train and test folders of the extracted dataset into parallel
# lists of review texts and 0/1 labels.
train_data, train_labels = load_imdb_data("aclImdb/train")
test_data, test_labels = load_imdb_data("aclImdb/test")

# Sanity check: report how many reviews each split contains.
print(f"Loaded {len(train_data)} training samples and {len(test_data)} test samples.")


Loaded 25000 training samples and 25000 test samples.


In [None]:
# Wrap the raw texts and labels in DataFrames with columns "review" and "label"

train_df = pd.DataFrame({"review": train_data, "label": train_labels})
test_df = pd.DataFrame({"review": test_data, "label": test_labels})

# Show the last rows of the training frame
train_df.tail()

Unnamed: 0,review,label
24995,Having the In-Laws over for the weekend? Then ...,0
24996,...when he remade Broadway BILL (1934) as RIDI...,0
24997,I saw this movie years ago in a group traditio...,0
24998,"A strange mix of traditional-80s, smartassy, C...",0
24999,"Val Kilmer, solid performance. Dylan McDermott...",0


# Text tokenization and preprocessing

In [None]:
# additional NLP resources
!pip install --upgrade --no-cache-dir nltk
import nltk
nltk.download("punkt")
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
def tokenize_text(text):
    """Lowercase ``text`` and split it into tokens with ``word_tokenize``."""
    lowered_text = text.lower()
    return word_tokenize(lowered_text)


# Tokenize every review in both splits; the token lists go into a new "tokens" column
train_df["tokens"] = train_df["review"].apply(tokenize_text)
test_df["tokens"] = test_df["review"].apply(tokenize_text)

# Show the last rows to check the new column
train_df.tail()


Unnamed: 0,review,label,tokens
24995,Having the In-Laws over for the weekend? Then ...,0,"[having, the, in-laws, over, for, the, weekend..."
24996,...when he remade Broadway BILL (1934) as RIDI...,0,"[..., when, he, remade, broadway, bill, (, 193..."
24997,I saw this movie years ago in a group traditio...,0,"[i, saw, this, movie, years, ago, in, a, group..."
24998,"A strange mix of traditional-80s, smartassy, C...",0,"[a, strange, mix, of, traditional-80s, ,, smar..."
24999,"Val Kilmer, solid performance. Dylan McDermott...",0,"[val, kilmer, ,, solid, performance, ., dylan,..."


# Stopword Removal

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's stop-word corpus and keep the English list as a set, which
# remove_stopwords uses for membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``."""
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from both splits; results go into a new "filtered_tokens" column
train_df["filtered_tokens"] = train_df["tokens"].apply(remove_stopwords)
test_df["filtered_tokens"] = test_df["tokens"].apply(remove_stopwords)

# Show the first rows to compare "tokens" with "filtered_tokens"
train_df.head()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,label,tokens,filtered_tokens
0,The key to The 40-Year-Old Virgin is not merel...,1,"[the, key, to, the, 40-year-old, virgin, is, n...","[key, 40-year-old, virgin, merely, andy, stitz..."
1,I was pretty surprised with this flick. Even t...,1,"[i, was, pretty, surprised, with, this, flick,...","[pretty, surprised, flick, ., even, though, bu..."
2,"Well, some people would say that this particul...",1,"[well, ,, some, people, would, say, that, this...","[well, ,, people, would, say, particular, movi..."
3,"Not wishing to give *anything* away here, I wo...",1,"[not, wishing, to, give, *, anything, *, away,...","[wishing, give, *, anything, *, away, ,, would..."
4,"In a time of magic, barbarians and demons abou...",1,"[in, a, time, of, magic, ,, barbarians, and, d...","[time, magic, ,, barbarians, demons, abound, d..."


# Cleaning Text data

In [None]:
def to_lowercase(tokens):
    """Return a new list containing the lowercase form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# Lowercase the filtered tokens in place of the old column.
# NOTE(review): tokenize_text already lowercases the text before tokenizing,
# so this presumably acts only as a safeguard.
train_df["filtered_tokens"] = train_df["filtered_tokens"].apply(to_lowercase)
test_df["filtered_tokens"] = test_df["filtered_tokens"].apply(to_lowercase)

train_df.head()

Unnamed: 0,review,label,tokens,filtered_tokens
0,The key to The 40-Year-Old Virgin is not merel...,1,"[the, key, to, the, 40-year-old, virgin, is, n...","[key, 40-year-old, virgin, merely, andy, stitz..."
1,I was pretty surprised with this flick. Even t...,1,"[i, was, pretty, surprised, with, this, flick,...","[pretty, surprised, flick, ., even, though, bu..."
2,"Well, some people would say that this particul...",1,"[well, ,, some, people, would, say, that, this...","[well, ,, people, would, say, particular, movi..."
3,"Not wishing to give *anything* away here, I wo...",1,"[not, wishing, to, give, *, anything, *, away,...","[wishing, give, *, anything, *, away, ,, would..."
4,"In a time of magic, barbarians and demons abou...",1,"[in, a, time, of, magic, ,, barbarians, and, d...","[time, magic, ,, barbarians, demons, abound, d..."


In [None]:
# Keep only the columns the classifier needs. .copy() makes each result an
# independent frame, so the column assignments in later cells do not raise
# pandas' SettingWithCopyWarning.
train_df = train_df[["filtered_tokens", "label"]].copy()
test_df = test_df[["filtered_tokens", "label"]].copy()

train_df.head()

Unnamed: 0,filtered_tokens,label
0,"[key, 40-year-old, virgin, merely, andy, stitz...",1
1,"[pretty, surprised, flick, ., even, though, bu...",1
2,"[well, ,, people, would, say, particular, movi...",1
3,"[wishing, give, *, anything, *, away, ,, would...",1
4,"[time, magic, ,, barbarians, demons, abound, d...",1


In [None]:
def remove_non_alpha(tokens):
    """Strip every non-ASCII-letter character from each token.

    Tokens that become empty after stripping (pure punctuation, numbers) are
    dropped. The substitution runs once per token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of cleaned, non-empty tokens in their original order.
    """
    cleaned_tokens = []
    for word in tokens:
        cleaned = re.sub(r'[^a-zA-Z]', '', word)
        if cleaned != '':
            cleaned_tokens.append(cleaned)
    return cleaned_tokens

# Strip non-letter characters from every token and drop tokens left empty
train_df["filtered_tokens"] = train_df["filtered_tokens"].apply(remove_non_alpha)
test_df["filtered_tokens"] = test_df["filtered_tokens"].apply(remove_non_alpha)

In [None]:
# Preview the cleaned token lists of the first training rows
print(train_df["filtered_tokens"].head())

0    [key, yearold, virgin, merely, andy, stitzer, ...
1    [pretty, surprised, flick, even, though, budje...
2    [well, people, would, say, particular, movie, ...
3    [wishing, give, anything, away, would, say, te...
4    [time, magic, barbarians, demons, abound, diab...
Name: filtered_tokens, dtype: object


In [None]:
# Print the first ten training rows (tokens plus a readable label) for a
# manual check of the preprocessing result.
for i in range(10):
    print(f"Sample {i+1}:")
    print("Preprocessed Tokens:", train_df["filtered_tokens"].iloc[i])
    # Labels are stored as 1 (positive) / 0 (negative)
    print("Label:", "Positive" if train_df["label"].iloc[i] == 1 else "Negative")
    print("=" * 80)

Sample 1:
Preprocessed Tokens: ['key', 'yearold', 'virgin', 'merely', 'andy', 'stitzer', 'yearold', 'virgin', 'rather', 'manner', 'steve', 'carell', 'presents', 'one', 'genre', 'crass', 'comedy', 'become', 'typified', 'lack', 'humor', 'engaging', 'characters', 'yearold', 'virgin', 'offers', 'colorful', 'cast', 'intelligent', 'heartfelt', 'script', 'nt', 'use', 'protagonist', 'buttend', 'cruel', 'jokes', 'andy', 'still', 'virgin', 'forty', 'years', 'old', 'much', 'joke', 'fact', 'curiosity', 'br', 'br', 'carell', 'veteran', 'team', 'ferrell', 'anchorman', 'exdaily', 'show', 'castmember', 'uses', 'concept', 'film', 'expand', 'character', 'get', 'understand', 'andy', 'way', 's', 'little', 'things', 'make', 'film', 'work', 'andy', 's', 'coworker', 'electronics', 'store', 'asks', 'weekend', 'andy', 'describes', 'failed', 'efforts', 'cooking', 'andy', 'rides', 'bike', 'work', 'signals', 'turns', 'nt', 'adorn', 'home', 'action', 'figures', 'paints', 'talks', 'reveals', 'really', 'old', 'ones'

# Exploring negative reviews

In [None]:
# Select the training rows labelled 0 (negative)
negative_samples = train_df[train_df["label"] == 0]

# Print up to five of them; min() guards against having fewer than five rows
for i in range(min(5, len(negative_samples))):
    print(f"Negative Sample {i+1}:")
    print("Preprocessed Tokens:", negative_samples["filtered_tokens"].iloc[i])
    print("=" * 80)

Negative Sample 1:
Preprocessed Tokens: ['finish', 'watching', 'movie', 'start', 'regardless', 'bad', 'movie', 'agonizing', 'sit', 'sparkling', 'bullets', 'reporter', 'ninja', 'like', 'moves', 'way', 'bad', 'guys', 'shoot', 'hundreds', 'hundreds', 'bullets', 'seem', 'hit', 'innocent', 'bystanders', 'predictable', 'outcome', 'bad', 'acting', 'horrible', 'like', 'girl', 'finds', 'reporter', 'friends', 'apartment', 'goes', 'heck', 'holding', 'bat', 'hey', 're', 'cute', 'wan', 'na', 'like', 'seconds', 'bad', 'save', 'hour', 'forty', 'minutes', 'go', 'play', 'kids', 'dog']
Negative Sample 2:
Preprocessed Tokens: ['poorly', 'written', 'script', 'likeable', 'characters', 'comedy', 'forgot', 'laugh', 's', 'conceited', 'friends', 'scam', 'get', 'women', 'bed', 'sex', 'scenes', 'another', 'friend', 'semidiscustingly', 'weird', 'sometimes', 'also', 'scams', 'mainly', 'considered', 'guy', 'masterbates', 'friends', 'separately', 'meet', 'fall', 'woman', 'amanda', 'peet', 'somehow', 'done', 'without

# Algorithm

In [None]:
from collections import defaultdict

# Per-class word counters; missing words start at 0
positive_freq = defaultdict(int)
negative_freq = defaultdict(int)

# Route each review's tokens into the counter that belongs to its label
for tokens, label in zip(train_df["filtered_tokens"], train_df["label"]):
    class_counts = positive_freq if label == 1 else negative_freq
    for word in tokens:
        class_counts[word] += 1

# Convert to plain dicts so later lookups cannot insert new zero entries
positive_freq = dict(positive_freq)
negative_freq = dict(negative_freq)

# Ten most frequent (word, count) pairs per class
top_positive = sorted(positive_freq.items(), key=lambda pair: pair[1], reverse=True)[:10]
top_negative = sorted(negative_freq.items(), key=lambda pair: pair[1], reverse=True)[:10]
print("Top 10 words in Positive Reviews:", top_positive)
print("Top 10 words in Negative Reviews:", top_negative)


Top 10 words in Positive Reviews: [('br', 49235), ('s', 33247), ('film', 20636), ('movie', 18819), ('nt', 13418), ('one', 13363), ('like', 8808), ('good', 7536), ('story', 6676), ('great', 6375)]
Top 10 words in Negative Reviews: [('br', 52636), ('s', 30992), ('movie', 24580), ('nt', 19968), ('film', 18875), ('one', 12739), ('like', 10988), ('would', 7678), ('even', 7668), ('good', 7277)]


In [None]:
from collections import defaultdict
import pandas as pd

# positive_freq and negative_freq were already built by the previous cell;
# reuse them here instead of repeating the whole counting loop.

# Sort the vocabulary so the row order (and therefore .head()) is the same on
# every run; the iteration order of a set of strings is not guaranteed to be.
all_words = sorted(set(positive_freq) | set(negative_freq))

# One row per vocabulary word with its count in each class (0 when absent)
word_freq_df = pd.DataFrame({
    "word": all_words,
    "positive_count": [positive_freq.get(word, 0) for word in all_words],
    "negative_count": [negative_freq.get(word, 0) for word in all_words],
})

print("Top 10 words in Positive Reviews:")
print(word_freq_df.sort_values(by="positive_count", ascending=False).head(10))

print("Top 10 words in Negative Reviews:")
print(word_freq_df.sort_values(by="negative_count", ascending=False).head(10))

word_freq_df.head()


Top 10 words in Positive Reviews:
        word  positive_count  negative_count
57194     br           49235           52636
12261      s           33247           30992
56732   film           20636           18875
34455  movie           18819           24580
14000     nt           13418           19968
91990    one           13363           12739
85332   like            8808           10988
33774   good            7536            7277
14494  story            6676            5114
89895  great            6375            2628
Top 10 words in Negative Reviews:
        word  positive_count  negative_count
57194     br           49235           52636
12261      s           33247           30992
34455  movie           18819           24580
14000     nt           13418           19968
56732   film           20636           18875
91990    one           13363           12739
85332   like            8808           10988
75184  would            5716            7678
6880    even            4950    

Unnamed: 0,word,positive_count,negative_count
0,aborigin,1,0
1,showgirls,5,19
2,moorhead,0,1
3,chains,13,18
4,leaks,1,2


# Probability computation for Naive Bayes Classification

In [None]:
# Review counts per class; labels are 0/1, so their sum is the positive count
total_reviews = train_df.shape[0]
total_positive_reviews = int(train_df["label"].sum())
total_negative_reviews = total_reviews - total_positive_reviews

# Class priors: the share of training reviews in each class
P_positive, P_negative = (
    class_total / total_reviews
    for class_total in (total_positive_reviews, total_negative_reviews)
)

print(f"P(positive) = {P_positive:.4f}")
print(f"P(negative) = {P_negative:.4f}")


P(positive) = 0.5000
P(negative) = 0.5000


In [None]:
# Total number of tokens observed in each class
total_words_in_positive = sum(positive_freq.values())
total_words_in_negative = sum(negative_freq.values())

# Vocabulary = every word seen in either class
vocabulary = set(positive_freq) | set(negative_freq)
vocab_size = len(vocabulary)

# Laplace (add-one) smoothing denominators
positive_denominator = total_words_in_positive + vocab_size
negative_denominator = total_words_in_negative + vocab_size

# Smoothed P(word | class) for EVERY vocabulary word, so a word seen in only one
# class still has an explicit entry, (0 + 1) / denominator, in the other class.
word_prob_positive = {word: (positive_freq.get(word, 0) + 1) / positive_denominator for word in vocabulary}
word_prob_negative = {word: (negative_freq.get(word, 0) + 1) / negative_denominator for word in vocabulary}

sample_words = ["good", "bad", "amazing", "terrible", "boring"]
for word in sample_words:
    # Fall back to the smoothed zero-count probability, the same value
    # classify_review uses for a word missing from a dictionary.
    print(f"P({word} | positive) = {word_prob_positive.get(word, 1 / positive_denominator):.6f}")
    print(f"P({word} | negative) = {word_prob_negative.get(word, 1 / negative_denominator):.6f}")


P(good | positive) = 0.004408
P(good | negative) = 0.004387
P(bad | positive) = 0.001093
P(bad | negative) = 0.004382
P(amazing | positive) = 0.000618
P(amazing | negative) = 0.000159
P(terrible | positive) = 0.000143
P(terrible | negative) = 0.000837
P(boring | positive) = 0.000195
P(boring | negative) = 0.000887


# Naive Bayes Classification function

In [None]:
import math

def classify_review(review, P_positive, P_negative, word_prob_positive, word_prob_negative, vocab_size, threshold=0.1):
    """Classify a raw review string with Naive Bayes, scoring in log space.

    Args:
        review: Review text; it is tokenized, lowercased and cleaned here.
        P_positive: Prior probability of the positive class.
        P_negative: Prior probability of the negative class.
        word_prob_positive: Mapping word -> smoothed P(word | positive).
        word_prob_negative: Mapping word -> smoothed P(word | negative).
        vocab_size: Number of distinct training words (smoothing term).
        threshold: If the two class log-scores differ by less than this
            value, the review is reported as "Neutral".

    Returns:
        1 for positive, 0 for negative, or the string "Neutral". Callers must
        handle the string case explicitly.

    Notes:
        Reads the module-level ``total_words_in_positive`` and
        ``total_words_in_negative`` for the smoothing fallback, and uses the
        notebook's ``word_tokenize``, ``to_lowercase``, ``remove_non_alpha``
        and ``remove_non_words`` helpers.
    """
    tokens = remove_non_words(remove_non_alpha(to_lowercase(word_tokenize(review))))

    log_prob_positive = math.log(P_positive)
    log_prob_negative = math.log(P_negative)

    # Smoothed probability of a vocabulary word with zero count in a class
    unseen_in_positive = 1 / (total_words_in_positive + vocab_size)
    unseen_in_negative = 1 / (total_words_in_negative + vocab_size)

    for word in tokens:
        # A word absent from both dictionaries was never seen in training (this
        # includes the stop words removed during preprocessing). It carries no
        # evidence; scoring it would add a constant that differs between the
        # classes only because their token totals differ, so skip it.
        if word not in word_prob_positive and word not in word_prob_negative:
            continue

        log_prob_positive += math.log(word_prob_positive.get(word, unseen_in_positive))
        log_prob_negative += math.log(word_prob_negative.get(word, unseen_in_negative))

    if abs(log_prob_positive - log_prob_negative) < threshold:
        return "Neutral"

    return 1 if log_prob_positive > log_prob_negative else 0


In [None]:
import nltk
# Both resources were already fetched by earlier cells, so these calls only
# report that the packages are up to date.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def to_lowercase(words):
    """Return a new list holding the lowercase form of each word."""
    return list(map(lambda word: word.lower(), words))

def remove_non_alpha(words):
    """Strip non-ASCII-letter characters from each word and drop empty results.

    This definition replaces the earlier ``remove_non_alpha`` in the notebook,
    so it must clean text the same way the training tokens were cleaned:
    "40-year-old" becomes "yearold" and "n't" becomes "nt". Discarding such
    tokens outright would make inference-time tokens disagree with the
    vocabulary the word probabilities were estimated from.

    Args:
        words: Iterable of string tokens.

    Returns:
        List of cleaned, non-empty tokens in their original order.
    """
    cleaned_words = []
    for word in words:
        cleaned = re.sub(r'[^a-zA-Z]', '', word)
        if cleaned:
            cleaned_words.append(cleaned)
    return cleaned_words

def remove_non_words(words):
    """Keep only the entries that match ``^[a-zA-Z]+$``.

    Punctuation, digits, mixed tokens and empty strings are dropped.
    """
    kept = []
    for candidate in words:
        if re.match(r"^[a-zA-Z]+$", candidate) is None:
            continue
        kept.append(candidate)
    return kept


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Hand-written reviews for a quick qualitative check of the classifier
sample_reviews = [
    "This movie was amazing! The story was engaging and the acting was superb.",
    "Absolutely terrible! I regret watching it. The worst experience ever.",
    "The film was okay, nothing too special but not too bad either.",
    "Horrible direction and bad acting ruined what could have been a great film."
]

# classify_review returns 1, 0 or the string "Neutral"; show "Neutral" as
# itself instead of letting it fall through to "Negative".
sentiment_names = {1: "Positive", 0: "Negative"}

for review in sample_reviews:
    prediction = classify_review(review, P_positive, P_negative, word_prob_positive, word_prob_negative, vocab_size)
    sentiment = sentiment_names.get(prediction, str(prediction))
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")


Review: This movie was amazing! The story was engaging and the acting was superb.
Predicted Sentiment: Positive

Review: Absolutely terrible! I regret watching it. The worst experience ever.
Predicted Sentiment: Negative

Review: The film was okay, nothing too special but not too bad either.
Predicted Sentiment: Negative

Review: Horrible direction and bad acting ruined what could have been a great film.
Predicted Sentiment: Negative



In [None]:
# Inspect the preprocessed test split (pandas truncates the display of long frames)
test_df

Unnamed: 0,filtered_tokens,label
0,"[another, good, overcoming, evil, story, diffe...",1
1,"[footprints, certainly, nt, average, run, mill...",1
2,"[film, offers, one, greatest, experiences, ava...",1
3,"[movie, starts, three, people, play, ear, holi...",1
4,"[watched, movie, never, read, book, took, char...",1
...,...,...
24995,"[liked, whole, set, ceasar, s, palace, roman, ...",0
24996,"[combine, good, casting, bad, writing, good, o...",0
24997,"[s, yet, another, movie, dysfunctional, lead, ...",0
24998,"[well, shuck, sideways, nt, seen, home, movie,...",0


# Testing Classifier Accuracy

In [None]:
import random

# Evaluate on a small, fixed random sample of the test split. With only ten
# reviews, the reported accuracy is a rough spot check, not a benchmark.
random_reviews = test_df.sample(n=10, random_state=42)

# classify_review returns 1, 0 or the string "Neutral"
sentiment_names = {1: "Positive", 0: "Negative"}

correct_predictions = 0

for index, row in random_reviews.iterrows():
    review_tokens = row["filtered_tokens"]
    # classify_review expects raw text, so rebuild a string from the tokens
    review = " ".join(review_tokens)
    actual_label = row["label"]

    predicted_label = classify_review(review, P_positive, P_negative, word_prob_positive, word_prob_negative, vocab_size)
    is_neutral = isinstance(predicted_label, str)

    # Show "Neutral" as itself instead of letting it fall through to "Negative"
    sentiment_pred = sentiment_names.get(predicted_label, str(predicted_label))
    sentiment_actual = "Positive" if actual_label == 1 else "Negative"

    # A "Neutral" prediction never matches a 0/1 label; check it explicitly
    # rather than comparing a string with an integer label.
    if not is_neutral and predicted_label == actual_label:
        correct_predictions += 1

    print(f"Review: {review}\nActual Sentiment: {sentiment_actual}\nPredicted Sentiment: {sentiment_pred}\n")

accuracy = correct_predictions / len(random_reviews)
print(f"Classifier Accuracy: {accuracy:.2%}")



Review: saw film last night say loved every minute taken spirit parody bondesquire films s truly superior true comedy film blatant disregard political correctness misogyny cultural insensitivity almost laughable machoism films genre used major comic effect also calls illogic formulaic elements task agent oss constantly learning difficult things insanely quick arabic play traditional instrument missing pathetically obvious clues lines film left laughing hours movie finished say learned interesting french vocabulary would probably professors quite exasperated use thought film excellent intensely funny first film ve ever seen truly parodies aspects spy film
Actual Sentiment: Positive
Predicted Sentiment: Positive

Review: saw film edinburgh film festival would recommend two half hours long nothing much happens wadingthroughporridge pace br br main characters gormless totally lacking charisma personality noone smiles film neither would lives although domino seems healthy sexual appetite nt