## Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn


## Reading Data

In [2]:
df = pd.read_csv('../data/pre_process/IMDB Dataset.csv')

## Explore The Data

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
df.shape

(50000, 2)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [19]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [20]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [21]:
df.duplicated().sum()

np.int64(418)

In [22]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

## Cleaning the Data

In [4]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

## Preprocessing the Data

**1. LowerCase Text**

In [5]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


**2. Remove HTML Tags**

In [6]:
import re

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

# text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
# remove_html_tags(text)

**3. Remove URLs**

In [7]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

# text = "Visit https://example.com or http://test.com or www.website.org for more info."
# result = remove_url(text)
# print(result)

**4. Remove Punctuations**

In [8]:
import string

punc = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))

# text = "The quick brown fox jumps over the lazy dog. However, the dog doesn't seem impressed! Oh no, it just yawned. How disappointing! Maybe a squirrel would elicit a reaction. Alas, the fox is out of luck."
# remove_punc(text)

**5. Handling ChatWords**

In [9]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [10]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

# text = 'IMHO he is the best'
# print(chat_conversion(text))

**6. Spelling Correction**

In [11]:
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

sym_spell.load_dictionary("../data/Symspell/frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)


True

In [12]:
def correct_spell_symspell(text):
    corrected_text = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_text.append(suggestions[0].term if suggestions else word)
    return ' '.join(corrected_text)

In [13]:
# Kiểm tra hàm
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
print(correct_spell_symspell(incorrect_text))

certain conditions during several generations are modified in they same manner


**7. Handling StopWords**

In [14]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopword = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    
    return " ".join(new_text)

In [16]:
text = 'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times'
print(f'Text With Stop Words :{text}')

remove_stopwords(text)

Text With Stop Words :probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. it just never gets old, despite my having seen it some 15 or more times


'probably  all-time favorite movie,  story  selflessness, sacrifice  dedication   noble cause,    preachy  boring.   never gets old, despite   seen   15   times'

**8. Handling Emojies**

In [17]:
import emoji

def handling_emoji(text):
    return emoji.demojize(text)

text = "Loved the movie. It was 😘"
text1 = "Python is 🔥"

print(handling_emoji(text))
print(handling_emoji(text1))

Loved the movie. It was :face_blowing_a_kiss:
Python is :fire:


**9. Handling Acronyms**

In [18]:
import contractions

def handling_acronyms(text):
    return contractions.fix(text)

text = "You'll find that I'll help you if you don't give up."
print(handling_acronyms(text))

You will find that I will help you if you do not give up.


**10. Stemming**

In [19]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

In [20]:
# A single Sentence
st = "walk walks walking walked"
# Calling Function
stem_words(st)

'walk walk walk walk'

In [23]:
def clean_review(review):
    review = remove_html_tags(review)
    review = remove_url(review)
    review = handling_emoji(review)
    review = handling_acronyms(review)
    review = remove_punc(review)
    review = chat_conversion(review)
    review = remove_stopwords(review)
    review = stem_words(review)
    review = correct_spell_symspell(review)
    return review
    
    
df['review'] = df['review'].apply(clean_review)
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch episode hook right ex...,positive
1,wonder little product film technique unassum o...,positive
2,thought wonder way spend tear eye hot summer w...,positive
3,basic family little boy jake think zombie clos...,negative
4,petter matter love tear eye money visual stun ...,positive


In [24]:
from sklearn.preprocessing import LabelEncoder

reviews = df['review'].values
labels = df['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

print(encoded_labels)

[1 1 1 ... 0 0 0]


In [29]:
from sklearn.model_selection import train_test_split

train_reviews, test_reviews, train_labels, test_labels = train_test_split(reviews, encoded_labels, test_size=0.2, stratify = encoded_labels, random_state=18)

## Model

In [37]:
import math
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    def __init__(self, max_features=None):
        self.vocab = set()
        self.logprior = {}
        self.loglikelihood = {}
        self.max_features = max_features
        
        
    def fit(self, D, C):
        """
        Trains a Naive Bayes classifier.
        Args:
        - D: List of tuples (document, label) representing the dataset.
        - C: Set of classes.
        
        After fitting, it updates:
        - self.vocab: Vocabulary of all unique words in D.
        - self.logprior: Dictionary of log P(c) values for each class.
        - self.loglikelihood: Dictionary of log P(w|c) values for each word and class.
        """
        
        Ndoc = len(D)   # Total number of documents
        word_counts = {c: defaultdict(int) for c in C}   # Concatenated words per class
        Nc = {c: 0 for c in C}   # Document count per class
        
        # Count occurrences and polupate vocabulary
        for doc, doc_class in D:
            Nc[doc_class] += 1   # Count documents per class
            words = doc.split()
            for word in words:
                word_counts[doc_class][word] += 1 
                self.vocab.add(word)  # Add words to vocabulary
            
        # Limit vacubulary if max_features is set
        if self.max_features:
            all_words = Counter({word: sum(word_counts[c][word] for c in C) for word in self.vocab})
            self.vocab = set([word for word, _ in all_words.most_common(self.max_features)])
            
        # Calculate P(c) terms (logprior) and P(w|c) terms (loglikelihood) 
        for c in C:
            self.logprior[c] = math.log(Nc[c] / Ndoc)
            
            total_words = sum(word_counts[c][word] + 1 for word in self.vocab)
            self.loglikelihood[c] = {word: math.log((word_counts[c][word] + 1) / total_words) for word in self.vocab}
            
            
    def predict(self, testdoc, C):
        """
        Predicts the class of a given test document.
        Args:
        - testdoc: The test document (string).
        - C: Set of classes.

        Returns:
        - best_class: Predicted class for the test document.
        """
        sum_scores = {c: self.logprior[c] for c in C}   # Initialize with logprior
        
        for word in testdoc.split():
            if word in self.vocab:   # Only consider words in the vocabulary
                for c in C:
                    sum_scores[c] += self.loglikelihood[c].get(word, 0)
                    
        
        # Return the class with the highest score
        best_class = max(sum_scores, key=sum_scores.get)
        return best_class
        

In [38]:
def accuracy_score(true_labels, predicted_labels):
        """
        Calculates the accuracy of predictions.
        
        Args:
        - true_labels: List of true labels.
        - predicted_labels: List of predicted labels.
        
        Returns:
        - Accuracy as a float.
        """
        
        # Calculate the number of correct predictions
        correct_predictions = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == pred)
        
        # Calculate accuracy
        accuracy = correct_predictions / len(true_labels)
        return accuracy

In [39]:
nb_classifier = NaiveBayesClassifier()

nb_classifier.fit(list(zip(train_reviews, train_labels)), set(train_labels))

predicted_labels = [nb_classifier.predict(review, set(train_labels)) for review in test_reviews]

In [41]:
accuracy = accuracy_score(test_labels, predicted_labels)
print(accuracy)

0.8474336997075729


In [43]:
test_reviews

array(['think win bargain contest movie since got part martial art movie classic collect movie buck mean paid like cent chance watch black fist version movie release board basic revenge flick black fist int bad even though obvious hamper low budget one inform rule thumb watch movie lead actor better product screenplay movie automat get least three star certainly case lawson present charisma probably desert better film career got street fight choreography ostend reason film really impress anyone ever spar martial art school even punch schoolyard fight spent two year learn basic king fun even would never fall front stamp kick arm drag roundhouse punch display atmosphere good dust blood shout crowd actor put feel fight scene less believe plot dawson character leroy fisk portray streetsmart sharp young man go look work pickup fighter miller unsent street match yet surprise indian pay cop excuse rain small town iowa even knew watch hard tear eye chart bronson cop paid sort action guy fight 

In [42]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch episode hook right ex...,positive
1,wonder little product film technique unassum o...,positive
2,thought wonder way spend tear eye hot summer w...,positive
3,basic family little boy jake think zombie clos...,negative
4,petter matter love tear eye money visual stun ...,positive


In [44]:
df['review'][0]

'one review mention watch episode hook right exactly happen meth first thing struck brutal flinch scene violence set right word go trust show faint heart timid show pull punch regard drug sex violence hardcore classic use word call nickname given oswald maximum secure state penitentiary focus mainly emerald city expert section prison cell glass front face inward privacy high agenda pm city home manyaryan muslim gangsta latino christian italian irish moreno scuff death stare dog deal shade agreement never far away would say main appeal show due fact go show would dare forget pretty picture paint mainstream audience forget charm forget romance mess around first episode ever saw struck nasty surreal could say read watch develop last got accustom high level graphic violence violence injustice crook guard whole sold nickel inmate whole kill order get away well manner middle class inmate turn prison bitch due lack street skill prison expert watch may become comfort comfort viewingthat get to

In [45]:
nb_classifier.predict('one review mention watch episode hook right exactly happen meth first thing struck brutal flinch scene violence set right word go trust show faint heart timid show pull punch regard drug sex violence hardcore classic use word call nickname given oswald maximum secure state penitentiary focus mainly emerald city expert section prison cell glass front face inward privacy high agenda pm city home manyaryan muslim gangsta latino christian italian irish moreno scuff death stare dog deal shade agreement never far away would say main appeal show due fact go show would dare forget pretty picture paint mainstream audience forget charm forget romance mess around first episode ever saw struck nasty surreal could say read watch develop last got accustom high level graphic violence violence injustice crook guard whole sold nickel inmate whole kill order get away well manner middle class inmate turn prison bitch due lack street skill prison expert watch may become comfort comfort viewingthat get touch darker side', set(train_labels))

np.int64(1)