In [1]:
import pandas as pd
import numpy as np

In [2]:
import sqlite3

def load_imdb_dataframe(db_path: str = 'imdb_reviews.db', 
                       limit: int = None,
                       random_sample: bool = False) -> pd.DataFrame:
    """
    Load IMDB reviews from SQLite database into a pandas DataFrame.
    """
    conn = sqlite3.connect(db_path)
    
    if limit is None:
        query = "SELECT * FROM imdb_reviews"
    else:
        if random_sample:
            query = f"SELECT * FROM imdb_reviews ORDER BY RANDOM() LIMIT {limit}"
        else:
            query = f"SELECT * FROM imdb_reviews LIMIT {limit}"
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df


In [3]:
df = load_imdb_dataframe()
df.head()

Unnamed: 0,id,review_text,sentiment
0,1,One of the other reviewers has mentioned that ...,positive
1,2,A wonderful little production. <br /><br />The...,positive
2,3,I thought this was a wonderful way to spend ti...,positive
3,4,Basically there's a family where a little boy ...,negative
4,5,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df = df.rename(columns={'review_text': 'review'})
df.columns

Index(['id', 'review', 'sentiment'], dtype='object')

In [5]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         50000 non-null  int64 
 1   review     50000 non-null  object
 2   sentiment  50000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


### TEXT CLEANING

In [7]:
#Removing HTML Tags
import re
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [8]:
df['review']=df['review'].apply(clean_html)

In [9]:
#Converting to lowercase
def convert_lower(text):
    return text.lower()

In [10]:
df['review']=df['review'].apply(convert_lower)

In [11]:
#df['review'].value_counts()

In [12]:
#Removing Special Characters
def remove_special(text):
    x = ''
    for i in text:
        if i.isalnum():  # If the character is alphanumeric
            x = x + i  # Add the character to the result string
        else:
            x = x + ' '  # Replace special characters with a space
    return x


In [13]:
df['review']=df['review'].apply(remove_special)

In [14]:
# Remove stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
def remove_stopwords(text):
    # Tokenize the input text
    words = word_tokenize(text)
    
    # Get the set of stop words for English
    stop_words = set(stopwords.words('english'))
    
    # Filter out stop words from the tokenized words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join the filtered words back into a string and return
    return ' '.join(filtered_words)

In [16]:
df['review']=df['review'].apply(remove_stopwords)

In [17]:
df.sample(10)

Unnamed: 0,id,review,sentiment
13804,13805,keenan ivory wayans probably one worst directo...,negative
48616,48617,movie really deserves mst3k treatment pseudo a...,negative
15789,15790,lot already said movie like join praised highl...,positive
32495,32496,rented video piano teacher knowing nothing wri...,negative
11572,11573,film starts promise interaction spanky buckwhe...,positive
16722,16723,entertaining touching version classic tale als...,positive
39877,39878,plot synopsis los angeles future crime kept co...,negative
43870,43871,dev anand prashant zeenat aman jasbir janice s...,positive
43478,43479,director lead actor dutcher revels look film w...,negative
25262,25263,child like puppeteer public access children sh...,negative


In [18]:
from nltk.stem.porter import PorterStemmer

In [19]:
def perform_stemming(text):
    # Tokenize the input text into words
    words = word_tokenize(text)
    
    # Initialize the PorterStemmer
    stemmer = PorterStemmer()
    
    # Stem each word in the list
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    return ' '.join(stemmed_words)


In [20]:
df['review']=df['review'].apply(perform_stemming)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer( max_features=1400)

In [22]:
X = cv.fit_transform(df['review']).toarray()

In [23]:
X.shape

(50000, 1400)

In [24]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(50000, 1400))

In [25]:
y=df.iloc[:,-1].values

In [26]:
y.shape

(50000,)

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test= train_test_split(X,y, test_size=0.2)

In [28]:
X_train.shape

(40000, 1400)

In [29]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [30]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3= BernoulliNB()

In [31]:
clf1.fit(X_train,y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)

In [32]:
y_pred1= clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3= clf3.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
print("Gaussian",accuracy_score(y_test,y_pred1))
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernoulli",accuracy_score(y_test,y_pred2))

Gaussian 0.7676
Multinomial 0.8364
Bernoulli 0.8364


In [35]:
df

Unnamed: 0,id,review,sentiment
0,1,one review mention watch 1 oz episod hook righ...,positive
1,2,wonder littl product film techniqu unassum old...,positive
2,3,thought wonder way spend time hot summer weeke...,positive
3,4,basic famili littl boy jake think zombi closet...,negative
4,5,petter mattei love time money visual stun film...,positive
...,...,...,...
49995,49996,thought movi right good job creativ origin fir...,positive
49996,49997,bad plot bad dialogu bad act idiot direct anno...,negative
49997,49998,cathol taught parochi elementari school nun ta...,negative
49998,49999,go disagre previou comment side maltin one sec...,negative


In [36]:
# import pickle

# with open('df.pkl', 'wb') as file:
#     pickle.dump(df, file)

# print("Data has been pickled and saved to 'df.pkl'")


In [37]:
import pickle

# Save both model and vectorizer together
with open('model.pkl', 'wb') as file:
    pickle.dump((clf2, cv), file)  # Save as a tuple containing both objects

print("Model and vectorizer have been saved to 'model.pkl'")

Model and vectorizer have been saved to 'model.pkl'


In [38]:
# Optional: Test loading
with open('model.pkl', 'rb') as file:
    loaded_model, loaded_vectorizer = pickle.load(file)
print("Test load successful!")

# Verify the loaded objects
print(f"Vectorizer vocabulary size: {len(loaded_vectorizer.vocabulary_)}")
print(f"Model classes: {loaded_model.classes_}")

Test load successful!
Vectorizer vocabulary size: 1400
Model classes: ['negative' 'positive']
