In [37]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df = pd.read_csv('IMDB Dataset.csv')
df = df.iloc[0:5000]
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
4995,An interesting slasher film with multiple susp...,negative
4996,i watched this series when it first came out i...,positive
4997,Once again Jet Li brings his charismatic prese...,positive
4998,"I rented this movie, after hearing Chris Gore ...",negative


In [4]:
#lower casing
df['review'] = df['review'].apply(lambda review: review.lower())

#html tag removal
df['review'] = df['review'].apply(lambda review: BeautifulSoup(review, 'html.parser').get_text())

#url removal 
pattern = r'http[s]?://\S+|www\.\S+'
df['review'] = df['review'].apply(lambda review: re.sub(pattern, '', review))

df.head(10)

  df['review'] = df['review'].apply(lambda review: BeautifulSoup(review, 'html.parser').get_text())


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [5]:
# remove punctuations
df['review'] = df['review'].apply(lambda review: review.translate(str.maketrans('', '', string.punctuation)))

#remove stop words
def remove_stopwords(text):
    stopwords_list = set(stopwords.words('english'))
    filtered_words = []
    for word in text.split():
        if(word not in stopwords_list):
            filtered_words.append(word)
    return ' '.join(filtered_words)


df['review'] = df['review'].apply(remove_stopwords)


df.head(10)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
5,probably alltime favorite movie story selfless...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea 70s first a...,negative
8,encouraged positive comments film looking forw...,negative
9,like original gut wrenching laughter like movi...,positive


In [6]:
# tokeization
df['review'] = df['review'].apply(lambda review: word_tokenize(review))
df.head(10)

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive
5,"[probably, alltime, favorite, movie, story, se...",positive
6,"[sure, would, like, see, resurrection, dated, ...",positive
7,"[show, amazing, fresh, innovative, idea, 70s, ...",negative
8,"[encouraged, positive, comments, film, looking...",negative
9,"[like, original, gut, wrenching, laughter, lik...",positive


In [7]:
# Stemming
stemmer = SnowballStemmer('english')

def stem(wordsList):
    stemmedWords = []
    for word in wordsList:
        stemmedWords.append(stemmer.stem(word))
    return stemmedWords

df['review'] = df['review'].apply(stem)

df.head(10)

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",positive
1,"[wonder, littl, product, film, techniqu, unass...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, there, famili, littl, boy, jake, think...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive
5,"[probabl, alltim, favorit, movi, stori, selfle...",positive
6,"[sure, would, like, see, resurrect, date, seah...",positive
7,"[show, amaz, fresh, innov, idea, 70s, first, a...",negative
8,"[encourag, posit, comment, film, look, forward...",negative
9,"[like, origin, gut, wrench, laughter, like, mo...",positive


In [27]:
# corpus and vocabulary building
corpus = []
vocabulary = []
data = []

for reviewList in df['review']:
    corpus.extend(reviewList)
    data.append(reviewList)

vocabulary = set(corpus)
data =  np.array(df['review'].apply(lambda x: x[0])).reshape(-1, 1)

print(len(corpus))
print(len(vocabulary))

599987
40993


In [36]:
# applying One Hot Encoding

encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(data)

print(encoder.categories_)

[array(['1', '10', '13', ..., 'zu', 'zuth', 'zzzzzzzzzzzzzzzzzz'],
      shape=(1663,), dtype=object)]


In [40]:
# bag of words approach
data = []
for l in df['review']:
    sen = ' '.join(l)
    data.append(sen)

vectorizer = CountVectorizer()
encoded_data = vectorizer.fit_transform(data)

vocabulary = vectorizer.get_feature_names_out()
print(encoded_data)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 478500 stored elements and shape (5000, 40889)>
  Coords	Values
  (0, 25635)	1
  (0, 29945)	1
  (0, 22672)	1
  (0, 39164)	3
  (0, 26317)	5
  (0, 11788)	2
  (0, 40636)	1
  (0, 17155)	1
  (0, 30098)	2
  (0, 12135)	1
  (0, 15990)	1
  (0, 22790)	1
  (0, 13288)	2
  (0, 36059)	1
  (0, 34518)	2
  (0, 5385)	1
  (0, 37837)	1
  (0, 31130)	1
  (0, 38738)	4
  (0, 31949)	1
  (0, 40126)	1
  (0, 14940)	1
  (0, 37224)	1
  (0, 32379)	4
  (0, 12528)	1
  :	:
  (4998, 8836)	2
  (4998, 10282)	1
  (4998, 29195)	1
  (4998, 22279)	1
  (4998, 40447)	1
  (4998, 19132)	1
  (4998, 10296)	1
  (4998, 38063)	1
  (4999, 12037)	1
  (4999, 14832)	1
  (4999, 10007)	1
  (4999, 36091)	1
  (4999, 23753)	3
  (4999, 21766)	2
  (4999, 4290)	1
  (4999, 16047)	1
  (4999, 21633)	1
  (4999, 40252)	1
  (4999, 37064)	1
  (4999, 22179)	1
  (4999, 37464)	1
  (4999, 595)	1
  (4999, 8153)	1
  (4999, 2783)	1
  (4999, 22195)	1


In [44]:
# m-gram approach

def apply_ngrams(n):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    X = vectorizer.fit_transform(data)
    dense_matrix = X.toarray()
    vocabulary = vectorizer.get_feature_names_out()
    word_counts_df = pd.DataFrame(dense_matrix, columns=vocabulary)
    return vocabulary, word_counts_df

bi_vocab, bi_word_counts_df = apply_ngrams(2)

print("Bi-gram Vocabulary:", bi_vocab)
print("Bi-gram Word Counts:")
print(bi_word_counts_df)


Bi-gram Vocabulary: ['007 agent' '007 blue' '007 debut' ... 'ïn america' 'önsjön came'
 'überwoman snobbi']
Bi-gram Word Counts:
      007 agent  007 blue  007 debut  007 fatima  007 formid  007 game  \
0             0         0          0           0           0         0   
1             0         0          0           0           0         0   
2             0         0          0           0           0         0   
3             0         0          0           0           0         0   
4             0         0          0           0           0         0   
...         ...       ...        ...         ...         ...       ...   
4995          0         0          0           0           0         0   
4996          0         0          0           0           0         0   
4997          0         0          0           0           0         0   
4998          0         0          0           0           0         0   
4999          0         0          0           0         