In [1]:
import zipfile
import os

# Path to the zip file
zip_file_path = './archive.zip'
# Path to store the extracted data
extract_dir = './movie_review_dataset/'

# Create the directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f'Files extracted to {extract_dir}')


Files extracted to ./movie_review_dataset/


In [2]:
import pandas as pd
csv_file_path = './movie_review_dataset/IMDB Dataset.csv'
# Read csv
df = pd.read_csv(csv_file_path)
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Extracting just the reviews
print(df['review'])

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object


In [4]:
# Importing NLTK libraries
import nltk
# Loading punkt library
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akaka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\akaka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
type(df['review'])

pandas.core.series.Series

In [6]:
def tokenize_each(review):
    tokens = nltk.word_tokenize(review)
    return tokens

df['tokens'] = df['review'].apply(tokenize_each)
print(len(df['tokens']))
print(len(df['tokens'][0]))

50000
380


In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
all_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akaka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
import re
def tokenize_with_normalization_steps(review):
    review = re.sub(r'<br /><br />', ' ', review)
    tokens = nltk.word_tokenize(review)

    # Text normalization steps as practiced in "Basic Text Transformation" notebook
    # Making everything lower case
    normalized_tokens = [re.sub(r'\W+', '', token.lower()) for token in tokens]
    # print(len(norrmalized_tokens))
    
    # Remove single-character tokens (mostly punctuation)
    normalized_tokens = [normalized_tokens for normalized_tokens in normalized_tokens if len(normalized_tokens) > 1]
    
    # Remove numbers
    normalized_tokens = [normalized_tokens for normalized_tokens in normalized_tokens if not normalized_tokens.isnumeric()]

    # Remove stopwords
    normalized_tokens = [normalized_tokens for normalized_tokens in normalized_tokens if normalized_tokens not in all_stopwords]

    return normalized_tokens

df['normalized_tokens'] = df['review'].apply(tokenize_with_normalization_steps)
print(len(df['normalized_tokens']))
print(len(df['normalized_tokens'][0]))

50000
167


In [9]:
print(df['tokens'][0])

['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'Oz', 'episode', 'you', "'ll", 'be', 'hooked', '.', 'They', 'are', 'right', ',', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'The', 'first', 'thing', 'that', 'struck', 'me', 'about', 'Oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', ',', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'GO', '.', 'Trust', 'me', ',', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', '.', 'This', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', ',', 'sex', 'or', 'violence', '.', 'Its', 'is', 'hardcore', ',', 'in', 'the', 'classic', 'use', 'of', 'the', 'word.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'It', 'is', 'called', 'OZ', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'Oswald', 'Maximum', 'Security', 'State', 'Penitentary', '.', 

In [10]:
print(df['normalized_tokens'][0])

['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence', 'hardcore', 'classic', 'use', 'word', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'many', 'aryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'scuffles', 'death', 'stares', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'would', 'nt', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'forget'

In [11]:
total_tokens = [token for token_list in df['tokens'] for token in token_list]

print("Total number of word tokens in the database before normalization steps: " + str(len(total_tokens)))

print("Vocabulary size (number of unique words) of the dataset: " + str(len(set(total_tokens))))

Total number of word tokens in the database before normalization steps: 13974186
Vocabulary size (number of unique words) of the dataset: 194756


In [12]:
total_tokens_normalized = [token for token_list in df['normalized_tokens'] for token in token_list]

print("Total number of word tokens in the database after normalization steps: " + str(len(total_tokens_normalized)))

print("Vocabulary size (number of unique words) of the dataset after normalization steps: " + str(len(set(total_tokens_normalized))))

Total number of word tokens in the database after normalization steps: 5910313
Vocabulary size (number of unique words) of the dataset after normalization steps: 136431


In [13]:
from nltk.util import bigrams
from nltk.util import trigrams
from nltk import FreqDist

def get_top_10_bigrams(tokens):
    bi_grams = list(bigrams(tokens))
    bi_gram_freq = FreqDist(bi_grams)
    return bi_gram_freq.most_common(10)

def get_top_10_trigrams(tokens):
    tri_grams = list(trigrams(tokens))
    tri_gram_freq = FreqDist(tri_grams)

    return tri_gram_freq.most_common(10)

In [14]:
positive_reviews = df[df['sentiment'] == 'positive']['normalized_tokens']
negative_reviews = df[df['sentiment'] == 'negative']['normalized_tokens']

top_10_bigrams_pos = get_top_10_bigrams([token for token_list in positive_reviews for token in token_list])
top_10_trigrams_pos = get_top_10_trigrams([token for token_list in positive_reviews for token in token_list])
top_10_bigrams_neg = get_top_10_bigrams([token for token_list in negative_reviews for token in token_list])
top_10_trigrams_neg = get_top_10_trigrams([token for token_list in negative_reviews for token in token_list])


In [15]:
print("Top 10 bigrams in the reviews tagged positive: " + str(top_10_bigrams_pos))
print("\n****************************************************************\n")
print("Top 10 trigrams in the reviews tagged positive: " + str(top_10_trigrams_pos))
print("\n****************************************************************\n")
print("Top 10 bigrams in the reviews tagged negative: " + str(top_10_bigrams_neg))
print("\n****************************************************************\n")
print("Top 10 trigrams in the reviews tagged negative: " + str(top_10_trigrams_neg))
print("\n****************************************************************\n")

Top 10 bigrams in the reviews tagged positive: [(('ca', 'nt'), 2858), (('one', 'best'), 1662), (('nt', 'know'), 1241), (('wo', 'nt'), 1215), (('even', 'though'), 1092), (('ever', 'seen'), 964), (('could', 'nt'), 942), (('first', 'time'), 925), (('new', 'york'), 846), (('nt', 'get'), 815)]

****************************************************************

Top 10 trigrams in the reviews tagged positive: [(('ca', 'nt', 'help'), 222), (('new', 'york', 'city'), 194), (('ca', 'nt', 'wait'), 172), (('world', 'war', 'ii'), 158), (('one', 'best', 'movies'), 144), (('based', 'true', 'story'), 133), (('movie', 'ever', 'seen'), 132), (('ca', 'nt', 'get'), 131), (('one', 'best', 'films'), 131), (('ca', 'nt', 'say'), 126)]

****************************************************************

Top 10 bigrams in the reviews tagged negative: [(('ca', 'nt'), 4172), (('nt', 'even'), 2233), (('could', 'nt'), 2096), (('ever', 'seen'), 1725), (('nt', 'know'), 1675), (('waste', 'time'), 1427), (('special', 'effe

In [16]:
import math

def calc_prob(w1, w2, w3, tokens, vocab_size):
    bi_grams = list(bigrams(tokens))
    bi_gram_freq = FreqDist(bi_grams)
    
    tri_grams = list(trigrams(tokens))
    tri_gram_freq = FreqDist(tri_grams)

    curr_bigram = (w1, w2)
    curr_trigram = (w1, w2, w3)

    # Adding 1 for laplace smoothing
    curr_bigram_freq = bi_gram_freq[curr_bigram] + 1
    curr_trigram_freq = tri_gram_freq[curr_trigram] + 1

    # Probability formula
    basic_probability = curr_trigram_freq/(curr_bigram_freq + vocab_size)
    log_probability = math.log2(basic_probability)
    return (log_probability, basic_probability)
    

In [19]:
# Test cases

trigram_test_1 = ("worst", "movie", "ever")
trigram_test_2 = ("new", "york", "city")
trigram_test_3 = ("hard", "to", "watch")
trigram_test_4 = ("wasted", "my", "time")
trigram_test_5 = ("based", "true", "story")

w1, w2, w3 = trigram_test_1

log_probability, basic_probability = calc_prob(w1, w2, w3, total_tokens_normalized, len(set(total_tokens_normalized)))

print("Basic probability is: " + str(basic_probability))
print("Log probability is: " + str(log_probability))

Basic probability is: 0.003397442440326038
Log probability is: -8.20133517568844


In [20]:
test_array = [trigram_test_1, trigram_test_2, trigram_test_3, trigram_test_4, trigram_test_5]
for test_case in test_array:
    w1, w2, w3 = test_case
    log_probability, basic_probability = calc_prob(w1, w2, w3, total_tokens_normalized, len(set(total_tokens_normalized)))
    print("Basic probability for trigram " + str(test_case) + " is: " + str(basic_probability))
    print("And its log probability is: " + str(log_probability))

Basic probability for trigram ('worst', 'movie', 'ever') is: 0.003397442440326038
And its log probability is: -8.20133517568844
Basic probability for trigram ('new', 'york', 'city') is: 0.001996602146165798
And its log probability is: -8.968237402524753
Basic probability for trigram ('hard', 'to', 'watch') is: 7.32965873108948e-06
And its log probability is: -17.057822541282476
Basic probability for trigram ('wasted', 'my', 'time') is: 7.32965873108948e-06
And its log probability is: -17.057822541282476
Basic probability for trigram ('based', 'true', 'story') is: 0.0012952988700895732
And its log probability is: -9.592499268875036


In [None]:
"""

The number of word tokens in the database. (1 point)
Ans: 
- Total number of word tokens in the database before normalization steps: 13974186
- Total number of word tokens in the database after normalization steps: 5910313

"""

In [None]:
"""

Vocabulary size (number of unique words) of the dataset. (1 point)
Ans:
- Vocabulary size (number of unique words) of the dataset: 194756
- Vocabulary size (number of unique words) of the dataset after normalization steps: 136431

"""

In [None]:
"""
Top ten bigrams and trigrams from positive and negative review sets, including the frequencies. (2 points)
Ans:
 - Top 10 bigrams in the reviews tagged positive: [(('ca', 'nt'), 2858), (('one', 'best'), 1662), (('nt', 'know'), 1241), 
 (('wo', 'nt'), 1215), (('even', 'though'), 1092), (('ever', 'seen'), 964), (('could', 'nt'), 942), 
 (('first', 'time'), 925), (('new', 'york'), 846), (('nt', 'get'), 815)]

****************************************************************

 - Top 10 trigrams in the reviews tagged positive: [(('ca', 'nt', 'help'), 222), (('new', 'york', 'city'), 194), 
 (('ca', 'nt', 'wait'), 172), (('world', 'war', 'ii'), 158), (('one', 'best', 'movies'), 144), 
 (('based', 'true', 'story'), 133), (('movie', 'ever', 'seen'), 132), (('ca', 'nt', 'get'), 131), 
 (('one', 'best', 'films'), 131), (('ca', 'nt', 'say'), 126)]

****************************************************************

 - Top 10 bigrams in the reviews tagged negative: [(('ca', 'nt'), 4172), (('nt', 'even'), 2233), 
 (('could', 'nt'), 2096), (('ever', 'seen'), 1725), (('nt', 'know'), 1675), (('waste', 'time'), 1427), 
 (('special', 'effects'), 1413), (('would', 'nt'), 1348), (('movie', 'nt'), 1266), (('looks', 'like'), 1231)]

****************************************************************

 - Top 10 trigrams in the reviews tagged negative: [(('worst', 'movie', 'ever'), 456), (('movie', 'ever', 'seen'), 392), 
(('nt', 'waste', 'time'), 389), (('ca', 'nt', 'believe'), 369), (('one', 'worst', 'movies'), 309), 
(('worst', 'movies', 'ever'), 280), (('movies', 'ever', 'seen'), 265), (('ca', 'nt', 'even'), 242), 
(('worst', 'film', 'ever'), 199), (('nt', 'make', 'sense'), 199)]

****************************************************************

"""

In [None]:
"""
Given a sequence of three words (w1,w2,w3), would compute the probability of the third word using trigram language model p(w3|w1,w2).
If you're using log-probabilities, use base 2 for computing logs. (5 points)
Ans:
 - calc_prob(w1, w2, w3, tokens, vocab_size)
"""

In [None]:
"""
Five test cases (sequence of three words) showing output from your trigram language model. (2 points)
Ans:
 - Test cases:
    1) trigram_test_1 = ("worst", "movie", "ever")
    2) trigram_test_2 = ("new", "york", "city")
    3) trigram_test_3 = ("hard", "to", "watch")
    4) trigram_test_4 = ("wasted", "my", "time")
    5) trigram_test_5 = ("based", "true", "story")
 - Results:
    1) Basic probability for trigram ('worst', 'movie', 'ever') is: 0.003397442440326038
        And its log probability is: -8.20133517568844
    2) Basic probability for trigram ('new', 'york', 'city') is: 0.001996602146165798
        And its log probability is: -8.968237402524753
    3) Basic probability for trigram ('hard', 'to', 'watch') is: 7.32965873108948e-06
        And its log probability is: -17.057822541282476
    4) Basic probability for trigram ('wasted', 'my', 'time') is: 7.32965873108948e-06
        And its log probability is: -17.057822541282476
    5) Basic probability for trigram ('based', 'true', 'story') is: 0.0012952988700895732
        And its log probability is: -9.592499268875036
"""