In [4]:
# download nltk to do the language processing jobs for us
import sys
#!{sys.executable} -m pip install nltk
#!{sys.executable} -m pip install pandas

# import pandas to have our data in a table and read it from the csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown
import csv
nltk.download('stopwords')
nltk.download('punkt') # this is a tokenizer tool we need
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eoin0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eoin0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\eoin0\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
disney_data = pd.read_csv("disneylandReviews.csv")

In [6]:
disney_data.head()

Unnamed: 0,Review_Text
0,If you've ever been to Disneyland anywhere you...
1,Its been a while since d last time we visit HK...
2,Thanks God it wasn t too hot or too humid wh...
3,HK Disneyland is a great compact park. Unfortu...
4,"the location is not in the city, took around 1..."


In [7]:
#selecting the first review
example_review = disney_data["Review_Text"][0]
print("Review Raw Text: %s \n" % example_review)

#tokenize the text
tokens = nltk.word_tokenize(example_review)
print("Tokenized review text: %s" % tokens)

Review Raw Text: If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.  

Tokenized review text: ['If', 'you', "'ve", 'ever', 'been', 'to', 'Disneyland', 'anywhere', 'you', "'ll", 'find', 'Disneyland', 'Hong', 'Kong', 'very', 'similar', 'in', 'the', 'layout', 'when', 'you', 'walk', 'into', 'main', 'street', '!', 'It', 'has', 'a', 'very', 'familiar', 'feel', '.', 'One', 'of', 'the', 'rides', 'its', 'a', 'Small', 'World', 'is', 'absolutely', 'fabulous', 'and', 'worth', 'doing', '.', 'The', 'day', 'we', 'visited', 'was', 'fairly', 'hot', 'and', 'relatively', 'busy', 'but', 'the', 'queues', 'moved', 'fairly', 'well', '.']


In [8]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
filtered_sentence = [w for w in tokens if not w.lower() in stop_words]

# also lets remove duplicates
filtered_sentence = set(filtered_sentence)
print(filtered_sentence)

{'walk', '!', 'Kong', 'Hong', 'layout', 'street', 'anywhere', 'hot', 'queues', 'ever', 'fabulous', 'rides', 'feel', 'relatively', 'well', 'Disneyland', 'main', 'similar', 'find', 'World', 'Small', 'day', '.', "'ve", "'ll", 'fairly', 'busy', 'moved', 'visited', 'absolutely', 'familiar', 'One', 'worth'}


In [10]:
# function to test if something is a noun
is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'

nouns_and_adjectives = [word for (word, pos) in nltk.pos_tag(filtered_sentence) if is_noun(pos)] 
print(nouns_and_adjectives)

['walk', 'Kong', 'Hong', 'layout', 'street', 'hot', 'queues', 'fabulous', 'rides', 'Disneyland', 'main', 'similar', 'find', 'World', 'Small', 'day', 'busy', 'visited', 'familiar', 'worth']


In [11]:
# this will create a dictionary of each term and the corresponding list of documents
inverted_index = {index: [1] for index in nouns_and_adjectives}
print(inverted_index)

{'walk': [1], 'Kong': [1], 'Hong': [1], 'layout': [1], 'street': [1], 'hot': [1], 'queues': [1], 'fabulous': [1], 'rides': [1], 'Disneyland': [1], 'main': [1], 'similar': [1], 'find': [1], 'World': [1], 'Small': [1], 'day': [1], 'busy': [1], 'visited': [1], 'familiar': [1], 'worth': [1]}


In [12]:
# get the list of stop words
stop_words = stopwords.words('english')
# function to test if something is a noun
is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
counter=0
inverted_index={}
for review in disney_data["Review_Text"] : 
    # tokenize the individual review
    tokens = nltk.word_tokenize(review)
    
    # remove the stop words
    filtered_sentence = [w for w in tokens if not w.lower() in stop_words]
    
    # also lets remove duplicates
    filtered_sentence = set(filtered_sentence)
    
   
    nouns_and_adjectives = [word for (word, pos) in nltk.pos_tag(filtered_sentence) if is_noun(pos)] 
    
    # this will create a dictionary of each term and the corresponding list of documents
    for index in nouns_and_adjectives:
        try:
            inverted_index[index].append(counter)
        except: 
            inverted_index[index] = [counter]
    
    counter+=1

In [13]:
print(inverted_index)

{'walk': [0, 15, 34, 106, 160], 'Kong': [0, 4, 5, 21, 22, 23, 26, 27, 32, 33, 34, 35, 37, 44, 49, 51, 57, 58, 68, 71, 74, 77, 82, 89, 90, 93, 96, 98, 103, 120, 128, 139, 142, 143, 144, 148, 151, 156, 163, 164, 166, 172, 175, 177, 181, 189, 191, 199], 'Hong': [0, 4, 5, 21, 22, 23, 26, 27, 32, 33, 34, 35, 36, 37, 44, 49, 51, 57, 58, 68, 71, 74, 77, 78, 82, 89, 90, 93, 96, 98, 120, 128, 139, 142, 143, 144, 148, 151, 156, 163, 164, 166, 172, 175, 177, 181, 189, 191, 199], 'layout': [0], 'street': [0, 16, 37, 46, 90, 96, 126, 148, 171], 'hot': [0, 2, 4, 12, 29, 138, 198], 'queues': [0, 42, 83, 87, 90, 93, 98, 105, 114, 129, 144, 148, 157], 'fabulous': [0, 83, 148], 'rides': [0, 2, 5, 8, 14, 15, 21, 26, 27, 28, 29, 32, 34, 35, 37, 40, 45, 46, 48, 51, 54, 62, 70, 71, 72, 73, 74, 76, 77, 80, 81, 83, 85, 86, 87, 88, 92, 98, 104, 105, 106, 110, 114, 118, 122, 124, 125, 131, 132, 137, 143, 144, 149, 150, 155, 157, 158, 160, 161, 163, 166, 167, 180, 181, 182, 183, 187, 193, 196], 'Disneyland': [0,

In [14]:
with open('inverted_index.csv', 'w') as f:
    for key in inverted_index.keys():
        f.write("%s, %s\n" % (key,inverted_index[key]))

In [15]:
#review_content
movie_data = pd.read_csv("rotten_tomatoes_critic_reviews.csv")

In [16]:
movie_data.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [17]:
#selecting the first review
example_review = movie_data["review_content"][0]
print("Review Raw Text: %s \n" % example_review)

#tokenize the text
tokens = nltk.word_tokenize(example_review)
print("Tokenized review text: %s" % tokens)

Review Raw Text: A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle 

Tokenized review text: ['A', 'fantasy', 'adventure', 'that', 'fuses', 'Greek', 'mythology', 'to', 'contemporary', 'American', 'places', 'and', 'values', '.', 'Anyone', 'around', '15', '(', 'give', 'or', 'take', 'a', 'couple', 'of', 'years', ')', 'will', 'thrill', 'to', 'the', 'visual', 'spectacle']


In [18]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
filtered_sentence = [w for w in tokens if not w.lower() in stop_words]

# also lets remove duplicates
filtered_sentence = set(filtered_sentence)
print(filtered_sentence)

{'values', 'Anyone', 'places', 'mythology', 'adventure', 'thrill', 'couple', 'fantasy', 'visual', 'American', '15', 'Greek', ')', 'years', '(', 'spectacle', 'take', 'around', 'fuses', 'contemporary', '.', 'give'}


In [20]:
# function to test if something is a noun
is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'

nouns_and_adjectives = [word for (word, pos) in nltk.pos_tag(filtered_sentence) if is_noun(pos)] 
print(nouns_and_adjectives)

['values', 'Anyone', 'mythology', 'adventure', 'thrill', 'couple', 'fantasy', 'visual', 'American', 'Greek', 'years', 'spectacle', 'fuses', 'contemporary']


In [21]:
# this will create a dictionary of each term and the corresponding list of documents
inverted_index = {index: [1] for index in nouns_and_adjectives}
print(inverted_index)

{'values': [1], 'Anyone': [1], 'mythology': [1], 'adventure': [1], 'thrill': [1], 'couple': [1], 'fantasy': [1], 'visual': [1], 'American': [1], 'Greek': [1], 'years': [1], 'spectacle': [1], 'fuses': [1], 'contemporary': [1]}


In [22]:
# now save this to a file
with open('inverted_index.csv', 'w') as f:
    for key in inverted_index.keys():
        f.write("%s, %s\n" % (key,inverted_index[key]))

In [23]:
# get the list of stop words
stop_words = stopwords.words('english')
# function to test if something is a noun
is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
counter=0
inverted_index={}
for review in disney_data["Review_Text"] : 
    # tokenize the individual review
    tokens = nltk.word_tokenize(review)
    
    # remove the stop words
    filtered_sentence = [w for w in tokens if not w.lower() in stop_words]
    
    # also lets remove duplicates
    filtered_sentence = set(filtered_sentence)
    
   
    nouns_and_adjectives = [word for (word, pos) in nltk.pos_tag(filtered_sentence) if is_noun(pos)] 
    
    # this will create a dictionary of each term and the corresponding list of documents
    for index in nouns_and_adjectives:
        try:
            inverted_index[index].append(counter)
        except: 
            inverted_index[index] = [counter]
    
    counter+=1

In [24]:
print(inverted_index)

{'walk': [0, 15, 34, 106, 160], 'Kong': [0, 4, 5, 21, 22, 23, 26, 27, 32, 33, 34, 35, 37, 44, 49, 51, 57, 58, 68, 71, 74, 77, 82, 89, 90, 93, 96, 98, 103, 120, 128, 139, 142, 143, 144, 148, 151, 156, 163, 164, 166, 172, 175, 177, 181, 189, 191, 199], 'Hong': [0, 4, 5, 21, 22, 23, 26, 27, 32, 33, 34, 35, 36, 37, 44, 49, 51, 57, 58, 68, 71, 74, 77, 78, 82, 89, 90, 93, 96, 98, 120, 128, 139, 142, 143, 144, 148, 151, 156, 163, 164, 166, 172, 175, 177, 181, 189, 191, 199], 'layout': [0], 'street': [0, 16, 37, 46, 90, 96, 126, 148, 171], 'hot': [0, 2, 4, 12, 29, 138, 198], 'queues': [0, 42, 83, 87, 90, 93, 98, 105, 114, 129, 144, 148, 157], 'fabulous': [0, 83, 148], 'rides': [0, 2, 5, 8, 14, 15, 21, 26, 27, 28, 29, 32, 34, 35, 37, 40, 45, 46, 48, 51, 54, 62, 70, 71, 72, 73, 74, 76, 77, 80, 81, 83, 85, 86, 87, 88, 92, 98, 104, 105, 106, 110, 114, 118, 122, 124, 125, 131, 132, 137, 143, 144, 149, 150, 155, 157, 158, 160, 161, 163, 166, 167, 180, 181, 182, 183, 187, 193, 196], 'Disneyland': [0,

In [25]:
with open('inverted_index.csv', 'w') as f:
    for key in inverted_index.keys():
        f.write("%s, %s\n" % (key,inverted_index[key]))