In [130]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer as ps
import igraph as ig
import tqdm

In [47]:
# Fetch the NLTK resources used by this notebook.
# quiet=True suppresses the per-package log lines that the downloader
# prints for every item in the collection on each run.
# NOTE(review): the cells visible here only appear to need the tokenizer models
# used by word_tokenize and the 'stopwords' corpus; 'all' is kept in case later
# cells depend on other resources -- consider narrowing it once that is confirmed.
nltk.download('all', quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\halan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\halan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\halan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\halan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\halan\AppData\Roaming\nltk_data...
[

True

In [None]:
# Scratch example: tokenize one sentence and pair up neighbouring tokens

text = "This is an example of a bigram in Python"

# Split the sentence into word tokens
tokens = word_tokenize(text)

# Materialise the generator of 2-grams into a list of tuples
bigrams = [*ngrams(tokens, 2)]

In [None]:
# Show the example sentence followed by its bigrams, one per line
print(text, bigrams, sep="\n")

This is an example of a bigram in Python
[('This', 'is'), ('is', 'an'), ('an', 'example'), ('example', 'of'), ('of', 'a'), ('a', 'bigram'), ('bigram', 'in'), ('in', 'Python')]


In [None]:
# Load the reviews CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='Sedona_Reviews_With_Sentiment.csv')

In [None]:
# Preview the review text column (pandas truncates the display of long Series)
data.loc[:, 'Review Text']

0      Clean secure apartments in a great location wi...
1      Just excellent people at the Main Office. Clau...
2      I have never written a review before, but I ju...
3      So far the experience has been great and the r...
4      Extremely nice people at The Sedona. My apartm...
                             ...                        
178    Was top but is steadily declining. Halls are d...
179    The apartments are well-appointed, the staff i...
180    Up to now, I have been happy at The Sedona. A ...
181    I absolutely love living at The Sedona! Everyt...
182    Very professional staff and they keep the livi...
Name: Review Text, Length: 183, dtype: object

In [99]:
# Tokenize every review
#
# word_tokenize breaks contractions apart at the apostrophe ("it" + "'s"),
# whereas TweetTokenizer keeps them whole ("it's"), so the latter is used here.

# TweetTokenizer is a class, so an instance has to be created before tokenizing
ttk = TweetTokenizer()

# One token list per review, in the same order as the 'Review Text' column
tokenizedText = [list(ttk.tokenize(review)) for review in data['Review Text']]

# Spot-check two of the tokenized reviews
print(tokenizedText[0], tokenizedText[6], sep="\n")

['Clean', 'secure', 'apartments', 'in', 'a', 'great', 'location', 'with', 'a', 'wonderful', 'staff', 'always', 'ready', 'to', 'assist', 'you', '.', 'Grounds', 'are', 'well', 'kept', 'and', 'overall', "it's", 'an', 'extremely', 'peaceful', 'facility', '.', 'The', 'one', 'issue', 'we', 'had', 'was', 'addressed', 'within', 'the', 'hour', 'with', 'after', 'hours', 'service', '.']
['Sedona', 'Apartment', 'Homes', 'is', 'the', 'place', 'to', 'be', 'called', 'home', '.', 'The', 'property', 'is', 'astonishing', '...', 'The', 'lakes', 'are', 'breathtaking', '!', 'Peaceful', 'and', 'quiet', '.', 'The', 'property', 'itself', 'is', 'well', 'located', 'but', 'it', "doesn't", 'feel', 'like', 'you', 'are', 'in', 'the', 'middle', 'of', 'the', 'traffic', 'chaos', 'area', '.', 'I', 'love', 'this', 'place', 'in', 'many', 'ways', '!', '!', '!']


In [124]:
# Removing stop words
# NOTE: this step only filters out stop words; no stemming or lemmatizing
# is applied to the tokens. Punctuation tokens are left in place.

# English stop words as a set, for fast membership checks
stopWords = set(stopwords.words('english'))

# For each tokenized review keep the tokens whose lowercase form is not a
# stop word; the original casing of the kept tokens is preserved.
filteredSentence = [
    [w for w in review if w.lower() not in stopWords]
    for review in tokenizedText
]

# Spot-check the same two reviews that were printed after tokenizing
print(filteredSentence[0])
print(filteredSentence[6])

['Clean', 'secure', 'apartments', 'great', 'location', 'wonderful', 'staff', 'always', 'ready', 'assist', '.', 'Grounds', 'well', 'kept', 'overall', 'extremely', 'peaceful', 'facility', '.', 'one', 'issue', 'addressed', 'within', 'hour', 'hours', 'service', '.']
['Sedona', 'Apartment', 'Homes', 'place', 'called', 'home', '.', 'property', 'astonishing', '...', 'lakes', 'breathtaking', '!', 'Peaceful', 'quiet', '.', 'property', 'well', 'located', 'feel', 'like', 'middle', 'traffic', 'chaos', 'area', '.', 'love', 'place', 'many', 'ways', '!', '!', '!']


In [None]:
# Creating the bigrams

# One list of bigrams (adjacent token pairs) per review:
#   bigramText    -- built from the full token lists
#   bigramTextLem -- built from the stop-word-filtered token lists
#                    (despite the name, these tokens are not lemmatized)
# Bigrams are formed over the token list as-is, so punctuation tokens take part
# in pairs such as ('.', 'Grounds').
bigramText = [list(ngrams(review, 2)) for review in tokenizedText]
bigramTextLem = [list(ngrams(review, 2)) for review in filteredSentence]

# Display the first review's filtered bigrams so the cell shows a result on re-run
bigramTextLem[0]

[('Clean', 'secure'),
 ('secure', 'apartments'),
 ('apartments', 'great'),
 ('great', 'location'),
 ('location', 'wonderful'),
 ('wonderful', 'staff'),
 ('staff', 'always'),
 ('always', 'ready'),
 ('ready', 'assist'),
 ('assist', '.'),
 ('.', 'Grounds'),
 ('Grounds', 'well'),
 ('well', 'kept'),
 ('kept', 'overall'),
 ('overall', 'extremely'),
 ('extremely', 'peaceful'),
 ('peaceful', 'facility'),
 ('facility', '.'),
 ('.', 'one'),
 ('one', 'issue'),
 ('issue', 'addressed'),
 ('addressed', 'within'),
 ('within', 'hour'),
 ('hour', 'hours'),
 ('hours', 'service'),
 ('service', '.')]