# Importing the necessary libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import nltk
import matplotlib.pyplot as plt # we only need pyplot
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
import random
import re
sb.set() # set the default Seaborn style for graphics

# Importing the dataset [ Need to change the location]

In [2]:
dataset = pd.read_csv('/Users/abhi/Downloads/reviewSelected100.csv')

# Removing Stop Words from the Tokenised sentence

In [3]:
def remove_stopwords(tokenized_sentence):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in tokenized_sentence if not w.lower() in stop_words]
    filtered_sentence = []
    for w in tokenized_sentence:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

# Tokenisation of a sentence

In [4]:
def tokenisation(sentence):
    word_tokens_with_stop = word_tokenize(sentence)
    word_tokens=remove_stopwords(word_tokens_with_stop)
    return word_tokens

# Lemmatization functions of sentence
### 1. Wordnet lemmatizer using nltk package

In [5]:
def wordnet_lemmatizer(sentence):
    tokenised_sentence=tokenisation(sentence)
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [lemmatizer.lemmatize(w) for w in tokenised_sentence]
    return(lemmatized_sentence)

### 2. Google Bert package

# Stemming functions of sentence
### 1. Porter Stemmer using nltk package

In [6]:
def Porter_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    porter = PorterStemmer()
    Porter_stemming_sentence = [porter.stem(w) for w in tokenised_sentence]
    return(Porter_stemming_sentence)

### 2. Lancaster Stemmer using nltk package

In [7]:
def Lancaster_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    lancaster=LancasterStemmer()
    Lancaster_stemming_sentence = [lancaster.stem(w) for w in tokenised_sentence]
    return(Lancaster_stemming_sentence)

### 3. Google Bert package

# POS Tagging functions of sentence
### 1. Using nltk package

In [8]:
def nltk_pos_tagging(sentence):
    tokenised_sentence=tokenisation(sentence)
    Pos_Tag_Sentence=nltk.pos_tag(tokenised_sentence)
    return Pos_Tag_Sentence

### 2. Using google bert package

# Other functions needed

In [9]:
def random_business_id(df):
    n = random.randint(0,len(df.business_id)) 
    return df.business_id[n]


In [18]:
def business_review_extracter(business_id_to_check):
    reviews_text=[]
    reviews_sentences=[]
    reviews_tokens=[]
    for i in range(0,len(dataset.business_id)):
        if dataset.business_id[i]==business_id_to_check:
            reviews_text.append(dataset.text[i])
    for i in range(len(reviews_text)):
        reviews_sentences.append(nltk.tokenize.sent_tokenize(reviews_text[i]))
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            reviews_tokens.append(tokenisation(reviews_sentences[i][j]))
    return reviews_text,reviews_sentences,reviews_tokens
#display word frequency distribution here

In [19]:
def business_lancaster(reviews_sentences):
    lancaster_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            lancaster_stemmed.append(Lancaster_stemming(reviews_sentences[i][j]))
    return lancaster_stemmed

In [28]:
def business_poter(reviews_sentences):
    Porter_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            Porter_stemmed.append(Porter_stemming(reviews_sentences[i][j]))
    return Porter_stemmed

In [29]:
def business_wordnet(reviews_sentences):
    wordnet_lemmatized=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            wordnet_lemmatized.append(wordnet_lemmatizer(reviews_sentences[i][j]))
    return wordnet_lemmatized

# 3.2 Dataset Analysis
### a) Tokenisation, Stemming and lemmatization

In [30]:
B1=random_business_id(dataset)
reviews_B1_text,reviews_B1_sentences,reviews_B1_tokens = business_review_extracter(B1)
#display word frequency distribution here
Porter_Stemming_B1=business_poter(reviews_B1_sentences)
Lancaster_Stemming_B1=business_lancaster(reviews_B1_sentences)
Wordnet_lematization_B1=business_wordnet(reviews_B1_sentences)
#display word frequency distribution here
B2=random_business_id(dataset)
while(B1!=B2):
    B2=random_business_id(dataset)
reviews_B2_text,reviews_B2_sentences,reviews_B2_tokens = business_review_extracter(B2)
#display word frequency distribution here
Porter_Stemming_B1=business_poter(reviews_B2_sentences)
Lancaster_Stemming_B1=business_lancaster(reviews_B2_sentences)
Wordnet_lematization_B2=business_wordnet(reviews_B2_sentences)
#display word frequency distribution here

### b) POS Tagging

In [58]:
n=random.sample(range(0, len(dataset.text)), 5)
target=[]
reviews_tokens=[]
reviews_sentences=[]
required_sentences=[]
for i in range(0,5):
    target.append(dataset.text[n[i]])
    reviews_sentences.append(nltk.tokenize.sent_tokenize(target[i]))
for i in range(0,5):
    required_sentences.append(reviews_sentences[i][0])
print(required_sentences)
for i in range(0,len(required_sentences)):
    required_sentences[i]=nltk_pos_tagging(required_sentences[i])
print(required_sentences)

['I got the usual Kabob but tried fish from my mother wish I has ordered that!', 'Let me start by saying how much I hate Las Vegas but am forced to attend an annual conference there.', 'This place is disorganized.', 'The place is nothing to look at, but the tacos are delicious!', 'I was happy to see this in Terminal 4 pre-security and have been eating here for years.']
[[('I', 'PRP'), ('got', 'VBD'), ('usual', 'JJ'), ('Kabob', 'NNP'), ('tried', 'VBD'), ('fish', 'JJ'), ('mother', 'NN'), ('wish', 'JJ'), ('I', 'PRP'), ('ordered', 'VBD'), ('!', '.')], [('Let', 'VB'), ('start', 'VB'), ('saying', 'VBG'), ('much', 'JJ'), ('I', 'PRP'), ('hate', 'VBP'), ('Las', 'NNP'), ('Vegas', 'NNP'), ('forced', 'VBD'), ('attend', 'JJ'), ('annual', 'JJ'), ('conference', 'NN'), ('.', '.')], [('This', 'DT'), ('place', 'NN'), ('disorganized', 'VBD'), ('.', '.')], [('The', 'DT'), ('place', 'NN'), ('nothing', 'NN'), ('look', 'NN'), (',', ','), ('tacos', 'RB'), ('delicious', 'JJ'), ('!', '.')], [('I', 'PRP'), ('hap