# Importing the necessary libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import nltk
import matplotlib.pyplot as plt # we only need pyplot
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
import random
import re
sb.set() # set the default Seaborn style for graphics

In [2]:
import os
from  nltk.parse  import CoreNLPParser
java_path = "C:\\Program Files\\Java\\jdk-14.0.2\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

In [3]:
import scipy as sc
from collections import Counter

# Importing the dataset [ Need to change the location]

In [4]:
dataset = pd.read_csv('reviewSelected100.csv')

# Removing Stop Words from the Tokenised sentence

In [5]:
def remove_stopwords(tokenized_sentence):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in tokenized_sentence if not w.lower() in stop_words]
    filtered_sentence = []
    for w in tokenized_sentence:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

# Tokenisation of a sentence

In [6]:
def tokenisation(sentence):
    word_tokens_with_stop = word_tokenize(sentence)
    word_tokens=remove_stopwords(word_tokens_with_stop)
    return word_tokens

# Lemmatization functions of sentence
### 1. Wordnet lemmatizer using nltk package

In [7]:
def wordnet_lemmatizer(sentence):
    tokenised_sentence=tokenisation(sentence)
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [lemmatizer.lemmatize(w) for w in tokenised_sentence]
    return(lemmatized_sentence)

### 2. Google Bert package

# Stemming functions of sentence
### 1. Porter Stemmer using nltk package

In [8]:
def Porter_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    porter = PorterStemmer()
    Porter_stemming_sentence = [porter.stem(w) for w in tokenised_sentence]
    return(Porter_stemming_sentence)

### 2. Lancaster Stemmer using nltk package

In [9]:
def Lancaster_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    lancaster=LancasterStemmer()
    Lancaster_stemming_sentence = [lancaster.stem(w) for w in tokenised_sentence]
    return(Lancaster_stemming_sentence)

### 3. Google Bert package

# POS Tagging functions of sentence
### 1. Using nltk package

In [10]:
def nltk_pos_tagging(sentence):
    tokenised_sentence=tokenisation(sentence)
    Pos_Tag_Sentence=nltk.pos_tag(tokenised_sentence)
    return Pos_Tag_Sentence

### 2. Using google bert package

# Other functions needed

In [11]:
def random_business_id(df):
    n = random.randint(0,len(df.business_id)) 
    return df.business_id[n]


In [12]:
def business_review_extracter(business_id_to_check):
    reviews_text=[]
    reviews_sentences=[]
    reviews_tokens=[]
    for i in range(0,len(dataset.business_id)):
        if dataset.business_id[i]==business_id_to_check:
            reviews_text.append(dataset.text[i])
    for i in range(len(reviews_text)):
        reviews_sentences.append(nltk.tokenize.sent_tokenize(reviews_text[i]))
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            reviews_tokens.append(tokenisation(reviews_sentences[i][j]))
    return reviews_text,reviews_sentences,reviews_tokens
#display word frequency distribution here

In [13]:
def business_lancaster(reviews_sentences):
    lancaster_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            lancaster_stemmed.append(Lancaster_stemming(reviews_sentences[i][j]))
    return lancaster_stemmed

In [14]:
def business_poter(reviews_sentences):
    Porter_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            Porter_stemmed.append(Porter_stemming(reviews_sentences[i][j]))
    return Porter_stemmed

In [15]:
def business_wordnet(reviews_sentences):
    wordnet_lemmatized=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            wordnet_lemmatized.append(wordnet_lemmatizer(reviews_sentences[i][j]))
    return wordnet_lemmatized

# 3.2 Dataset Analysis
### a) Tokenisation, Stemming and lemmatization

In [16]:
B1=random_business_id(dataset)
reviews_B1_text,reviews_B1_sentences,reviews_B1_tokens = business_review_extracter(B1)
#display word frequency distribution here
Porter_Stemming_B1=business_poter(reviews_B1_sentences)
Lancaster_Stemming_B1=business_lancaster(reviews_B1_sentences)
Wordnet_lematization_B1=business_wordnet(reviews_B1_sentences)
#display word frequency distribution here
B2=random_business_id(dataset)
while(B1!=B2):
    B2=random_business_id(dataset)
reviews_B2_text,reviews_B2_sentences,reviews_B2_tokens = business_review_extracter(B2)
#display word frequency distribution here
Porter_Stemming_B1=business_poter(reviews_B2_sentences)
Lancaster_Stemming_B1=business_lancaster(reviews_B2_sentences)
Wordnet_lematization_B2=business_wordnet(reviews_B2_sentences)
#display word frequency distribution here

### b) POS Tagging

In [17]:
n=random.sample(range(0, len(dataset.text)), 5)
target=[]
reviews_tokens=[]
reviews_sentences=[]
required_sentences=[]
for i in range(0,5):
    target.append(dataset.text[n[i]])
    reviews_sentences.append(nltk.tokenize.sent_tokenize(target[i]))
for i in range(0,5):
    required_sentences.append(reviews_sentences[i][0])
print(required_sentences)
for i in range(0,len(required_sentences)):
    required_sentences[i]=nltk_pos_tagging(required_sentences[i])
print(required_sentences)

['The Hashbrown Burrito not only satisfied my craving for a breakfast burrito, it exceeded my expectations!', 'We were immediately greeted and offered a free sample of the their bread.', 'Super simple place but amazing nonetheless.', 'Not much to say.', 'Had an amazing experience starting yesterday (5/31/17) and ending with a release today in the Surgical unit for a total hip replacement.']
[[('The', 'DT'), ('Hashbrown', 'NNP'), ('Burrito', 'NNP'), ('satisfied', 'VBD'), ('craving', 'VBG'), ('breakfast', 'NN'), ('burrito', 'NN'), (',', ','), ('exceeded', 'VBD'), ('expectations', 'NNS'), ('!', '.')], [('We', 'PRP'), ('immediately', 'RB'), ('greeted', 'VBD'), ('offered', 'VBN'), ('free', 'JJ'), ('sample', 'NN'), ('bread', 'NN'), ('.', '.')], [('Super', 'NNP'), ('simple', 'JJ'), ('place', 'NN'), ('amazing', 'VBG'), ('nonetheless', 'RB'), ('.', '.')], [('Not', 'RB'), ('much', 'JJ'), ('say', 'VBP'), ('.', '.')], [('Had', 'NNP'), ('amazing', 'JJ'), ('experience', 'NN'), ('starting', 'VBG'), (

# 3.3 Indicative Adjective Phrases

## Trying out Regex based Chunking for 3.3


In [None]:
def find_adjective_phrases(pos_sentence):
    adjective_phrases=[]
    chunk_grammer="AP: {<JJ.*>*}" 
    chunk_parser = nltk.RegexpParser(chunk_grammer)
    tree=chunk_parser.parse(pos_sentence)
    for subtree in tree.subtrees():
        if subtree.label()=="AP":
            adjective_phrases.append(subtree)
            
    return adjective_phrases

In [None]:
sentence="the little yellow dog barked at the cat"
pos_sentence=nltk_pos_tagging(sentence)
adjective_phrases=find_adjective_phrases(pos_sentence)
print(adjective_phrases)

## Trying out Training tagger based Chunker for 3.3

In [None]:
def conll_tag_chunks(chunk_data):
    #using the tree2conlltags but this only returns IOB tags for NP, PP and VP
    tagged_data = [tree2conlltags(tree) for 
                    tree in chunk_data]
      
    return [[(t, c) for (w, t, c) in sent] 
            for sent in tagged_data]

In [None]:
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

In [None]:
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tag import UnigramTagger, BigramTagger
  

      
class TagChunker(ChunkParserI):
      
    def __init__(self, train_chunks, 
                 tagger_classes =[UnigramTagger, BigramTagger]):
          
        train_data = conll_tag_chunks(train_chunks)
        self.tagger = combined_tagger(train_data,tagger_classes)
          
    def parse(self, tagged_sent):
        if not tagged_sent: 
            return None
          
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        wtc = zip(words, chunks)
          
        return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

In [None]:
# loading libraries
from nltk.corpus import treebank_chunk
  
# data from treebank_chunk corpus
train_data = treebank_chunk.chunked_sents()[:3000]
test_data = treebank_chunk.chunked_sents()[3000:]
  
# Initailazing 
chunker = TagChunker(train_data)

## Trying out dependency parsing for 3.3

In [None]:
from nltk.parse.stanford import StanfordDependencyParser

In [None]:
import os
java_path = "C:\\Program Files\\Java\\jdk-14.0.2\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

In [None]:
# Path to CoreNLP jar unzipped
jar_path = 'C:\\Users\\Aratrika\\Desktop\\NTU COURSES_PROJECTS_HACKS\\NTU YEAR 3, SEM 1\\NLP\\Assignment 1\\stanford-corenlp-4.2.2\\stanford-corenlp-4.2.2\\stanford-corenlp-4.2.2.jar'

# Path to CoreNLP model jar
models_jar_path = 'C:\\Users\\Aratrika\\Desktop\\NTU COURSES_PROJECTS_HACKS\\NTU YEAR 3, SEM 1\\NLP\\Assignment 1\\stanford-corenlp-4.2.2-models-english.jar'

In [None]:
sentence='The movie was not too terrible'

In [None]:
# Initialize StanfordDependency Parser from the path
parser = StanfordDependencyParser(path_to_jar = jar_path, path_to_models_jar = models_jar_path)

# Parse the sentence
result = parser.raw_parse(sentence)
dependency = result.__next__()
#print(dependency)
dependency_tree=dependency.tree()
print(dependency_tree)
dependency_tree.pretty_print()

# print ("{:<15} | {:<10} | {:<10} | {:<15} | {:<10}".format('Head', 'Head POS','Relation','Dependent', 'Dependent POS'))
# print ("-" * 75)

## Trying out syntactic parsing for 3.3

In [None]:
# from nltk.parse.corenlp import CoreNLPServer

# STANFORD = os.path.join("models", "stanford-corenlp-full-2018-02-27")

# server = CoreNLPServer(
#     'C:\\Users\\Aratrika\\Desktop\\NTU COURSES_PROJECTS_HACKS\\NTU YEAR 3, SEM 1\\NLP\\Assignment 1\\stanford-corenlp-4.2.2\\stanford-corenlp-4.2.2\\stanford-corenlp-4.2.2.jar'
# ,'C:\\Users\\Aratrika\\Desktop\\NTU COURSES_PROJECTS_HACKS\\NTU YEAR 3, SEM 1\\NLP\\Assignment 1\\stanford-corenlp-4.2.2-models-english.jar',    
#  )

In [None]:
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
#use the above command inside the unzipped file, inorder to start the server


In [None]:
#for each business id, first get all the reviews, then run through syntactic parsing to create a list of all ADJP for each business id

In [18]:
#given a review sentence, returns all the adjective phrases
def find_adjective_phrases(sentence):
    def dfs(tree):
        adjp=tree.leaves()
        adjp=' '.join(word for word in adjp)
        return adjp
    adjective_phrases=[]
    parser = CoreNLPParser(url='http://localhost:9000')
    parse = next(parser.raw_parse(sentence))
    for adjective_phrase_tree in parse.subtrees(filter=lambda x: x.label() == 'ADJP'):
        #print(adjective_phrase_tree)
        adjective_phrase=dfs(adjective_phrase_tree)
        adjective_phrases.append(adjective_phrase)
    return adjective_phrases

In [19]:
sentences=['The movie was too terrible. I felt really bad!!','I had a really great time','Just had an amazing facial with Jane! She\'s knowledgable, excited about what she does, and let\'s you relax or talk as much as you want. I\'m a picky girl and I would pick her over the many facialists I\'ve been to all over the world. She doesn\'t push product but, listens enough to you needs to suggest smart additions to your lineup. Will definitely be back! Thanks Jane! See you in 4-6 weeks. Joye']
for i in sentences:
    adjp=find_adjective_phrases(i)
    print(adjp)

['too terrible', 'really bad']
['really great']
["facial with Jane ! She 's knowledgable , excited about what she does , and let 's you relax or talk as much as you want . I 'm a picky girl and I would pick her over the many facialists I 've been to all over the world . She does n't push product but , listens enough to you needs to suggest smart additions to your lineup . Will definitely be back !", 'knowledgable', "excited about what she does , and let 's you relax or talk as much as you want . I 'm a picky girl and I would pick her over the many facialists I 've been to all over the world . She does n't push product but , listens enough to you", 'much']


In [20]:
def adjective_phrases_per_business(business_id):
    ans=business_review_extracter(business_id)
    review_sentences=ans[1]
    #print(review_sentences[-1])
    business_adjective_phrases=[]
    for sentences in review_sentences:
        for sentence in sentences:
            sentence=sentence.lower()
            sentence = sentence.replace("%", " percent");
            print(sentence)
            adjp=find_adjective_phrases(sentence)
            business_adjective_phrases+=(adjp)

    print(business_adjective_phrases)
    return business_adjective_phrases

In [None]:
dataset = pd.read_csv('reviewSelected100.csv')
business_ids=dataset['business_id'].tolist()
business_id_set=set(business_ids)
for business_id in business_id_set:
    business_adjp=adjective_phrases_per_business(business_id)

In [47]:
def extract_adjective_phrases(dataset,business_b1_id):
    business_ids=dataset['business_id'].tolist()
    business_id_set=set(business_ids)
    business_adjps=[]
    business_b1=[]
    for business_id in business_id_set:
        business_adjp=adjective_phrases_per_business(business_id)
        if business_id==business_b1_id:
            business_b1=business_adjp
        else:
            business_adjps.append(business_adjp)
    return business_adjps,business_b1
    

In [None]:
#calling the extract_adjective_phrases function
dataset = pd.read_csv('reviewSelected100.csv')
business_b1_id= random_business_id(dataset)
business_adjps,business_b1=extract_adjective_phrases(dataset,business_b1_id)

this place is great.
the sangria is realllly tasty, and the tapas are wonderful.
it's located right across the street from the wynn, and at the end of the fashion show mall.
not a usual place for the typical vegas visitor, but it's not hard to find.
the service here was great as well.
would definitely come back here for some more tapas, a switch from the usual buffets and casino food courts :)
the food is excellent, but the plates are incredible small!
it was a challenge to eat off of a plate that was a quarter of the size of the utensils, but when good food is around i guess you can do anything.
i have been to both the chicago location and now the vegas location and both have excellent food and sangria!
i can't remember everything we ordered (uhh, we were in vegas!
), but i know i enjoyed the stuffed mushrooms and the beef tenderloin and blue cheese.
i also know i was very excited about the $1.99 dessert menu (the banana dessert and the chocolate truffle were good!).
if you like tapas

ba ba reeba is more expensive than other tapas places in town, due to its location, but i always seem to eat vegetarian here and so its a little less expensive.
if and when it is under a 100 degrees outside,  i like the patio because of the pretty view to wynn, but inside is really pretty also.
part of the lettuce entertain you group of restaurants, cafe ba ba reeba started out in chicago, then opened a branch in las vegas.
it specializes in tapas.
originally intended as bar snacks, the term now refers to small portions of food (at least in the united states).
choose a few and you have a meal; choose a bunch and you have a meal for a group of friends.
in addition to tapas, the menu offers paella and a few other items, but tapas are the stars.
there are more than 50 choices, plus another dozen desserts.
cold tapas include serrano ham, olives and chicken salad; hot tapas include octopus, empanadas and roast dates with bacon.
the inside dining areas are dark: dark stained wood flooring, d

i've only had tapas a few times and i'm not completely drawn to it because it sounds like i'll order a bunch of things and still not get full.
wrong.
well we ordered a lot because we were afraid of not getting full.
walked over here sunday afternoon and there was no wait.
awesome.
we got bread and some olive oil right away.
the olive oil was kinda bitter but i don't think its my thing.
this is what we ordered: 
- mahi mahi tapa (which was the special of the day)  very juicy with yummy veggies with it
- steamed mussels (my bf said it was good) 
- beef skewers (i like this one because of the carmelized onions and the spicy horseradish) 
- salmon with veggies
- seafood paella (soo yummy but remember it takes about 30 minutes to order) 
- black raspberry sangria half pitcher 16 bucks (this was my favorite part because of the fresh fruit!
you have to order the sangria!!
i really enjoyed the food, had lots of great decorations in the restaurant, and played spanish themed music in the backgro

In [None]:
business_adjp=adjective_phrases_per_business('e-YnECeZNt8ngm0tu4X9mQ')
print(business_adjp)

[['great', 'realllly tasty', 'wonderful', 'hard to find', 'great', 'usual', 'excellent', 'incredible small', 'very excited about the $ 1.99 dessert menu -LRB- the banana dessert and the chocolate truffle were good ! -RRB-', 'very excited', 'good', 'reasonably priced', 'weary of the high end expense account restaurants', 'fun and flavorful', 'good enough to keep us coming back', 'better than to fill up again with a cold pitcher of black raspberry sangria and small plate - size tapas to re-energize yourself for another round at the pool or craps table', 'better', 'quite extensive', 'cold and hot', 'good', 'excited about', 'pretty disappointed', 'as good', 'pretty dead', 'excellent', 'good', 'full', 'very good', 'wonderful', 'decent', 'a little boring', 'hot and comforting', 'baked in tomato', 'delicous', 'little pesto - topped', "so flavorful i 'm hooked", 'hooked', 'divine', 'down sweet and smooth', 'great happy', 'sad to see it go .', 'more focused on paella', 'more focused', 'quite go

In [None]:
#store adjective phrases of all businesses except the business_b1
business_adjps=[['right in front of u', 'amazing', 'sure to try a steak', 'worth', 'a little hesitant due to the pricing', 'Very nice', 'a nice extra but honestly completely unnecessary', 'a nice extra', 'completely unnecessary', 'very small and close to the restaurant', 'very small and close', 'good', 'very good', 'impeccable to everyone that walks through the door', 'hard to find in the latter half of my meal', 'polite', 'honest the keg does these dishes much much better and for less', 'much much better', 'less', 'good', 'nice and very garlicky', 'nice', 'very garlicky', 'excellent', 'delicious', 'a bit dry', 'so many', 'really good', 'worth an experience', 'very nice', 'justified', 'very nice', 'very nice', 'consistently good', 'really good', 'really good', 'awesome', 'really good in general', 'general', 'crisp', 'unusual', 'very great to munch on', 'bland', 'way more flavourful', 'very average', 'fully coated', 'more special', 'very good', 'medium rare just how i asked and the meat is soft , definitely not chewy', 'medium rare', 'soft', 'professional but not warm and welcoming', 'professional', 'warm and welcoming', 'able', 'really empathetic', 'really big for both of us', 'cheap', 'thankfully - because the place', 'thankfully'],
               ['really good','really good','superb food','too good','really brilliant'],
            ]

#store adjective phrases of business b1 for which we want to find indicative adjective phrases
business_b1=['good', 'bland', 'really flavourable', 'Even alone .', 'attentive , nice and considerate', 'way bigger', 'higher', 'better', 'interested', 'polite', 'so disappointed', 'a bit ridiculous that if one of us wanted to order the veggies , we all had to order it as well', 'subpar', 'dry and wilted', 'funny', 'overly seasoned', 'pretty bad', 'White', 'spicy', 'spicy and normal', 'too bad', 'very attentive', 'quick and courteous', 'Really bad', 'cold', 'Terrible', 'confusing', 'pretty pleased', 'ready', 'good', 'personable at all', 'super long and light', 'Unfortunate', 'great', ', eerily similar to that of the bowls they typically', 'Big', 'huge', 'quite good', 'bad enough to then be followed by finding a piece of ceramic in my food', 'well enough', 'worth', 'very small', 'quick', 'good', 'hard to come by', 'good', 'good', 'very poor', 'not sure what the lower level is for', 'super crowded', 'warm', 'surprised at how empty the store was for a Friday night', 'nice', 'decent', 'tiny', 'nice to have', 'too many', 'okay', 'as much', 'too bad', 'pretty prompt about it', 'decent for what you get', "too bad they do n't include dessert like their competitor", "more than I should 've paid until after I left", 'more', 'pretty stuffy and hot in there - bad ventilation - so', 'pretty stuffy and hot', 'there - bad', 'worst Korean', 'able to cook any of our food which just arrived', 'aged', 'Alot cheaper than chako .', 'Alot cheaper', 'bad', 'very disappointed since I was hoping to get korean food and got', 'very disappointed', 'pretty good and very well - priced -LRB- $ 9.99 for an all you can eat lunch -RRB-', 'pretty good', 'very well - priced', 'pretty unprofessional', 'definitely good for the price', 'available on weekends', 'worth', 'really Korean', 'serious', 'much larger', 'quite good', 'fresh , and very fatty', 'fresh', 'very fatty', 'so much', 'authentic', 'slow', 'small', 'slow and not that good', 'slow', 'not that good', 'pleasant for the most part from all other helpers', 'amazing', 'weak', 'worth', 'overbearing', 'able to upgrade this']
business_b1_dict=dict(Counter(business_b1))
business_adjps_list=[]
for x in business_adjps:
    business_adjps_list+=x
business_adjps_dict=dict(Counter(business_adjps_list))


for key in business_b1_dict:
    val1=business_b1_dict[key]
    if key in business_adjps_dict.keys():
        val2=business_adjps_dict[key]
    else:
        val2=0
    if key=='really good':
        print(val1, val2)
    business_b1_dict[key]=val1-val2
    
print(Counter(business_b1_dict).most_common(3))

In [None]:
business_b1_counter=Counter(business_b1)
business_b1_dict=dict(business_b1_counter)
business_b1_len=sum(business_b1_counter.values())

business_adjps_list=[]
for x in business_adjps:
    business_adjps_list+=x
business_adjps_counter=Counter(business_adjps_list)
business_adjps_len=sum(business_adjps_counter.values())
business_adjps_dict=dict(business_adjps_counter)

adjp_phrase=[]
freq_in_b1=[]
freq_overall=[]
for key in business_b1_dict.keys():
    adjp_phrase.append(key)
    freq_in_b1.append(business_b1_dict[key]/business_b1_len)
    if key in business_adjps_dict.keys():
        freq_overall.append(business_adjps_dict[key]/business_adjps_len)
    else:
        freq_overall.append(0)

relative_entropies=sc.special.rel_entr(freq_in_b1,freq_overall)
print(relative_entropies)
indicative_adjective_phrases = [x for _,x in sorted(zip(relative_entropies,adjp_phrase),reverse=True)]
print(indicative_adjective_phrases)
