In [45]:
# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sb
import collections
import nltk
import wordcloud
import matplotlib.pyplot as plt # we only need pyplot
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from textblob import TextBlob
from statistics import mean
from nltk.probability import FreqDist
import random
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


In [46]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [47]:
data = pd.read_csv("/Users/abhishekvaidyanathan/Desktop/NLP-project1/reviewSelected100.csv")

In [48]:
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


In [49]:
data['business_id'].nunique()

153

In [50]:
def remove_stopwords(tokenized_sentence):
    stop_words = set(stopwords.words('english'))
    stop_words = list(stop_words)
    filtered_sentence = []
    for w in tokenized_sentence:
        if w.lower() not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

In [51]:
def tokenisation(sentence):
    word_tokens_with_stop = word_tokenize(sentence)
    word_tokens_with_stop = [word for word in word_tokens_with_stop if word. isalpha()]

    # word_tokens_with_stop = [word for word in word_tokens_with_stop if word.lower()!="i"]
    word_tokens=remove_stopwords(word_tokens_with_stop)
    return word_tokens

In [52]:
# wordnet lemmatizer
def wordnet_lemmatizer(sentence):
    tokenised_sentence=tokenisation(sentence)
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [lemmatizer.lemmatize(w) for w in tokenised_sentence]
    return(lemmatized_sentence)

In [53]:
# porter stemming
def Porter_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    porter = PorterStemmer()
    Porter_stemming_sentence = [porter.stem(w) for w in tokenised_sentence]
    return(Porter_stemming_sentence)

In [54]:
# lancaster stemming
def Lancaster_stemming(sentence):
    tokenised_sentence=tokenisation(sentence)
    lancaster=LancasterStemmer()
    Lancaster_stemming_sentence = [lancaster.stem(w) for w in tokenised_sentence]
    return(Lancaster_stemming_sentence)

In [55]:
# random business id
def random_business_id(df):
    n = random.randint(0,len(df.business_id)) 
    return df.business_id[n]

In [56]:
def business_review_extracter(dataset,business_id_to_check):
    reviews_text=[]
    reviews_sentences=[]
    reviews_tokens=[]
    for i in range(0,len(dataset.business_id)):
        if dataset.business_id[i]==business_id_to_check:
            reviews_text.append(dataset.text[i])
    for i in range(len(reviews_text)):
        reviews_sentences.append(nltk.tokenize.sent_tokenize(reviews_text[i]))
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            reviews_tokens.append(tokenisation(reviews_sentences[i][j]))
    return reviews_text,reviews_sentences,reviews_tokens
#display word frequency distribution here

In [57]:
def business_lancaster(reviews_sentences):
    lancaster_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            lancaster_stemmed.append(Lancaster_stemming(reviews_sentences[i][j]))
    return lancaster_stemmed

In [58]:
def business_poter(reviews_sentences):
    Porter_stemmed=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            Porter_stemmed.append(Porter_stemming(reviews_sentences[i][j]))
    return Porter_stemmed

In [59]:
def business_wordnet(reviews_sentences):
    wordnet_lemmatized=[]
    for i in range(len(reviews_sentences)):
        for j in range(len(reviews_sentences[i])):
            wordnet_lemmatized.append(wordnet_lemmatizer(reviews_sentences[i][j]))
    return wordnet_lemmatized

In [60]:
def nltk_pos_tagging(sentence):
    tokenised_sentence=tokenisation(sentence)
    Pos_Tag_Sentence=nltk.pos_tag(tokenised_sentence)
    return Pos_Tag_Sentence

In [61]:
def sentence_tokenizer(sentence):
    tokenised_sentence = []
    tokenised_sentence = sent_tokenize(sentence)
    return tokenised_sentence

In [62]:
def tokenize_sentences(sentence_array):
    sentence_tokens = []
    for sentences in sentence_array:
        tokenize_sent = nltk_pos_tagging(sentences)
        sentence_tokens.append(tokenize_sent)
    return sentence_tokens

In [63]:
def get_text_sentiment(lyrics):
  analysis = TextBlob(lyrics)
  return analysis.sentiment.polarity

In [64]:
def get_sentence_sentiment(sentence_array):
    sentences_sentiment = []
    for sentences in sentence_array:
        sentiment = get_text_sentiment(sentences)
        sentences_sentiment.append(sentiment)
    return sentences_sentiment

In [65]:
def count_noun_adjective_pairs(sentence_array,sentiment_array):
    count_array = []
    i = 0
    k = 0
    for pos_tags_array in sentence_array:
        pos_tags_count_array = []
        for pos_tags in range(len(pos_tags_array)):
            if (((pos_tags_array[pos_tags][1]) == "NN") or ((pos_tags_array[pos_tags][1]) == "NNS") or ((pos_tags_array[pos_tags][1]) == "NNP") or ((pos_tags_array[pos_tags][1]) == "NNPS")):
                i = pos_tags+1
                j = pos_tags-1
                while i <len(pos_tags_array) and j>=0:
                    if(pos_tags_array[j][1]=="JJ"):
                        adjective = wordnet_lemmatizer(pos_tags_array[j][0])[0].lower()
                        noun = wordnet_lemmatizer(pos_tags_array[pos_tags][0])[0].lower()
                        pos_tags_count_array.append([noun,adjective])
                        break
                    elif(pos_tags_array[i][1]=="JJ"):
                        adjective = wordnet_lemmatizer(pos_tags_array[i][0])[0].lower()
                        noun = wordnet_lemmatizer(pos_tags_array[pos_tags][0])[0].lower()
                        pos_tags_count_array.append([noun,adjective])
                        break
                    if(i<len(pos_tags_array)):
                        i = i+1
                    if(j-1>=0):
                        j = j-1
        count_array.append([pos_tags_count_array,len(pos_tags_count_array),sentiment_array[k]])
        k = k+1
        i = i+1

    return count_array

In [66]:
def extract_all_noun_adjective_pairs(noun_adjective_pairs,all_noun_adjective_pairs,all_noun_adjective_sentiments):
    for all_pairs in noun_adjective_pairs:
        if (all_pairs[0]!=[]):
            all_noun_adjective_pairs.extend(all_pairs[0])
            for i in range(all_pairs[1]):
                all_noun_adjective_sentiments.append(all_pairs[2])

In [67]:
def convert_list_to_tuple(all_noun_adjective_pairs):
    for i in range(len(all_noun_adjective_pairs)):
        all_noun_adjective_pairs[i] = tuple(all_noun_adjective_pairs[i])

    return all_noun_adjective_pairs

In [68]:
def get_most_common_word_pair(all_noun_adjective_pairs_tuple,top_k,indexes):
    counter=collections.Counter(all_noun_adjective_pairs_tuple)
    dict1 = dict(counter)
    final_list = counter.most_common(top_k)
    for i in range(len(all_noun_adjective_pairs_tuple)):
        for j in final_list:
            if(all_noun_adjective_pairs_tuple[i]==j[0]):
                indexes.append(i)
    return final_list

In [69]:
def get_count_for_specific_word_pairs(all_noun_adjective_pairs_tuple,key_value):
    counter=collections.Counter(all_noun_adjective_pairs_tuple)
    dict1 = dict(counter)
    return dict1[key_value]

In [70]:
def get_count_and_common_word_pairs(tuple1,tuple2):
    list1_as_set = set(tuple1)
    intersection = list1_as_set.intersection(tuple2)
    intersection_as_list = list(intersection)
    for common_elements in intersection_as_list:
        count1 = get_count_for_specific_word_pairs(tuple1,common_elements)
        count2 = get_count_for_specific_word_pairs(tuple2,common_elements)
        print("The common element is ",common_elements,". The count from from each of the tuples are: ",str([count1,count2]),".")


In [71]:
def get_top_frequent_pairs_words(data_rating,top_k):
    all_noun_adjective_pairs_rating = []
    all_noun_adjective_sentiments = []
    dummy_sentiments = []
    mean_sentiments = []
    indexes = []
    data_rating.apply(lambda row: extract_all_noun_adjective_pairs(row['noun_adjective_pairs'],all_noun_adjective_pairs_rating,all_noun_adjective_sentiments),axis=1)
    all_noun_adjective_pairs__rating_tuple = convert_list_to_tuple(all_noun_adjective_pairs_rating)
    top_frequent_pairs_rating = [get_most_common_word_pair(all_noun_adjective_pairs__rating_tuple,top_k,indexes)]
    # top_frequent_pairs_rating.append(indexes)
    i = 0
    for index in indexes:
        dummy_sentiments.append(all_noun_adjective_sentiments[index])

    for pairs in top_frequent_pairs_rating[0]:
        mean_sentiments.append(mean(dummy_sentiments[i:i+pairs[1]]))
        i = i+pairs[1]
    # top_frequent_pairs_rating.append(all_noun_adjective_sentiments)
    top_frequent_pairs_rating.append(mean_sentiments)

    return (top_frequent_pairs_rating)

In [72]:
def barplot_for_the_frequent_word_pairs(data,top_k):
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    frequency = []
    word_pairs = []
    for i in get_top_frequent_pairs_words(data,top_k)[0]:
        frequency.append(int(i[1]))
        word_pairs.append(str(i[0]))
    ax.bar(tuple(word_pairs),tuple(frequency))
    ax.set_ylabel('frequency')
    ax.set_xlabel('Word Pairs')
    ax.set_title('frequence of top '+str(top_k)+' word pairs')
    plt.xticks(rotation='vertical')
    plt.show()

In [200]:
def get_ner_tags(sentence_array):
    ner_tags = []
    # print(sentence_array['Sentence'])
    for sentence in sentence_array:
        # print(sentence)
        ner  = nlp(sentence)
        ner_copy = ner.copy()
        # ner_array = []
        i = 1
        while i < len(ner_copy):
            if(ner[i]['word'][:2]=="##" and ner[i-1]['entity']==ner[i]['entity']):
                ner_copy[i-1]['word'] = ner[i-1]['word'] + ner[i]['word'][2:]
                ner_copy[i-1]['score'] = (ner[i-1]['score'] + ner[i]['score'])/2
                ner_copy[i-1]['entity'] = ner[i-1]['entity']
                ner_copy[i-1]['index'] = ner[i-1]['index']
                ner_copy[i-1]['start'] = ner[i-1]['start']
                ner_copy[i-1]['end'] = ner[i]['end']
                ner_copy.remove(ner_copy[i])
                continue
            i = i+1
        ner_tags.append(ner_copy)
    return ner_tags


In [73]:
data_rating_1 = data[data['stars']==1]
data_random_new = data_rating_1.groupby('business_id').apply(lambda x: x.sample(1)).reset_index(drop=True)
data_random_50_rating_1 = data_random_new.sample(50)
data_random_50_rating_1.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
124,vY6UNbjwOT1eo7xc1ZgF2g,9GJ6XOBFBcokyG4GnVB4AQ,sj9osyqLyOy7b_kDZb1txA,1,0,0,0,Had high hopes after reading online reviews th...,2015-03-20 17:47:13
59,ny5uHRwJhVMc69aCT2y5wA,vRl2e5TmB3tSwPtfyMYnuw,RyaCGkXRXxXNeJhbnioz1Q,1,0,1,0,"Hope you like dregs, because that's all you'll...",2015-12-22 20:17:27
66,jcbHPpwJLP4uADiRLKOidA,JUbShoeYLmk76Q1n9yv9wQ,WA7sC64kCRstywm2EgZXEw,1,0,0,0,I had heard really good things about this rest...,2018-09-02 17:49:51
68,3GawFvNuqR1gIKYdSbsJiQ,URb0hVQv5jMuktO9odV83A,XA_m9daZl2VFDA6alnkBvg,1,0,0,0,Not impressed. I sampled the Tuna and it was p...,2017-08-18 00:43:46
130,cw_Z8y5J8ACeBX0kNQH4Rw,KgLN_fu-baMQkVCodvp9xw,vMpJzMFst_9GP4boeqWIRg,1,0,0,0,found hair in my sweet and sour chicken. enoug...,2016-06-15 20:46:07


In [123]:
def get_final_noun_adjective_pairs(data):
    data = data.copy()
    print("processing pos tags")
    data['pos_tags'] = data.apply(lambda row: nltk_pos_tagging(row['text']),axis=1)
    # print("processing text sentiments")
    # data['text_sentiment'] = data.apply(lambda row: get_text_sentiment(row['text']), axis=1)
    # print("processing sentence tokenizer")
    # data['sentence tokenizer'] = data.apply(lambda row: sentence_tokenizer(row['text']),axis=1)
    print("processing text sentiment")
    data['text_sentiment'] = data.apply(lambda row: get_text_sentiment(row['text']), axis=1)
    print("processing sentence tokenizer")
    data['sentence tokenizer'] = data.apply(lambda row: sentence_tokenizer(row['text']),axis=1)
    print("processing sentence pos tags")
    data["sentence_tokens_pos_tags"] = data.apply(lambda row: tokenize_sentences(row['sentence tokenizer']),axis=1)
    print("processing sentencs sentence")
    data['sentences_sentiment'] = data.apply(lambda row: get_sentence_sentiment(row['sentence tokenizer']),axis=1)
    print("processing noun adjective pairs")
    data["noun_adjective_pairs"] = data.apply(lambda row: count_noun_adjective_pairs(row['sentence_tokens_pos_tags'],row['sentences_sentiment']),axis=1)
    # print("processing bert ner tags")
    # data['bert_ner'] = data.apply(lambda row: get_ner_tags(row['sentence tokenizer']),axis=1)
    print('processing completed')
    # top_frequent = get_top_frequent_pairs_words(data,3)
    # print("processing done")

    return data



In [124]:
data_noun_adjective_pairs = get_final_noun_adjective_pairs(data)

processing pos tags
processing text sentiment
processing sentence tokenizer
processing sentence pos tags
processing sentencs sentence
processing noun adjective pairs
processing completed


In [125]:
top_frequent = get_top_frequent_pairs_words(data_noun_adjective_pairs,10)

In [126]:
top_frequent

[[(('time', 'first'), 615),
  (('food', 'good'), 470),
  (('place', 'great'), 380),
  (('food', 'great'), 302),
  (('service', 'great'), 274),
  (('service', 'good'), 257),
  (('place', 'good'), 255),
  (('time', 'next'), 247),
  (('time', 'last'), 231),
  (('hour', 'happy'), 172)],
 [0.368934837396971,
  0.38311848279773814,
  0.4053332217516193,
  0.41749379579914114,
  0.4189002272607428,
  0.39679488846051053,
  0.40746215915788747,
  0.3629659766851595,
  0.35484455419520355,
  0.3900579170809004]]

In [94]:
ner  = nlp(data.iloc[0]['text'].title())
if len(ner) != 0:
    print(ner)

[{'word': ',', 'score': 0.7667328715324402, 'entity': 'I-ORG', 'index': 27, 'start': 90, 'end': 91}, {'word': 'Music', 'score': 0.8218851685523987, 'entity': 'I-ORG', 'index': 28, 'start': 92, 'end': 97}, {'word': 'And', 'score': 0.9378843903541565, 'entity': 'I-ORG', 'index': 29, 'start': 98, 'end': 101}, {'word': 'Wait', 'score': 0.9378998875617981, 'entity': 'I-ORG', 'index': 30, 'start': 102, 'end': 106}, {'word': '##ers', 'score': 0.6082433462142944, 'entity': 'I-ORG', 'index': 31, 'start': 106, 'end': 109}, {'word': 'Lyle', 'score': 0.6591613292694092, 'entity': 'B-PER', 'index': 38, 'start': 131, 'end': 135}]


In [115]:
ner[0].keys()

dict_keys(['word', 'score', 'entity', 'index', 'start', 'end'])

In [107]:
ner_word = nlp('Shale Williams'.title())

In [108]:
ner_word

[{'word': 'S',
  'score': 0.9993637204170227,
  'entity': 'B-PER',
  'index': 1,
  'start': 0,
  'end': 1},
 {'word': '##hale',
  'score': 0.99187833070755,
  'entity': 'B-PER',
  'index': 2,
  'start': 1,
  'end': 5},
 {'word': 'Williams',
  'score': 0.997721791267395,
  'entity': 'I-PER',
  'index': 3,
  'start': 6,
  'end': 14}]

In [110]:
example = "My name is Wolfgang Shale Williams and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

[{'word': 'Wolfgang', 'score': 0.9994356632232666, 'entity': 'B-PER', 'index': 4, 'start': 11, 'end': 19}, {'word': 'S', 'score': 0.9989044666290283, 'entity': 'I-PER', 'index': 5, 'start': 20, 'end': 21}, {'word': '##hale', 'score': 0.9992238879203796, 'entity': 'I-PER', 'index': 6, 'start': 21, 'end': 25}, {'word': 'Williams', 'score': 0.9986526966094971, 'entity': 'I-PER', 'index': 7, 'start': 26, 'end': 34}, {'word': 'Berlin', 'score': 0.9995962977409363, 'entity': 'B-LOC', 'index': 12, 'start': 49, 'end': 55}]


In [None]:
data_noun_adjective_pairs['bert_ner'] = data_noun_adjective_pairs.apply(lambda row: get_ner_tags(row['sentence tokenizer']),axis=1)

In [154]:
data.iloc[0]

review_id                                 8aoJJdKEO3ypoZNszpPu7Q
user_id                                   bGgAL09pxLnV_FFgR4ZADg
business_id                               ZBE-H_aUlicix_9vUGQPIQ
stars                                                          5
useful                                                         0
funny                                                          0
cool                                                           0
text           We had my Mother's Birthday Party here on 10/2...
date                                         2016-11-09 20:07:25
Name: 0, dtype: object

In [169]:
for row in range(0,10):
    print(row)
    print(get_ner_tags(data_noun_adjective_pairs.iloc[row]['sentence tokenizer']))

0
[[{'word': 'Birthday', 'score': 0.8212296366691589, 'entity': 'I-MISC', 'index': 7, 'start': 19, 'end': 27}, {'word': 'Party', 'score': 0.6545494198799133, 'entity': 'I-MISC', 'index': 8, 'start': 28, 'end': 33}], [], [], [{'word': 'Lyle', 'score': 0.8679577708244324, 'entity': 'B-PER', 'index': 2, 'start': 7, 'end': 11}], []]
1
[[{'word': 'Korean', 'score': 0.9950916767120361, 'entity': 'B-MISC', 'index': 2, 'start': 5, 'end': 11}, {'word': 'Eaton', 'score': 0.995835542678833, 'entity': 'B-LOC', 'index': 6, 'start': 23, 'end': 28}, {'word': 'Centre', 'score': 0.9984296560287476, 'entity': 'I-LOC', 'index': 7, 'start': 29, 'end': 35}], [], [], [], [], [], [], [], [], [], [{'word': 'B', 'score': 0.554015576839447, 'entity': 'B-MISC', 'index': 8, 'start': 25, 'end': 26}]]
2
[[], [], [], [], [], [], [], [], [], [], [], [], [{'word': 'Kealiali', 'score': 0.6973328590393066, 'entity': 'B-PER', 'index': 10, 'start': 37, 'end': 42}], [{'word': 'Ke', 'score': 0.9983730912208557, 'entity': 'B

In [170]:
data_noun_adjective_pairs.to_csv('/Users/abhishekvaidyanathan/Desktop/NLP-Assignment1/fullDataNounAdjectivePairs.csv',index=False)

In [176]:
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

In [181]:
combined_reviews = data.groupby(['business_id'], as_index = False).agg({'text': '. '.join})
combined_reviews.text = combined_reviews.text.apply(lambda x: x.replace('\n', ''))
combined_reviews.text = combined_reviews.text.apply(lambda x: x.replace('\r', ''))

In [182]:
combined_reviews.head()

Unnamed: 0,business_id,text
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. But I never would have kno...
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...


In [196]:
def summarizer_lsa(text):
    parser=PlaintextParser.from_string(text,Tokenizer('english'))
    lsa_summarizer=LsaSummarizer()
    lsa_summary = lsa_summarizer(parser.document,100)
    summary = []
    for i in lsa_summary:
        summary.append(str(i))
    return summary

In [197]:
combined_reviews['summary'] = combined_reviews.apply(lambda row: summarizer_lsa(row['text']),axis=1)

In [198]:
combined_reviews.head()

Unnamed: 0,business_id,text,summary
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win...","[Had to get my wing fix, I like dry rubs on wi..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. But I never would have kno...,"[One time, we arrived around 5 on a weekday ho..."
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...,[However this location is the dirtiest I have ...
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,...","[Also, a killer name for someone who works in ..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...,[Different vibe all together and I found the m...


In [201]:
combined_reviews['bert_ner'] = combined_reviews.apply(lambda row: get_ner_tags(row['summary']),axis=1)

In [203]:
combined_reviews.head()

Unnamed: 0,business_id,text,summary,bert_ner
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win...","[Had to get my wing fix, I like dry rubs on wi...","[[], [], [], [], [], [], [], [], [{'word': 'Ap..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. But I never would have kno...,"[One time, we arrived around 5 on a weekday ho...","[[], [], [{'word': 'Was', 'score': 0.471216082..."
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...,[However this location is the dirtiest I have ...,"[[], [], [], [{'word': 'L', 'score': 0.5113500..."
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,...","[Also, a killer name for someone who works in ...","[[], [], [], [], [], [{'word': 'DD', 'score': ..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...,[Different vibe all together and I found the m...,"[[{'word': 'Val', 'score': 0.7798781991004944,..."


In [204]:
combined_reviews.to_csv('/Users/abhishekvaidyanathan/Desktop/NLP-Assignment1/combined_reviews_bert_ner.csv',index=False)

In [207]:
for i in range(combined_reviews.shape[0]):
    ner_tags = combined_reviews.iloc[i]['bert_ner']
    while [] in ner_tags:
        ner_tags.remove([])
    combined_reviews.at[row,'bert_ner']= ner_tags


In [209]:
combined_reviews.head()

Unnamed: 0,business_id,text,summary,bert_ner
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win...","[Had to get my wing fix, I like dry rubs on wi...","[[{'word': 'Apple', 'score': 0.651425600051879..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. But I never would have kno...,"[One time, we arrived around 5 on a weekday ho...","[[{'word': 'Was', 'score': 0.471216082572937, ..."
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...,[However this location is the dirtiest I have ...,"[[{'word': 'L', 'score': 0.5113500952720642, '..."
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,...","[Also, a killer name for someone who works in ...","[[{'word': 'DD', 'score': 0.529673159122467, '..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...,[Different vibe all together and I found the m...,"[[{'word': 'Val', 'score': 0.7798781991004944,..."


In [237]:
business_id = {}
for i in range(combined_reviews.shape[0]):
    word = []
    for j in range(len(combined_reviews.iloc[i]['bert_ner'])):
        for k in range(len(combined_reviews.iloc[i]['bert_ner'][j])):
            word.append(combined_reviews.iloc[i]['bert_ner'][j][k]['word'])
    
    business_id[combined_reviews.iloc[i]['business_id']] = word

In [239]:
for business in business_id:
    j = 0
    while j < len(business_id[business]):
        if(j>0 and business_id[business][j][:2] == '##'):
            business_id[business][j-1] = business_id[business][j-1] + business_id[business][j][2:]
            business_id[business].remove(business_id[business][j])
            continue
        j = j+1

In [241]:
business_review_id = {}
for business in business_id:
    word_dict = {}
    for word in business_id[business]:
        review_id = []
        for i in range(data.shape[0]):
            if(word in data.iloc[i]['text'].split()):
                review_id.append(data.iloc[i]['review_id'])
        word_dict[word] = review_id
    business_id[business] = word_dict


KeyboardInterrupt: 

In [242]:
business_review_id

{}

In [244]:
import json
  
with open('/Users/abhishekvaidyanathan/Desktop/NLP-Assignment1/business_review_id.txt', 'w') as convert_file:
     convert_file.write(json.dumps(business_id))