In [1]:
import pandas as pd
import spacy
import time
import nltk
from nltk import word_tokenize, pos_tag, pos_tag_sents

In [None]:
# Fetch the NLTK resources the later cells rely on: 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag / pos_tag_sents (used in the tagger cell).
# NOTE(review): recent NLTK releases ship these as 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline into `nlp`.
# Note: `nlp` is rebound to 'en_core_web_sm' further down the notebook.
nlp = spacy.load('en_core_web_lg')

In [23]:
# Run the loaded pipeline over a short demo sentence (the `u` prefix is redundant on Python 3).
doc = nlp("Jimmy went to Washington DC with Berlin Air")

In [16]:
# List every entity found in `doc`: its text, its label and spaCy's explanation of the label.
for ent in doc.ents:
    print(
        f"Entity text: {ent.text:15}",
        f"Entity Label: {ent.label_:8}",
        f"Meaning: {spacy.explain(ent.label_)}\n",
        sep="\n",
    )

Entity text: Jimmy          
Entity Label: PERSON  
Meaning: People, including fictional

Entity text: Washington DC  
Entity Label: GPE     
Meaning: Countries, cities, states

Entity text: Berlin Air     
Entity Label: ORG     
Meaning: Companies, agencies, institutions, etc.



In [18]:
from spacy import displacy

In [24]:
# Render the entities of `doc` inline in the notebook.
# NOTE(review): 'distance' looks like a dependency-visualizer option — confirm it has
# any effect with style="ent".
displacy.render(
    doc,
    style="ent",
    jupyter=True,
    options={'distance': 100},
)

In [None]:
# Work on a private copy: STOP_WORDS is a module-level set owned by spaCy, and the
# following cells add to / remove from `stopwords` in place. Copying keeps the library's
# set untouched and makes re-running this cell restore the pristine list.
stopwords = set(spacy.lang.en.stop_words.STOP_WORDS)

In [None]:
# Display the stop-word set as loaded, before any customisation.
stopwords

In [None]:
# Take these words out of the stop list so they survive filtering in clean_doc
# (presumably because negations / degree / frequency words matter for review sentiment).
# difference_update() is used instead of repeated set.remove() calls: remove() raises
# KeyError when an entry is absent, which happens on a second run of this cell or with
# a spaCy version whose stop list lacks one of the entries.
stopwords.difference_update({
    'no',
    'against',
    'below',
    'cannot',
    'more',
    'much',
    'about',
    'above',
    'almost',
    'always',
    "n't",
    'never',
    'not',
    'n’t',
    'often',
    'because',
    'do',
    "will",
})


In [None]:
# Extra entries to filter out: pronouns / demonstratives plus bare newline tokens.
stopwords.update((
    'i', "i'm", "it", "my", "we", "he", "she", "this", "that",
    "\n", "\n\n",
))

In [None]:
# Display the stop-word set again, after the removals and additions above.
stopwords

In [None]:
# Load the review CSV, discard the 'Unnamed: 0' column and keep the first 100000 rows.
all_reviews = (
    pd.read_csv('processed_data/restaurant_az_reviews.csv')
    .drop(columns='Unnamed: 0')
    .head(100000)
)

In [None]:
# Load the restaurant CSV and discard the 'Unnamed: 0' column.
all_restaurants = (
    pd.read_csv('processed_data/restaurants_az.csv')
    .drop(columns='Unnamed: 0')
)

In [None]:
# (rows, columns) of the loaded review frame.
all_reviews.shape

In [None]:
# Display the per-star count table.
# NOTE(review): `AZ` is only assigned in the next cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError — it should run after that assignment.
AZ

In [None]:
# Count of rows (non-null business_id values) per star rating.
# groupby() moves 'review_stars' into the index, so the original
# rename(columns={'review_stars': 'Stars'}) never matched anything; the index has to be
# renamed with rename_axis(). The result is already a DataFrame with one row per
# distinct rating, so the extra pd.DataFrame(...) wrapper and .head(100000) were dropped.
# NOTE(review): the column is labelled 'Restaurants' but each counted row is a review.
AZ = (
    all_reviews
    .groupby('review_stars')
    .count()[['business_id']]
    .rename(columns={'business_id': 'Restaurants'})
    .rename_axis('Stars')
)

In [None]:
# Bar chart of the per-star counts held in AZ, coloured with the 'Paired' colormap.
AZ.plot.bar(colormap='Paired')

In [None]:
# One-column DataFrame holding just the review text.
reviews = all_reviews['text'].to_frame()

In [None]:
# Average review length in characters.
reviews['text'].map(len).mean()

In [None]:
# Rebind `reviews` to the bare 'text' Series (it was a one-column DataFrame until here);
# the timing cells below call reviews.apply(...) on this Series.
reviews = all_reviews['text']

In [None]:
# Rebind `nlp` to the 'en_core_web_sm' pipeline; when the notebook is run in order,
# every later cell that calls `nlp` uses this model instead of 'en_core_web_lg'.
nlp = spacy.load('en_core_web_sm')

In [None]:
# tokenizer (spaCy): time calling `nlp` on every review, one at a time via Series.apply
start = time.time()
A = reviews.apply(nlp)
end = time.time()
print(f'Duration:   {end - start:5}', A, sep='\n')

In [None]:
# Scratch arithmetic on a hard-coded figure.
# NOTE(review): 1751.955... looks like a pasted 'Duration' reading in seconds; the
# *100000 and /100000 cancel, leaving seconds / 60 / 4 — confirm what the /4 stands for.
print(1751.9552311897278*100000/60/100000/4)

In [None]:
# tokenizer (NLTK): time word_tokenize over every review
start = time.time()
X = reviews.apply(word_tokenize)
end = time.time()
print(f'Duration:   {end - start:5}', X, sep='\n')

In [None]:
# Scratch arithmetic on a hard-coded figure.
# NOTE(review): 71.005... looks like a pasted 'Duration' reading in seconds, doubled and
# converted to minutes — confirm what the *2 is meant to model.
print(71.00524020195007*2/60)

In [None]:
# tagger (NLTK): tokenize every review, then POS-tag all token lists in a single call
start = time.time()
X = pos_tag_sents(reviews.apply(word_tokenize).tolist(), lang='eng')
end = time.time()
print(f'Duration:   {end - start:5}', X, sep='\n')

In [None]:
# plot how many reviews we have of each star
# The original cell referenced names that are never defined in this notebook
# (Restaurant_reviews, plot, star_x, star_y, businesses_to_analyse) and read
# AZ.review_stars, which is not a column of AZ. It now works directly from all_reviews
# and uses the pandas plotting accessor already used for AZ.plot.bar.
star_counts = all_reviews['review_stars'].value_counts()

# Colours are keyed by rating rather than by position, so they stay correct whatever
# order value_counts() returns (the original list assumed the order 5, 4, 3, 1, 2).
star_colors = {5: 'darkgreen', 4: 'mediumseagreen', 3: 'gold', 2: 'orange', 1: 'crimson'}
bar_colors = [star_colors.get(int(star), 'grey') for star in star_counts.index]

ax = star_counts.plot.bar(color=bar_colors, width=.6, figsize=(8, 5))
ax.set_xlabel('Stars (Rating)')
ax.set_ylabel('Number of Reviews')
# The original title interpolated the undefined `businesses_to_analyse`; it is omitted here.
ax.set_title('Number of Reviews Per Rating');

In [None]:
def clean_doc(doc):
    """Build a cleaned, lower-cased document from the tokens of ``doc``.

    Tokens are dropped when their lower-cased text is in the notebook-level
    ``stopwords`` set or when their ``pos_`` is "PUNCT" or "SYM". The surviving
    tokens are lower-cased, joined with single spaces, and the contraction
    fragment "n't" is rewritten to "not".

    Both the ASCII form ("n't") and the typographic form ("n’t") are rewritten;
    the stop-word cell keeps both forms, but only the ASCII one used to be
    normalised here.

    Args:
        doc: iterable of tokens exposing ``.text`` and ``.pos_`` (e.g. a parsed
            spaCy Doc).

    Returns:
        Whatever ``nlp.make_doc`` returns for the cleaned text.
    """
    kept = []
    for tok in doc:
        lowered = tok.text.lower()  # lower-case once; used for both the filter and the output
        if lowered in stopwords or tok.pos_ in ("PUNCT", "SYM"):
            continue
        kept.append(lowered)

    text = ' '.join(kept)
    # Plain substring replacement, as before, now covering the curly-apostrophe variant too.
    for contraction in ("n't", "n’t"):
        text = text.replace(contraction, 'not')
    return nlp.make_doc(text)

In [None]:
# Parse and print (raw vs. cleaned) the reviews of the first restaurants as a spot check.
docs = []
for index, restaurant in all_restaurants.iterrows():
    print(f'Restaurant : {restaurant["name"]} \n')
    # Boolean mask instead of a string-concatenated query(): a business_id containing a
    # quote character would otherwise produce a broken query expression.
    restaurant_texts = all_reviews.loc[
        all_reviews['business_id'] == restaurant['business_id'], 'text'
    ]
    # n_threads is no longer passed: it is deprecated since spaCy 2.1 and not accepted
    # by spaCy 3's Language.pipe.
    for parsed_review in nlp.pipe(restaurant_texts, batch_size=1000):
        docs.append(parsed_review)
        print('\n-------------')
        print(parsed_review)
        print('\n-----')
        print(clean_doc(parsed_review))
    # Stop once a row with an index label above 0 has been processed
    # (the first two restaurants with the default integer index).
    if index > 0:
        break

In [None]:
# Parse every review with nlp.pipe in batches of 1000, reporting the time per 1000 docs.
docs = []
start = time.time()
startbatch = time.time()
print(f'Started at: {start:{20}}')
total = len(all_reviews)
# n_threads is no longer passed: it is deprecated since spaCy 2.1 and not accepted
# by spaCy 3's Language.pipe.
for parsed_doc in nlp.pipe(all_reviews['text'], batch_size=1000):
    docs.append(parsed_doc)
    if len(docs) % 1000 == 0:
        print(f'{len(docs)}/{total} processed in {time.time() - startbatch:>{5}} seconds')
        startbatch = time.time()  # restart the per-batch stopwatch
end   = time.time()
print(f'End at:     {end:{20}}')
print(f'Duration:   {end-start}')

In [None]:
# Time parsing the first 10000 reviews one by one with Series.apply.
start = time.time()
sample = all_reviews.head(10000)['text']
# `review` is not defined anywhere earlier in the notebook, so the original
# `review['Parsed'] = ...` raised NameError on a fresh kernel. Build the frame from the
# sample first; .copy() keeps the new column from touching all_reviews.
review = sample.to_frame().copy()
review['Parsed'] = sample.apply(nlp)
end   = time.time()
print(f'Duration:   {end-start:{5}}')

In [None]:
# Quick check of NLTK's word tokenizer on a short two-sentence string.
text = "God is Great! I won a lottery."
print(word_tokenize(text=text))