# Pulling Data from S3 and NLP'ing it 

In [1]:
import os
import re
import string
import sys
import time
import collections
# from awstools.awstools import s3
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')

# bucket = s3.Bucket('yelp-data-shared-labs18')

# print(bucket.contents)


# bucket.get('sample_data/review.parquet', 'sample_reviews.parquet')

reviews = pd.read_parquet('review.parquet')

print(reviews.head())


# Tokenizing function
def tokenize(s):
    """Convert string to lowercase and split into words (ignoring
    punctuation), returning list of words.
    """
    word_list = re.findall(r'\w+', s.lower())
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    return filtered_words


# Counting ngram function
def count_ngrams(lines, min_length=2, max_length=4):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()
        return ngrams


# for most frequent
def print_most_frequent(ngrams, num=10):
    """Print num most common n-grams of each length in n-grams dict."""
    for n in sorted(ngrams):
        print('----- {} most common {}-word phrase -----'.format(num, n))
        for gram, count in ngrams[n].most_common(num):
            print('{0}: {1}'.format(' '.join(gram), count))
        print('')


# for word cloud
def print_word_cloud(ngrams, num=5):
    """Print word cloud image plot """
    words = []
    for n in sorted(ngrams):
        for gram, count in ngrams[n].most_common(num):
            s = ' '.join(gram)
            words.append(s)

    cloud = WordCloud(width=1440, height=1080, max_words=200).generate(' '.join(words))
    plt.figure(figsize=(20, 15))
    plt.imshow(cloud)
    plt.axis('off');
    plt.show()
    print('')


def vectorize(t):
  """Vectorize the tokens"""
  # create the transformer TV, set length max_length=zzzz, spacy's vect defaults 
  TV = TfidfVectorizer()
  vect = CountVectorizer()

  # tokenize and build vocab
  return TV.fit_transform(t)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                      review_id  ...                date
609090   IfFo03nj-PQGTzTdxH8vWA  ... 2012-12-02 20:46:30
6460377  L46X3SlvO88xtlusGr1hFg  ... 2015-01-11 18:53:04
562100   EcxvP_LcXS2MGgxvNXwCwg  ... 2018-01-06 06:53:10
5339718  a1q77QOPmEJK75FgdfEjDA  ... 2015-05-12 15:25:27
2958710  gsPiT4oi6rdj9MZt8rfm7w  ... 2011-07-13 14:11:37

[5 rows x 9 columns]


In [2]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
609090,IfFo03nj-PQGTzTdxH8vWA,SxrGc9Qv8b_DWo6DZOeyng,FvVSy2r7_zDEhZWqLgjXNQ,3,0,2,0,The food was great no denying that. I'm trying...,2012-12-02 20:46:30
6460377,L46X3SlvO88xtlusGr1hFg,HOzgd-a-kAiVZappPn5yGA,ERfGoZcgrmF8p89FIs9_TA,5,2,0,1,"Delivered as promised, when promised!\n Excell...",2015-01-11 18:53:04
562100,EcxvP_LcXS2MGgxvNXwCwg,hmqaQ_60rCJi86ur1kYg8g,m65oi2C7b1CJM7DaXbFi-g,5,2,1,1,Came by again with a friend just to hang out. ...,2018-01-06 06:53:10
5339718,a1q77QOPmEJK75FgdfEjDA,qAeiX_1IOEjr56QjRjaBxA,RV__lDpb_Vo4SmSL2r5ezA,5,0,1,1,I called Real Fast Auto Glass to get the winds...,2015-05-12 15:25:27
2958710,gsPiT4oi6rdj9MZt8rfm7w,pFfZdrY_EbDlTxEwKbZl3A,YCoLBgh973QSp-OmnwMWSg,2,2,2,1,"Terrible shuttle service, pretty good service ...",2011-07-13 14:11:37


In [3]:
reviews['review_length'] = reviews['text'].apply(len)
reviews['token'] = reviews['text'].apply(tokenize)
reviews['token_vector'] = reviews['token'].apply(vectorize)
print("Tokenized words: ", reviews['token'].head(10))

ValueError: ignored

In [0]:
text = reviews['text']

In [0]:
bob = "Bob the builder is cool and he has many friends that accompany him."

In [46]:
tokenize(bob)

['bob', 'builder', 'cool', 'many', 'friends', 'accompany']

In [48]:
text.apply(tokenize)

609090     [food, great, denying, trying, cut, salt, inta...
6460377    [delivered, promised, promised, excellent, cus...
562100     [came, friend, hang, really, seem, like, fixed...
5339718    [called, real, fast, auto, glass, get, windshi...
2958710    [terrible, shuttle, service, pretty, good, ser...
                                 ...                        
3876331    [hearing, place, fiance, years, last, week, fi...
4899563    [coupon, 17, 99, decided, worth, try, told, 1,...
6383042    [place, killer, staff, awesome, menu, unreal, ...
2167329    [love, grounds, surround, franciscan, renewal,...
4647154    [phenomenal, mexican, restaurant, great, servi...
Name: text, Length: 26744, dtype: object

In [56]:
reviews['text'].apply(tokenize)

609090     [food, great, denying, trying, cut, salt, inta...
6460377    [delivered, promised, promised, excellent, cus...
562100     [came, friend, hang, really, seem, like, fixed...
5339718    [called, real, fast, auto, glass, get, windshi...
2958710    [terrible, shuttle, service, pretty, good, ser...
                                 ...                        
3876331    [hearing, place, fiance, years, last, week, fi...
4899563    [coupon, 17, 99, decided, worth, try, told, 1,...
6383042    [place, killer, staff, awesome, menu, unreal, ...
2167329    [love, grounds, surround, franciscan, renewal,...
4647154    [phenomenal, mexican, restaurant, great, servi...
Name: text, Length: 26744, dtype: object

In [40]:
vectorize(tokenize(bob))

<6x6 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [0]:
reviews['text'].apply(vectorize)

In [63]:
vectorize(reviews.text)

<26744x43351 sparse matrix of type '<class 'numpy.float64'>'
	with 1904766 stored elements in Compressed Sparse Row format>

Every row in review.tokens will be a sparse matrix 
