In [1]:
import string
from datetime import datetime
import os
import numpy as np
import pandas as pd

import nltk
# nltk.data.path.append("/")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from pyspark.ml.feature import CountVectorizer, IDF, PCA, Word2Vec, Tokenizer
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT, Vectors
from pyspark.ml.clustering import GaussianMixture, KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, FloatType, IntegerType, DoubleType, ArrayType, StringType
spark_home = os.environ.get('SPARK_HOME', None)

import plotly
# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel. When either one is missing, the credentials file is left untouched.
# NOTE(review): an API key used to be committed on this line; it should be
# treated as exposed and rotated.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
# print(plotly.__version__)

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import requests
requests.packages.urllib3.disable_warnings()

In [3]:
# Read the Yelp review dump as a DataFrame, then redistribute it over 300
# partitions before any further processing.
reviews_json = spark.read.json("/user/eric_ma/Yelp/review.json")
reviews = reviews_json.repartition(300)

# Print the number of review rows that were loaded.
print(reviews.count())

4153150


In [4]:
# Show the column names and types Spark inferred for the review records.
reviews.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [5]:
# Isolate one known review by exact text match; it serves as a small, fixed
# sample for trying out the tokenization further down.
sample_text = u'Nice little garden centre. Theres a friendly old dog sits at the front door to greet you as you walk in. The prices are competitive and there is good range of garden plants and furniture.'
text_only = reviews.select(["text"])
first = text_only.filter(col("text") == sample_text)
first.collect()

[Row(text=u'Nice little garden centre. Theres a friendly old dog sits at the front door to greet you as you walk in. The prices are competitive and there is good range of garden plants and furniture.')]

In [9]:
# Baseline: Spark ML's built-in Tokenizer, reading "text" and writing "words".
# The sample row below shows lowercased tokens that keep their punctuation
# (e.g. u'hour!') and include empty strings.
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
reviews_tokenized = tokenizer.transform(reviews)
reviews_tokenized.head()

Row(business_id=u'O1ird5yRyuDFnOmYu90OoA', cool=0, date=u'2011-12-10', funny=0, review_id=u'D36tY6Pqsjq7-dITHSbpTg', stars=4, text=u"This is the perfect place to go for Happy Hour!  But get there early.  My friends and I got there around 6pm on a Friday and didn't have trouble getting a table.  But by the time we left around 8pm, there was a long wait.\n\nI ordered a few Dos Equis (which came in mason jars with lime - a nice Lawrenceville touch) and the tacos de seitan.  The tacos were nice and very fresh, compared to most cheese-laden Americanized Mexican food.  They were made with seasoned seitan, avocado salsa, onion, radish and cilantro on corn tortillas.  Gah, so delicious!", type=u'review', useful=0, user_id=u'jsokJRU0K190dFNMuWs_ow', words=[u'this', u'is', u'the', u'perfect', u'place', u'to', u'go', u'for', u'happy', u'hour!', u'', u'but', u'get', u'there', u'early.', u'', u'my', u'friends', u'and', u'i', u'got', u'there', u'around', u'6pm', u'on', u'a', u'friday', u'and', u"did

In [17]:
# Collect the Spark data type of every column in the tokenized DataFrame,
# in schema order, and display the list.
types = []
for field in reviews_tokenized.schema.fields:
    types.append(field.dataType)
types

[StringType, StringType, ArrayType(StringType,true)]

In [51]:
# Lookup tables shared by the `tokenize` function defined below.

# Every ASCII punctuation character; these are stripped out of each token.
PUNCTUATION = set(string.punctuation)

# NLTK's English stopword list plus the empty string, so that tokens which
# become empty once their punctuation is stripped are discarded as well.
STOPWORDS = set(stopwords.words('english')).union([u''])

# WordNet lemmatizer instance (not used by `tokenize` as written below).
LEMMATIZER = WordNetLemmatizer()

# Function to break text into "tokens", lowercase them, strip punctuation, and
# drop stopwords, empty tokens and tokens containing non-ASCII characters.
# (No stemming or lemmatization is applied.)
def tokenize(text):
    """Turn a piece of review text into a list of cleaned word tokens.

    Steps, applied to every token produced by NLTK's ``word_tokenize``:
      1. lowercase the token;
      2. remove every character found in ``PUNCTUATION``;
      3. discard the token if it is empty or listed in ``STOPWORDS``;
      4. discard the token if any remaining character is outside ASCII.

    Args:
        text: The text to tokenize, or ``None``.

    Returns:
        A list of tokens in their original order. ``None`` input yields an
        empty list.
    """
    # The "text" column is nullable; a null value arrives here as None and
    # would otherwise raise inside word_tokenize and fail the whole Spark job.
    if text is None:
        return []

    cleaned = []
    for token in word_tokenize(text):
        word = ''.join(ch for ch in token.lower() if ch not in PUNCTUATION)
        # Tokens made only of punctuation become empty after stripping; drop
        # them explicitly rather than relying on '' being in STOPWORDS.
        if not word or word in STOPWORDS:
            continue
        if all(ord(ch) < 128 for ch in word):
            cleaned.append(word)
    return cleaned

# print(tokenize("Nice little garden centre. Theres a friendly old dog sits at the front door to greet you as you walk in. The prices are competitive and there is good range of garden plants and furniture."))
# print(LEMMATIZER.lemmatize("little"))

# Expose `tokenize` to Spark as a UDF that returns an array of strings.
tokenize_udf = udf(tokenize, returnType=ArrayType(StringType()))

# Try the UDF on the single sample review: keep the raw text and add the
# cleaned tokens next to it under the name "words".
sample_words_col = tokenize_udf(col("text")).alias("words")
first_words = first.select(col("text"), sample_words_col)
first_words.head()

# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# first_tokenized = tokenizer.transform(first)
# first_tokenized.head()

Row(text=u'Nice little garden centre. Theres a friendly old dog sits at the front door to greet you as you walk in. The prices are competitive and there is good range of garden plants and furniture.', words=[u'nice', u'little', u'garden', u'centre', u'theres', u'friendly', u'old', u'dog', u'sits', u'front', u'door', u'greet', u'walk', u'prices', u'competitive', u'good', u'range', u'garden', u'plants', u'furniture'])

In [52]:
# Keep only the two columns needed from here on.
reviews = reviews.select("business_id", "text")

# Append the cleaned tokens as a third column, "words", using the custom UDF.
reviews_tokenized = reviews.withColumn("words", tokenize_udf(col("text")))
reviews_tokenized.head(50)

[Row(business_id=u'mYTem99eRslMJhWpfQOKGg', text=u'I was a regular there  about 2 years ago. But I stopped going and switched places due to them beginning to get rude and constantly asking me to come back later. I went back again today, thinking "oh maybe it\'s changed..."  I was wrong. I went again and after waiting they told me to come back later again instead of just telling me how long the wait would be or anything really instead of making me wait then telling me to come back later. It was rude and they have not changed.', words=[u'regular', u'2', u'years', u'ago', u'stopped', u'going', u'switched', u'places', u'due', u'beginning', u'get', u'rude', u'constantly', u'asking', u'come', u'back', u'later', u'went', u'back', u'today', u'thinking', u'oh', u'maybe', u'changed', u'wrong', u'went', u'waiting', u'told', u'come', u'back', u'later', u'instead', u'telling', u'long', u'wait', u'would', u'anything', u'really', u'instead', u'making', u'wait', u'telling', u'come', u'back', u'later',

In [None]:


# types = [f.dataType for f in reviews_tokenized.schema.fields]
# print(types)

# Word2Vec hyperparameters, named so they are easy to find and tune.
W2V_VECTOR_SIZE = 5   # dimensionality of the learned vectors
W2V_MIN_COUNT = 100   # minCount passed to Word2Vec (minimum token frequency)
W2V_SEED = 42         # explicit seed so reruns do not depend on the library default

# Fit Word2Vec on the "words" column, then add a "word2vec" vector column
# to every review.
word2vec = Word2Vec(
    vectorSize=W2V_VECTOR_SIZE,
    minCount=W2V_MIN_COUNT,
    seed=W2V_SEED,
    inputCol="words",
    outputCol="word2vec",
)
word2vec_model = word2vec.fit(reviews_tokenized)
reviews_word2vec = word2vec_model.transform(reviews_tokenized)

reviews_word2vec.head()
