In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pyspark as ps

spark = ps.sql.SparkSession.builder \
        .master("local[4]") \
        .appName("df lecture") \
        .getOrCreate()

sc = spark.sparkContext

In [63]:
# %load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
import cleaner

%matplotlib inline

In [4]:
review_df = spark.read.json('../break_week/data/dataset/review.json')
user_df = spark.read.json('../break_week/data/dataset/user.json')
business_df = spark.read.json("../break_week/data/dataset/business.json")

In [5]:
review_df.createTempView("review")
user_df.createTempView("user")
business_df.createTempView("business")

In [6]:
df3 = spark.sql("SELECT new.user_name, new.user_id, new.business_id, new.friends, \
                b.name AS business_name, b.state, b.city, b.address, b.categories, b.stars AS bus_star,\
                new.text, new.stars AS review_star \
                FROM \
                    (SELECT u.name AS user_name, r.user_id, r.business_id, r.text, r.stars, u.friends \
                    FROM review AS r \
                    LEFT JOIN user AS u \
                    ON r.user_id = u.user_id) AS new\
                INNER JOIN business as b\
                ON new.business_id = b.business_id \
                WHERE ARRAY_CONTAINS(b.categories, 'Restaurants')")
df3.show(3)

+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|user_name|             user_id|         business_id|             friends|       business_name|state|     city|             address|          categories|bus_star|                text|review_star|
+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|   Justin|0y8ORuC2X1i1UF6SG...|--9e1ONYQuAa-CB_R...|[sf-8AusztxHc4o5b...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|WOW.

I came to V...|          5|
|    Shaun|9spixZHaqC1JeN1ld...|--9e1ONYQuAa-CB_R...|[jB5Imm55MMANvOlY...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|We had scheduled ...|          2|
|        J|A4GnBOU7Z

In [7]:
df3.createTempView("restuarant_review")

In [12]:
df_one = df3.where("business_id = 'zgGoxOsThXKAcs2lAgC9yg'").where("review_star = 1")

In [13]:
df_one.show()

+---------+--------------------+--------------------+--------------------+-------------+-----+------+--------------------+--------------------+--------+--------------------+-----------+
|user_name|             user_id|         business_id|             friends|business_name|state|  city|             address|          categories|bus_star|                text|review_star|
+---------+--------------------+--------------------+--------------------+-------------+-----+------+--------------------+--------------------+--------+--------------------+-----------+
|   Taylor|1HvleSOY1dn7KtHPv...|zgGoxOsThXKAcs2lA...|[VkKvpFLA9dHZ-2wM...|Buca di Beppo|   AZ|Peoria|16091 N Arrowhead...|[Food, Italian, P...|     3.0|My husband and I ...|          1|
|   Rachel|eJyg55id8Jf8r0fMA...|zgGoxOsThXKAcs2lA...|                  []|Buca di Beppo|   AZ|Peoria|16091 N Arrowhead...|[Food, Italian, P...|     3.0|This place is no ...|          1|
|   Bonnie|HWdquziFAQekwIS-O...|zgGoxOsThXKAcs2lA...|                 

In [14]:
corpus = list(df_one.toPandas()["text"])

In [25]:
sw = set(stopwords.words("english"))
tokenizer = RegexpTokenizer("[\w']+")
st = PorterStemmer()
lemma = WordNetLemmatizer()

In [27]:
cleaned = cleaner.clean_stem(corpus, tokenizer, lemma, sw)

### TFIDF Vectorizer

In [28]:
tfidf_vectorizer= TfidfVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(cleaned)

In [29]:
X = tfidf.toarray()
X.shape

(77, 618)

### LSA Model

In [30]:
lsa = TruncatedSVD(random_state=42)

In [31]:
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=42, tol=0.0)

In [32]:
terms = terms = tfidf_vectorizer.get_feature_names()

In [37]:
cleaner.show_topics(lsa.components_, terms, length=30)

Topic 1: ['food', 'table', 'minute', 'server', 'service', 'came', 'order', 'got', 'time', 'drink', 'manager', 'place', 'restaurant', 'asked', 'said', 'check', 'pasta', 'ordered', 'buca', 'like', 'waited', 'experience', 'chicken', 'good', 'told', 'bread', 'waitress', 'took', 'dinner', 'night']
Topic 2: ['italian', 'place', 'food', 'di', 'eat', 'sauce', 'dish', 'good', 'beppo', 'pasta', 'better', 'garden', 'olive', 'buca', 'tasteless', 'sausage', 'price', 'like', 'taste', 'huge', 'garlic', 'want', 'salad', 'meat', 'master', 'restaurant', 'feed', 'wife', 'joke', 'fair']


In [34]:
lsa.components_.shape

(2, 618)

### Applying LDA Model

In [43]:
lda = LatentDirichletAllocation(n_topics=2)
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=2,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [47]:
cleaner.show_topics(lda.components_, terms, length=30)

Topic 1: ['food', 'table', 'minute', 'server', 'service', 'salad', 'place', 'came', 'waited', 'good', 'time', 'bread', 'drink', 'like', 'dish', 'got', 'ordered', 'pasta', 'check', 'buca', 'chicken', 'min', 'asked', 'order', 'said', 'terrible', 'dinner', 'experience', 'restaurant', 'garlic']
Topic 2: ['food', 'manager', 'order', 'service', 'card', 'restaurant', 'coupon', 'got', 'ask', 'day', 'italian', 'place', 'asked', 'waiter', 'left', 'came', 'buca', 'family', 'party', 'customer', 'location', 'pasta', 'email', 'time', 'dinner', 'server', 'people', 'maybe', 'refill', 'know']


### Building the model with every sentence as a document

In [52]:
new_corpus = []
for sent in corpus:
    new_corpus.extend(sent.split(","))

In [57]:
cleaned = cleaner.clean_stem(new_corpus, tokenizer, lemma, sw)

In [58]:
tfidf_vectorizer= TfidfVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(cleaned)

In [59]:
X = tfidf.toarray()
X.shape

(446, 655)

### Applying LSA Model to sentence to doc

In [60]:
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=42, tol=0.0)

In [61]:
terms = terms = tfidf_vectorizer.get_feature_names()

In [62]:
cleaner.show_topics(lsa.components_, terms, length=30)

Topic 1: ['food', 'minute', 'table', 'service', 'time', 'order', 'came', 'server', 'got', 'drink', 'good', 'manager', 'waited', 'said', 'buca', 'place', 'like', 'ordered', 'pasta', 'asked', 'night', 'experience', 'check', 'chicken', 'restaurant', 'bread', 'went', 'dinner', 'left', 'took']
Topic 2: ['minute', 'salad', 'order', 'waited', 'table', 'drink', 'seated', 'server', 'got', 'waiting', 'bread', 'night', 'ordered', 'finally', 'came', 'wait', 'let', 'took', 'brought', 'gave', 'busy', 'plate', 'asked', 'waiter', 'walk', 'right', 'time', 'greet', 'bar', 'arrived']


### Applying LDA Model to sentence to doc

In [64]:
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=2,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [65]:
cleaner.show_topics(lda.components_, terms, length=30)

Topic 1: ['dish', 'salad', 'check', 'came', 'table', 'server', 'asked', 'terrible', 'finally', 'food', 'try', 'card', 'chicken', 'sauce', 'said', 'ordered', 'told', 'walked', 'like', 'meal', 'paying', 'arrived', 'restaurant', 'left', 'busy', 'ok', 'away', 'wait', 'manager', 'point']
Topic 2: ['food', 'place', 'buca', 'service', 'minute', 'order', 'good', 'experience', 'time', 'italian', 'choice', 'nice', 'night', 'got', 'di', 'table', 'went', 'manager', 'seated', 'mean', 'bread', 'family', 'restaurant', 'party', 'pasta', 'waiting', 'waited', 'birthday', 've', 'year']
