In [1]:
# Widen the notebook's `.container` element to the full browser width by
# injecting a CSS override into the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pyspark as ps

# Build a Spark session (or reuse one that already exists) that runs locally
# with 4 worker threads under the application name "df lecture".
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("df lecture")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# %load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import cleaner

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

In [4]:
# Directory containing the dataset's JSON files. Kept in one place so the
# location can be changed without editing every read call.
DATA_DIR = "../break_week/data/dataset"

# Load each JSON file into its own Spark DataFrame (schema is inferred by Spark).
review_df = spark.read.json(DATA_DIR + "/review.json")
user_df = spark.read.json(DATA_DIR + "/user.json")
business_df = spark.read.json(DATA_DIR + "/business.json")

In [5]:
# Register the DataFrames as session-scoped SQL views so they can be queried
# with spark.sql(). `createOrReplaceTempView` is used instead of
# `createTempView` so that re-running this cell does not fail with
# "temporary view already exists".
review_df.createOrReplaceTempView("review")
user_df.createOrReplaceTempView("user")
business_df.createOrReplaceTempView("business")

In [6]:
# Build one row per restaurant review, enriched with author and business info.
#   * Inner sub-query `new`: every review LEFT JOINed to its author on user_id,
#     so a review is kept even when no matching user row exists.
#   * Outer query: INNER JOIN to `business` on business_id, keeping only
#     businesses whose `categories` array contains 'Restaurants'.
# Output columns rename business name/stars to business_name/bus_star and the
# review's stars to review_star.
# The SQL is a single string literal joined with backslash line continuations,
# so no comments can be placed inside it.
df3 = spark.sql("SELECT new.user_name, new.user_id, new.business_id, new.friends, \
                b.name AS business_name, b.state, b.city, b.address, b.categories, b.stars AS bus_star,\
                new.text, new.stars AS review_star \
                FROM \
                    (SELECT u.name AS user_name, r.user_id, r.business_id, r.text, r.stars, u.friends \
                    FROM review AS r \
                    LEFT JOIN user AS u \
                    ON r.user_id = u.user_id) AS new\
                INNER JOIN business as b\
                ON new.business_id = b.business_id \
                WHERE ARRAY_CONTAINS(b.categories, 'Restaurants')")
# Preview the first 3 rows of the joined result.
df3.show(3)

+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|user_name|             user_id|         business_id|             friends|       business_name|state|     city|             address|          categories|bus_star|                text|review_star|
+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|   Justin|0y8ORuC2X1i1UF6SG...|--9e1ONYQuAa-CB_R...|[sf-8AusztxHc4o5b...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|WOW.

I came to V...|          5|
|    Shaun|9spixZHaqC1JeN1ld...|--9e1ONYQuAa-CB_R...|[jB5Imm55MMANvOlY...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|We had scheduled ...|          2|
|        J|A4GnBOU7Z

In [7]:
# Expose the joined restaurant-review DataFrame to Spark SQL.
# `createOrReplaceTempView` keeps this cell re-runnable; `createTempView`
# raises if the view already exists.
# NOTE(review): the view name is misspelled ("restuarant"); it is left unchanged
# because queries elsewhere may reference it by this exact name.
df3.createOrReplaceTempView("restuarant_review")

In [12]:
# Narrow the restaurant reviews to one business id, then keep only the rows
# whose review rating equals 1.
target_business_reviews = df3.filter("business_id = 'zgGoxOsThXKAcs2lAgC9yg'")
df_one = target_business_reviews.filter("review_star = 1")

In [13]:
# Print the filtered reviews; with no arguments show() prints at most 20 rows
# and truncates long cell values.
df_one.show()

+---------+--------------------+--------------------+--------------------+-------------+-----+------+--------------------+--------------------+--------+--------------------+-----------+
|user_name|             user_id|         business_id|             friends|business_name|state|  city|             address|          categories|bus_star|                text|review_star|
+---------+--------------------+--------------------+--------------------+-------------+-----+------+--------------------+--------------------+--------+--------------------+-----------+
|   Taylor|1HvleSOY1dn7KtHPv...|zgGoxOsThXKAcs2lA...|[VkKvpFLA9dHZ-2wM...|Buca di Beppo|   AZ|Peoria|16091 N Arrowhead...|[Food, Italian, P...|     3.0|My husband and I ...|          1|
|   Rachel|eJyg55id8Jf8r0fMA...|zgGoxOsThXKAcs2lA...|                  []|Buca di Beppo|   AZ|Peoria|16091 N Arrowhead...|[Food, Italian, P...|     3.0|This place is no ...|          1|
|   Bonnie|HWdquziFAQekwIS-O...|zgGoxOsThXKAcs2lA...|                 

In [14]:
# Collect the review texts to the driver as a plain Python list of strings.
# Only the `text` column is selected before converting to pandas, so the other
# columns (e.g. the `friends` arrays) are not transferred just to be discarded.
corpus = df_one.select("text").toPandas()["text"].tolist()

In [25]:
# English stop-word set from NLTK's stopwords corpus.
sw = set(stopwords.words("english"))
# Tokens are runs of word characters and apostrophes. The pattern is a raw
# string so `\w` reaches the regex engine unchanged; in a normal string it is an
# invalid escape sequence that triggers a Deprecation/SyntaxWarning.
tokenizer = RegexpTokenizer(r"[\w']+")
# Porter stemmer and WordNet lemmatizer instances for text normalization.
st = PorterStemmer()
lemma = WordNetLemmatizer()

In [27]:
# Clean the raw review texts with the project-local `cleaner` module (its
# implementation is not shown here). Presumably it tokenizes each document,
# drops stop words and normalizes tokens — TODO confirm against cleaner.py.
# Note that the WordNet lemmatizer (`lemma`) is passed here, not the Porter
# stemmer (`st`), despite the function being named `clean_stem`.
cleaned = cleaner.clean_stem(corpus, tokenizer, lemma, sw)

In [28]:
# TF-IDF settings: ignore terms that occur in more than 95% of the documents
# or in fewer than 2 documents, and drop scikit-learn's built-in English
# stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and IDF weights from the cleaned corpus and return the
# resulting document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(cleaned)

In [29]:
# Convert the TF-IDF matrix to a dense array.
X = tfidf.toarray()
# Echo its dimensions as the cell output (rows = documents, columns = terms).
X.shape

(77, 618)

In [30]:
# Latent semantic analysis via truncated SVD. The number of components is
# spelled out (2 is the library default) and the seed is fixed so that the
# randomized solver gives repeatable results.
lsa = TruncatedSVD(
    n_components=2,
    random_state=42,
)

In [31]:
# Fit the SVD on the dense TF-IDF matrix. As the last expression in the cell,
# the returned estimator's repr is echoed as the cell output.
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=42, tol=0.0)

In [32]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Use the new method when it exists and
# fall back to the old one so the cell works on both old and new versions.
# The result is converted to a list so `terms` has the same type either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [35]:
# Print the terms associated with each SVD component ("topic") using the
# project-local `cleaner.show_topics` helper (implementation not shown here).
# `length=30` matches the 30 terms listed per topic in the output; presumably
# these are the highest-weighted terms of each component — TODO confirm.
cleaner.show_topics(lsa.components_, terms, length=30)

Topic 1: ['food', 'table', 'minute', 'server', 'service', 'came', 'order', 'got', 'time', 'drink', 'manager', 'place', 'restaurant', 'asked', 'said', 'check', 'pasta', 'ordered', 'buca', 'like', 'waited', 'experience', 'chicken', 'good', 'told', 'bread', 'waitress', 'took', 'dinner', 'night']
Topic 2: ['italian', 'place', 'food', 'di', 'eat', 'sauce', 'dish', 'good', 'beppo', 'pasta', 'better', 'garden', 'olive', 'buca', 'tasteless', 'sausage', 'price', 'like', 'taste', 'huge', 'garlic', 'want', 'salad', 'meat', 'master', 'restaurant', 'feed', 'wife', 'joke', 'fair']


In [34]:
# Sanity check: the component matrix has one row per SVD component and one
# column per vocabulary term.
lsa.components_.shape

(2, 618)