In [1]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` path used previously is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pyspark as ps

# Create (or reuse, if one already exists) a Spark session that runs locally
# with the "local[4]" master and is labelled "df lecture".
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("df lecture")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [3]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
import cleaner

%matplotlib inline

In [4]:
# Read the review, user and business JSON files from the shared dataset
# folder into three Spark DataFrames (same files, same order as before).
dataset_dir = '../break_week/data/dataset'
review_df, user_df, business_df = [
    spark.read.json(dataset_dir + '/' + table + '.json')
    for table in ('review', 'user', 'business')
]

In [5]:
# Register each DataFrame under a name that the Spark SQL query can refer to.
# createOrReplaceTempView is used instead of createTempView so that this cell
# can be re-run in the same session: createTempView raises when a view with
# that name is already registered.
review_df.createOrReplaceTempView("review")
user_df.createOrReplaceTempView("user")
business_df.createOrReplaceTempView("business")

In [6]:
# Build the working frame with Spark SQL:
#   1. subquery `new`: every review LEFT JOINed to its author's row in `user`
#      (adds the user's name and friends list);
#   2. INNER JOIN to `business` on business_id for name, location, categories
#      and the business-level star rating (`bus_star`);
#   3. keep only rows whose business `categories` array contains 'Restaurants'
#      and whose `state` is one of the 50 US state codes listed.
# The trailing backslashes sit inside a non-raw triple-quoted string, so Python
# treats them as line continuations and the newlines are not part of the SQL text.
df = spark.sql("""SELECT new.user_name, new.user_id, new.business_id, new.friends, \
                b.name AS business_name, b.state, b.city, b.address, b.categories, b.stars AS bus_star,\
                new.text, new.stars AS review_star \
                FROM \
                    (SELECT u.name AS user_name, r.user_id, r.business_id, r.text, r.stars, u.friends \
                    FROM review AS r \
                    LEFT JOIN user AS u \
                    ON r.user_id = u.user_id) AS new\
                INNER JOIN business as b\
                ON new.business_id = b.business_id \
                WHERE ARRAY_CONTAINS(b.categories, 'Restaurants') \
                AND b.state IN ("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS", \
                                "KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY", \
                                "NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY") """)

In [7]:
# Number of rows in the joined restaurant-review frame.
df.count()

2598115

In [8]:
# Keep only the extreme ratings (1-star and 5-star reviews), then preview
# the first two rows of the result.
df_1_5 = df.where("review_star = 1 OR review_star = 5")
df_1_5.show(n=2)

+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|user_name|             user_id|         business_id|             friends|       business_name|state|     city|             address|          categories|bus_star|                text|review_star|
+---------+--------------------+--------------------+--------------------+--------------------+-----+---------+--------------------+--------------------+--------+--------------------+-----------+
|   Justin|0y8ORuC2X1i1UF6SG...|--9e1ONYQuAa-CB_R...|[sf-8AusztxHc4o5b...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|WOW.

I came to V...|          5|
|        J|A4GnBOU7ZCTcoQK4e...|--9e1ONYQuAa-CB_R...|[MGPQVLsODMm9ZtYQ...|Delmonico Steakhouse|   NV|Las Vegas|3355 Las Vegas Bl...|[Cajun/Creole, St...|     4.0|This restaurant i...|          5|
+---------+---------

In [66]:
# Number of 1-star and 5-star reviews kept by the filter.
df_1_5.count()

1333392

In [9]:
# Reviews per state; print up to 50 rows without truncating cell values.
df_1_5.groupBy("state").count().show(n=50, truncate=False)

+-----+------+
|state|count |
+-----+------+
|AZ   |492259|
|SC   |3481  |
|VA   |1     |
|NV   |543049|
|WI   |32850 |
|CA   |2     |
|NC   |95105 |
|IL   |11183 |
|IN   |12    |
|OH   |82870 |
|PA   |72525 |
|NY   |35    |
|CO   |4     |
|AK   |16    |
+-----+------+



In [10]:
# Restrict to reviews of businesses in state 'WI' and keep only the review
# text and its star rating.
df1 = df_1_5.filter("state = 'WI'").select("text", "review_star")

In [11]:
# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# wordsData = tokenizer.transform(df1)

In [12]:
# Convert the 'WI' subset from a Spark DataFrame to a pandas DataFrame so it
# can be fed to NLTK / scikit-learn below.
df_wi = df1.toPandas()

In [13]:
# Preview the first 10 rows (review text and star rating).
df_wi.head(10)

Unnamed: 0,text,review_star
0,Enjoyed a delicious meal with family on Friday...,5
1,Had a great time with family at this cool plac...,5
2,Came several times with my friends. Very good ...,5
3,Very disappointed. It used to be one my favori...,1
4,My favorite place to have chicken wings! Ike t...,5
5,Location is perfect is u r shopping or after a...,5
6,Newly opened Chinese home style cuisine. Great...,5
7,Great Belgian restaurant. Had mussels and frie...,5
8,One of the best and authentic Chinese restaura...,5
9,Love this place! Authentic and fresh dishes! T...,5


In [14]:
# The raw review texts, as a pandas Series, form the document corpus.
corpus = df_wi["text"]

In [15]:
# Inspect one raw review (index label 3) before cleaning.
corpus[3]

"Very disappointed. It used to be one my favorite restaurants in the town: fresh food, reasonable price and the freedom to make my own bowl! Now they changed their system so their ppl make your bowl. AND the female server who made my bowl, on oct 26, was very rude! She was rushing and making sure I didn't get too much of the food! I'm never coming back again. I will also spread the words to my friends not to come.\n\nPs: they charged for extra $2 for getting proteins, which I didn't know until I paid!! And this was very invisible on the menu!"

In [56]:
# Text-cleaning tools used by the cleaning step below.
# English stop words as a set for O(1) membership tests.
sw = set(stopwords.words("english"))
# sw.update(["i", "and", "i'm", "she", "he"])
# Tokens are runs of word characters and apostrophes (keeps contractions such
# as "i'm" in one piece). The pattern is a raw string: "\w" in a normal string
# literal is an invalid escape sequence and triggers a warning on modern Python.
tokenizer = RegexpTokenizer(r"[\w']+")
st = PorterStemmer()
lemma = WordNetLemmatizer()

In [57]:
# Clean every review with the project-local `cleaner` module, passing it the
# tokenizer, the WordNet lemmatizer and the stop-word set built above.
# NOTE(review): the recorded sample output still contains stop words such as
# "very", "it", "and", "she", "i" and "now", all of which are capitalised in
# the raw text -- presumably stop words are filtered before lower-casing;
# verify in cleaner.clean_stem.
cleaned = cleaner.clean_stem(corpus, tokenizer, lemma, sw)

In [58]:
# Same review as inspected in raw form above, after cleaning.
cleaned[3]

"very disappointed it used one favorite restaurant town fresh food reasonable price freedom make bowl now changed system ppl make bowl and female server made bowl oct rude she rushing making sure i get much food i'm never coming back i also spread word friend come p charged extra getting protein i know i paid and invisible menu"

In [59]:
# TF-IDF features over the cleaned reviews: drop scikit-learn's built-in
# English stop words, terms seen in fewer than 2 documents, and terms seen in
# more than 95% of documents.
vectorizer_options = dict(stop_words='english', min_df=2, max_df=0.95)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

# Learn the vocabulary / IDF weights and produce the document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(cleaned)

In [60]:
# Densify the TF-IDF matrix into a NumPy array and show its
# (n_documents, n_terms) shape.
# NOTE(review): with the recorded shape (32850, 18102) a dense float64 array
# is roughly 4.8 GB. The estimators fitted below (ElasticNet, Lasso,
# LogisticRegression) accept sparse input, so `tfidf` could presumably be used
# directly -- confirm nothing downstream requires a dense array.
X = tfidf.toarray()
X.shape

(32850, 18102)

In [61]:
# Target: the star rating of each review (only 1 and 5 survive the earlier filter).
y = df_wi["review_star"]

In [62]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split -- and every score computed from it below -- reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [63]:
# nb = GaussianNB()
# nb.fit(X_train, y_train)

### ElasticNet Linear model

In [67]:
# Fit an ElasticNet linear regression (alpha=0.001, other settings default)
# on the training split.
# NOTE(review): the target only takes the values 1 and 5, so this regresses
# onto a two-valued label rather than classifying -- confirm that is intended.
en = ElasticNet(alpha=0.001)
en.fit(X_train, y_train)

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [65]:
# R^2 (coefficient of determination) of the ElasticNet fit on the held-out split.
en.score(X_test, y_test)

0.16814933221760098

### Lasso Linear model

In [68]:
# Fit a Lasso (L1-penalised) linear regression with the same alpha=0.001
# on the training split.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [69]:
# R^2 (coefficient of determination) of the Lasso fit on the held-out split.
lasso.score(X_test, y_test)

0.58374823080225324

### LogisticRegression Linear model

In [70]:
# Fit a logistic-regression classifier with library-default settings,
# treating the star rating (1 vs 5) as the class label.
lreg = LogisticRegression()
lreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [71]:
# Mean accuracy of the classifier on the held-out split.
# NOTE(review): 5-star reviews outnumber 1-star reviews by roughly 3:1 in the
# full filtered set, so compare this against the majority-class baseline for
# the WI subset before reading much into it.
lreg.score(X_test, y_test)

0.96724704736393519

In [72]:
# Vocabulary terms, ordered to match the columns of the TF-IDF matrix (and
# therefore the columns of lreg.coef_).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back to the old
# method on older installations. `.tolist()` keeps `terms` a plain list of
# str, as the old method returned.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [73]:
# Ask the project-local helper for 40 terms derived from the logistic-regression
# coefficients (presumably ranked by coefficient; see cleaner.show_topics).
cleaner.show_topics(lreg.coef_, terms, length=40)

Topic 1: ['great', 'delicious', 'amazing', 'best', 'excellent', 'love', 'awesome', 'favorite', 'friendly', 'perfect', 'fantastic', 'madison', 'definitely', 'loved', 'wonderful', 'good', 'happy', 'fresh', 'nice', 'perfectly', 'highly', 'tasty', 'outstanding', 've', 'enjoyed', 'fun', 'little', 'attentive', 'fast', 'super', 'yummy', 'incredible', 'thank', 'flavorful', 'bit', 'helpful', 'yum', 'try', 'notch', 'reasonable']


In [74]:
# NOTE(review): this call is identical to the one in the previous cell, yet the
# recorded outputs differ (positive-sounding terms there, negative-sounding
# terms here). Presumably cleaner.show_topics was edited between the two runs
# (autoreload is enabled) -- confirm, and keep only one of the two cells.
cleaner.show_topics(lreg.coef_, terms, length=40)

Topic 1: ['worst', 'terrible', 'horrible', 'bland', 'awful', 'rude', 'disappointing', 'poor', 'mediocre', 'bad', 'minute', 'told', 'ok', 'dry', 'asked', 'disgusting', 'tasted', 'cold', 'overpriced', 'tasteless', 'money', 'slow', 'dirty', 'ordered', 'gross', 'worse', 'customer', 'charged', 'frozen', 'left', 'flavorless', 'management', 'said', 'disappointment', 'waste', 'unfortunately', 'soggy', 'waited', 'barely', 'sorry']


In [75]:
# Class balance of the filtered set: number of reviews per star rating.
df_1_5.groupBy("review_star").count().show()

+-----------+-------+
|review_star|  count|
+-----------+-------+
|          5|1031519|
|          1| 301873|
+-----------+-------+



In [118]:
# Unpack the helper's result into two collections; the recorded outputs below
# show (term, coefficient) pairs with positive weights in the first and
# negative weights in the second.
pos_terms, neg_terms = cleaner.show_topics(lreg.coef_, terms, length=40)

In [119]:
# Terms with the largest positive coefficients.
pos_terms

[('great', 9.6339199219398921),
 ('delicious', 9.5086695708994373),
 ('amazing', 7.9170599102536379),
 ('best', 7.3589483682208217),
 ('excellent', 6.6895446573983914),
 ('love', 6.3849393517080557),
 ('awesome', 5.6882532659470213),
 ('favorite', 5.6378331048222989),
 ('friendly', 5.3344655463092208),
 ('perfect', 5.1487301606386211),
 ('fantastic', 4.9271576211302959),
 ('madison', 4.5292909003664485),
 ('definitely', 4.486343438415247),
 ('loved', 4.0306943032514306),
 ('wonderful', 3.9535371632915139),
 ('good', 3.8106663126652971),
 ('happy', 3.7261373823249087),
 ('fresh', 3.4802625832525842),
 ('nice', 3.3065880195360347),
 ('perfectly', 3.0823391451468418),
 ('highly', 2.9778623076666277),
 ('tasty', 2.9082167789912066),
 ('outstanding', 2.8545282365526417),
 ('ve', 2.8206297853641185),
 ('enjoyed', 2.6887189898520636),
 ('fun', 2.596468899456712),
 ('little', 2.5549889693513079),
 ('attentive', 2.5345851110853932),
 ('fast', 2.4638339969770398),
 ('super', 2.3439106507574947),

In [117]:
# Terms with the most negative coefficients.
neg_terms

[('worst', -7.8428487355547984),
 ('terrible', -6.3079030915106147),
 ('horrible', -5.751475018400706),
 ('bland', -5.1065524302327372),
 ('awful', -4.8790553173531208),
 ('rude', -4.6553109680826168),
 ('disappointing', -4.5527040275278674),
 ('poor', -4.5292789281254802),
 ('mediocre', -4.5238552456152945),
 ('bad', -4.4682088611261186),
 ('minute', -4.313426659289445),
 ('told', -3.9543664624267665),
 ('ok', -3.858301806403829),
 ('dry', -3.8292530674849785),
 ('asked', -3.5862004447195974),
 ('disgusting', -3.5675109238113314),
 ('tasted', -3.536419801643381),
 ('cold', -3.448647333203243),
 ('overpriced', -3.3473641044353819),
 ('tasteless', -3.3292811357064571),
 ('money', -3.2726399181447468),
 ('slow', -3.2396165759347912),
 ('dirty', -3.2147656019520117),
 ('ordered', -3.2110760494267212),
 ('gross', -3.1794458033762432),
 ('worse', -3.1079818464443512),
 ('customer', -2.9925095943697189),
 ('charged', -2.9689180757806759),
 ('frozen', -2.8625362889855053),
 ('left', -2.862506