In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/ee562

/content/drive/MyDrive/ee562


In [None]:
import json
import pandas as pd
import numpy as np
import string
import itertools
import matplotlib.pyplot as plt
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [None]:
# Fetch the NLTK data used further down: the 'punkt' tokenizer data and the
# 'stopwords' corpus (for the word_tokenize / stopwords.words calls).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string

In [None]:
def read_review_data(filename):
    """Load a JSON-lines review file into a DataFrame.

    Each non-blank line of ``filename`` is parsed as one JSON object that
    must provide a ``'text'`` entry and a ``'votes'`` mapping holding
    ``'funny'``, ``'useful'`` and ``'cool'`` counts.

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        pandas.DataFrame with one row per review and two columns:

        * ``'ReviewLength'`` -- the raw review text. The name is kept for
          existing callers; it holds the text itself, not a length.
        * ``'Upvotes'`` -- funny + useful + cool votes, as a float.

        An empty file yields an empty frame that still has both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'text'`` or one of the vote counts.
    """
    reviews = []

    # Be explicit about the encoding so non-ASCII review text is decoded the
    # same way regardless of the platform's default.
    with open(filename, encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a stray newline at the end of the file).
            if not line.strip():
                continue

            review = json.loads(line)
            votes = review['votes']

            reviews.append({
                'ReviewLength': review['text'],
                'Upvotes': float(votes['funny'] + votes['useful'] + votes['cool']),
            })

    # Naming the columns keeps the schema stable even when no rows were read.
    return pd.DataFrame(reviews, columns=['ReviewLength', 'Upvotes'])

In [None]:
# Load the Yelp training-set review file from the mounted Drive folder.
review_df = read_review_data('/content/drive/MyDrive/ee562/yelp_training_set_review.json')

In [None]:
review_df

Unnamed: 0,ReviewLength,Upvotes,split_text
0,My wife took me here on my birthday for breakf...,7.0,"[My, wife, took, me, here, on, my, birthday, f..."
1,I have no idea why some people give bad review...,0.0,"[I, have, no, idea, why, some, people, give, b..."
2,love the gyro plate. Rice is so good and I als...,1.0,"[love, the, gyro, plate, Rice, is, so, good, a..."
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",3.0,"[Rosie, Dakota, and, I, LOVE, Chaparral, Dog, ..."
4,General Manager Scott Petello is a good egg!!!...,0.0,"[General, Manager, Scott, Petello, is, a, good..."
...,...,...,...
229902,I really wanted to like this place because it'...,0.0,"[I, really, wanted, to, like, this, place, bec..."
229903,My husband I stayed here for two nights. Of c...,2.0,"[My, husband, I, stayed, here, for, two, night..."
229904,Cool atmosphere. A lot of beers on tap and goo...,0.0,"[Cool, atmosphere, A, lot, of, beers, on, tap,..."
229905,I have to take a star off for the spotty servi...,3.0,"[I, have, to, take, a, star, off, for, the, sp..."


In [None]:
def remove_apostrophes(series):
    """Strip apostrophes from tokens that contain at least two of them.

    Args:
        series: pandas Series whose elements are iterables of string tokens.

    Returns:
        A new Series of lists. Tokens with fewer than two apostrophes
        (e.g. ``don't``) are passed through unchanged; any other token has
        every apostrophe removed (e.g. ``'hello'`` becomes ``hello``).
    """
    def _clean_tokens(tokens):
        cleaned = []
        for token in tokens:
            if token.count("'") < 2:
                # Zero or one apostrophe: leave contractions and plain words alone.
                cleaned.append(token)
            else:
                cleaned.append(re.sub(r"'+", '', token))
        return cleaned

    return series.apply(_clean_tokens)

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    """Tokenize a text column into lists of words, stored in a new column.

    Every character other than an ASCII letter, an apostrophe or a space is
    replaced by a space; whitespace runs are then collapsed to one space,
    the ends are trimmed, and the result is split on ``separator``.

    Args:
        df: DataFrame to update; it is modified in place.
        column_name: Name of the column holding the source text.
        new_column_name: Name of the column that receives the token lists.
        separator: Delimiter handed to ``Series.str.split``.

    Returns:
        The same ``df`` object, now carrying ``new_column_name``.
    """
    token_lists = (
        df[column_name]
        .str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
        .str.split(separator)
    )
    df[new_column_name] = token_lists
    return df
# Tokenize the raw review text into the 'split_text' column, then strip the
# apostrophes from any token that contains two or more of them.
review_df = De_symbolize_and_split(review_df, 'ReviewLength', 'split_text',' ')
review_df['split_text'] = remove_apostrophes(review_df['split_text'])

In [None]:
review_df.ReviewLength

0         My wife took me here on my birthday for breakf...
1         I have no idea why some people give bad review...
2         love the gyro plate. Rice is so good and I als...
3         Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4         General Manager Scott Petello is a good egg!!!...
                                ...                        
229902    I really wanted to like this place because it'...
229903    My husband I stayed here for two nights.  Of c...
229904    Cool atmosphere. A lot of beers on tap and goo...
229905    I have to take a star off for the spotty servi...
229906                                         So cool, yo.
Name: ReviewLength, Length: 229907, dtype: object

In [None]:
# Raw text of every review as a plain Python list.
# NOTE(review): in the cells visible here this value is reassigned from
# filtered_reviews before it is read — confirm whether this cell is needed.
review_list = review_df['ReviewLength'].tolist()

In [None]:
# Reviews with more than 5 total votes (funny + useful + cool).
filtered_reviews = review_df[review_df['Upvotes'] > 5]

In [None]:
# Reviews with more than 1 total vote (funny + useful + cool).
filtered_reviews_morethan1 = review_df[review_df['Upvotes'] > 1]

In [None]:
filtered_reviews_morethan1

Unnamed: 0,ReviewLength,Upvotes,split_text
0,My wife took me here on my birthday for breakf...,7.0,"[My, wife, took, me, here, on, my, birthday, f..."
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",3.0,"[Rosie, Dakota, and, I, LOVE, Chaparral, Dog, ..."
5,"Quiessence is, simply put, beautiful. Full wi...",8.0,"[Quiessence, is, simply, put, beautiful, Full,..."
6,Drop what you're doing and drive here. After I...,18.0,"[Drop, what, you're, doing, and, drive, here, ..."
10,The oldish man who owns the store is as sweet ...,5.0,"[The, oldish, man, who, owns, the, store, is, ..."
...,...,...,...
229900,The coffee was good.\nI had the two egg breakf...,2.0,"[The, coffee, was, good, I, had, the, two, egg..."
229901,I'm sorry to this but I must admit I was prett...,7.0,"[I'm, sorry, to, this, but, I, must, admit, I,..."
229903,My husband I stayed here for two nights. Of c...,2.0,"[My, husband, I, stayed, here, for, two, night..."
229905,I have to take a star off for the spotty servi...,3.0,"[I, have, to, take, a, star, off, for, the, sp..."


In [None]:
filtered_reviews

Unnamed: 0,ReviewLength,Upvotes,split_text
0,My wife took me here on my birthday for breakf...,7.0,"[My, wife, took, me, here, on, my, birthday, f..."
5,"Quiessence is, simply put, beautiful. Full wi...",8.0,"[Quiessence, is, simply, put, beautiful, Full,..."
6,Drop what you're doing and drive here. After I...,18.0,"[Drop, what, you're, doing, and, drive, here, ..."
16,We went here on a Saturday afternoon and this ...,9.0,"[We, went, here, on, a, Saturday, afternoon, a..."
18,I met a friend for lunch yesterday. \n\nLoved ...,15.0,"[I, met, a, friend, for, lunch, yesterday, Lov..."
...,...,...,...
229883,Every now and then a movie comes along and cha...,8.0,"[Every, now, and, then, a, movie, comes, along..."
229887,Although this joint is by far one of the best ...,7.0,"[Although, this, joint, is, by, far, one, of, ..."
229889,You want a monster sandwich that will challeng...,15.0,"[You, want, a, monster, sandwich, that, will, ..."
229897,"No, we just stopped serving breakfast.\nThe on...",9.0,"[No, we, just, stopped, serving, breakfast, Th..."


In [None]:
# Raw text of only the reviews kept in filtered_reviews (more than 5 votes).
review_list = filtered_reviews['ReviewLength'].tolist()

In [None]:
# Concatenate the selected reviews into one space-separated string.
all_text = ' '.join(review_list)

In [None]:
# Lower-case the whole corpus and split it into tokens with NLTK.
words = word_tokenize(all_text.lower())

In [None]:
# Keep only purely alphabetic tokens that are not English stop words.
# A set is used so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.isalpha() and word not in stop_words]


In [None]:
# Count how often each filtered word occurs. `filtered_words` is built by the
# stop-word filtering cell just before this one, so it is not recomputed here.
word_counts = Counter(filtered_words)

# Take the fifty most common words.
top_words = word_counts.most_common(50)

# Print each word with its count.
for word, count in top_words:
    print(f"{word}: {count}")

In [None]:
# Widen the ranking to the 150 most frequent words.
top_words = word_counts.most_common(150)

In [None]:
# NOTE(review): word_counts was already built from filtered_words in an
# earlier cell and used to produce top_words; this rebuild looks redundant.
word_counts = Counter(filtered_words)

In [None]:
# Print each of the top words together with its frequency.
for word, count in top_words:
    print(f"{word}: {count}")

like: 28867
place: 28249
good: 26544
food: 23879
one: 22722
get: 19720
would: 17339
really: 17285
great: 17167
time: 17152
go: 16032
back: 14276
also: 12074
little: 11690
even: 11603
could: 11461
well: 11204
service: 10913
know: 10503
people: 10073
got: 10002
us: 9807
love: 9624
much: 9607
nice: 9272
bar: 8874
think: 8725
pretty: 8640
menu: 8598
first: 8534
way: 8323
try: 8207
make: 8172
order: 7978
chicken: 7936
best: 7855
two: 7852
going: 7731
restaurant: 7684
ordered: 7606
see: 7598
right: 7589
never: 7528
night: 7493
always: 7210
cheese: 7209
went: 7088
want: 7065
say: 7052
made: 7047
around: 6716
better: 6671
came: 6653
still: 6485
come: 6399
day: 6370
lunch: 6334
something: 6328
new: 6264
sure: 6192
find: 6115
since: 6068
take: 6060
fresh: 5937
salad: 5892
eat: 5845
said: 5844
pizza: 5820
next: 5807
sauce: 5778
every: 5649
ever: 5590
delicious: 5581
table: 5570
friendly: 5553
many: 5508
thing: 5506
ca: 5464
wait: 5450
another: 5273
last: 5269
though: 5197
everything: 5162
phoenix