In [1]:
import json
import re
import string
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd

In [2]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages in the same order as before: Punkt tokenizer
# models first, then the stop-word corpus.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Left as the cell's last bare expression so the notebook still displays the
# return value of the final download (the VADER lexicon).
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/elijah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elijah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/elijah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location handed straight to ``pd.read_json(..., lines=True)``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when reading fails for any reason.
        In the failure case the error is printed rather than raised, so
        callers must check for ``None`` themselves.
    """
    try:
        frame = pd.read_json(file_path, lines=True)
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading file: {err}")
        frame = None
    return frame

def remove_apostrophes(series):
    """Strip apostrophes from tokens that contain two or more of them.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of string tokens.

    Returns
    -------
    pandas.Series
        A series of lists. Tokens with at least two apostrophes have every
        apostrophe removed; tokens with zero or one apostrophe (for example
        a plain contraction) are passed through unchanged.
    """
    def _strip_if_multiple(token):
        # A single apostrophe is left alone; two or more are all removed.
        if token.count("'") >= 2:
            return re.sub(r"'+", '', token)
        return token

    def _clean_tokens(tokens):
        return [_strip_if_multiple(token) for token in tokens]

    return series.apply(_clean_tokens)

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    """Normalise a text column and split it into tokens, in place.

    The text is lower-cased, every character other than an ASCII letter,
    apostrophe or space is replaced by a space, runs of whitespace are
    collapsed to one space, surrounding whitespace is trimmed, and the
    result is split on ``separator``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the new column is written onto this same object.
    column_name : str
        Column holding the raw text.
    new_column_name : str
        Column that receives the list of tokens.
    separator : str
        Delimiter passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    tokens = (
        df[column_name]
        .str.lower()
        .str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
        .str.split(separator)
    )
    df[new_column_name] = tokens
    return df

def sum_votes(vote_dict, weights=None):
    """Return the weighted total of the votes stored in ``vote_dict``.

    Parameters
    ----------
    vote_dict : mapping
        Vote counts keyed by vote type. A vote type missing from the
        mapping counts as 0; keys that have no weight are ignored.
    weights : mapping, optional
        Multiplier for each vote type to include. When omitted, 'funny',
        'useful' and 'cool' votes are each counted once, which is the
        behaviour of the original one-argument form.

    Returns
    -------
    number
        Sum of ``count * weight`` over every key in ``weights``.
    """
    if weights is None:
        # Built per call so the default mapping can never be mutated
        # between calls.
        weights = {'funny': 1, 'useful': 1, 'cool': 1}
    return sum(vote_dict.get(key, 0) * weight for key, weight in weights.items())


In [5]:
# Load the four Yelp training tables in one pass. read_json_to_dataframe
# yields None for any file it fails to read, so a missing file shows up as a
# None table rather than an exception here.
df_business, df_checkin, df_review, df_user = [
    read_json_to_dataframe(table_path)
    for table_path in (
        "yelp_training_set/yelp_training_set_business.json",
        "yelp_training_set/yelp_training_set_checkin.json",
        "yelp_training_set/yelp_training_set_review.json",
        "yelp_training_set/yelp_training_set_user.json",
    )
]

In [6]:
# Tokenise the raw review text into the 'split_text' column.
df_review = De_symbolize_and_split(
    df=df_review,
    column_name='text',
    new_column_name='split_text',
    separator=' ',
)
# Drop apostrophes from tokens that contain two or more of them.
df_review['split_text'] = df_review['split_text'].pipe(remove_apostrophes)
# Reduce each review's votes mapping to a single total.
df_review['votes_total'] = df_review['votes'].apply(sum_votes)

In [7]:
# English stop words held in a set so membership tests stay cheap.
stop_words = {*stopwords.words('english')}
# One shared analyzer instance, reused for every token that gets scored.
sia = SentimentIntensityAnalyzer()

@lru_cache(maxsize=None)
def _compound_score(word):
    """Return the analyzer's compound polarity for one token, memoised.

    The same tokens recur across many reviews, so each distinct token is
    scored once and then served from the cache. The cache is keyed on the
    token only: if the module-level ``sia`` is replaced by a different
    analyzer, call ``_compound_score.cache_clear()`` first.
    """
    return sia.polarity_scores(word)['compound']


def evaluative_words(words):
    """Keep the tokens that carry sentiment.

    Parameters
    ----------
    words : iterable of str
        Tokens from a single review.

    Returns
    -------
    list of str
        Tokens, in their original order and with repeats preserved, that
        are not in the module-level ``stop_words`` and whose compound
        polarity from the module-level ``sia`` is non-zero.
    """
    # The stop-word test comes first so stop words are never scored.
    return [word for word in words
            if word not in stop_words and _compound_score(word) != 0]


In [9]:
# For every review, keep only the tokens from 'split_text' that
# evaluative_words() selects; the function is called once per row.
df_review['evaluative_words'] = df_review['split_text'].apply(evaluative_words)

In [10]:
# Count how often each evaluative word occurs across all reviews, most
# frequent first.
# The column names are pinned explicitly because value_counts().reset_index()
# names its columns differently across pandas versions: before 2.0 the result
# is ["index", "evaluative_words"] with the counts under "evaluative_words",
# so the lookup below would show counts instead of words.
df_words_counts = (
    df_review["evaluative_words"]
    .explode()
    .value_counts()
    .rename_axis("evaluative_words")
    .reset_index(name="count")
)
# Final bare expression so the notebook displays the 100 most frequent words.
df_words_counts["evaluative_words"].head(100)

0            good
1            like
2           great
3            love
4            nice
         ...     
95           pita
96         chance
97    recommended
98         number
99          truly
Name: evaluative_words, Length: 100, dtype: object