In [None]:
import json
import re
import string
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook depends on.
# 'punkt': tokenizer data for word_tokenize (imported above; not called in the
# cells visible here).
nltk.download('punkt')
# 'stopwords': the corpus read later through stopwords.words('english').
nltk.download('stopwords')
# 'vader_lexicon': the lexicon backing SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

In [None]:
def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location handed straight to ``pd.read_json(..., lines=True)``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when reading fails for any reason.
        Failures are printed, never raised.
    """
    try:
        return pd.read_json(file_path, lines=True)
    except Exception as error:
        # Best-effort loader: report the problem and fall through to None.
        print(f"Error reading file: {error}")
    return None

def remove_apostrophes(series):
    """Strip quote-style apostrophes from tokenised text.

    A token containing two or more apostrophes (e.g. ``'hello'``) loses every
    apostrophe; a token with at most one (e.g. ``don't``) is kept unchanged.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of string tokens. ``None``/NaN elements
        (such as the NaN that pandas ``.str`` methods yield for missing text)
        are passed through untouched instead of raising ``TypeError``.

    Returns
    -------
    pandas.Series
        Series with the same index holding the cleaned token lists.
    """
    def _clean(tokens):
        # Missing values are not iterable; hand the placeholder back as-is.
        if tokens is None or isinstance(tokens, float):
            return tokens
        return [token.replace("'", '') if token.count("'") >= 2 else token
                for token in tokens]

    return series.apply(_clean)

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    """Normalise a text column and store its tokens in a new column.

    The text is lowercased, every character other than an ASCII letter,
    apostrophe or space becomes a space, whitespace runs are collapsed to a
    single space, the ends are trimmed, and the result is split on
    ``separator``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    column_name : str
        Name of the source text column.
    new_column_name : str
        Name of the column that receives the token lists.
    separator : str
        Delimiter passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``new_column_name``.
    """
    lowered = df[column_name].str.lower()
    letters_only = lowered.str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
    single_spaced = letters_only.str.replace(r'\s+', ' ', regex=True)
    tokens = single_spaced.str.strip().str.split(separator)
    df[new_column_name] = tokens
    return df

def sum_votes(vote_dict):
    """Return the weighted vote total of one review.

    'useful' votes count double; 'funny' and 'cool' votes count once.
    Categories absent from ``vote_dict`` contribute 0 and any other key is
    ignored.
    """
    total = 0
    for category, weight in (('funny', 1), ('useful', 2), ('cool', 1)):
        total += vote_dict.get(category, 0) * weight
    return total

# Shared VADER scorer and English stop-word set used by evaluative_words().
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


@lru_cache(maxsize=None)
def _is_evaluative(word):
    """Return True if `word` is not a stop word and has a non-zero VADER compound score.

    Memoised: the answer depends only on the word, so each distinct token is
    scored once rather than once per occurrence. Re-running this cell
    redefines the function and therefore starts with an empty cache.
    """
    return word not in stop_words and sia.polarity_scores(word)['compound'] != 0


def evaluative_words(words):
    """Filter a token list down to its sentiment-bearing words.

    Parameters
    ----------
    words : iterable of str, None or NaN
        Tokens of one text. ``None``/NaN (missing text) yields an empty list
        instead of raising ``TypeError``.

    Returns
    -------
    list of str
        Tokens, in their original order and with repeats kept, that are not
        English stop words and whose VADER compound score is non-zero.
    """
    if words is None or isinstance(words, float):
        return []
    return [word for word in words if _is_evaluative(word)]


In [None]:
# Load the four Yelp training tables. An entry is None when its file could
# not be read (read_json_to_dataframe prints the error instead of raising).
df_business, df_checkin, df_review, df_user = (
    read_json_to_dataframe(path)
    for path in (
        "yelp-dataset/yelp_training_set/yelp_training_set_business.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_checkin.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_review.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_user.json",
    )
)

In [None]:
# Tokenise the review text; this adds `split_text` to df_review in place.
df_review = De_symbolize_and_split(
    df=df_review, column_name='text', new_column_name='split_text', separator=' ')
# Drop the apostrophes of tokens that carry two or more of them.
df_review['split_text'] = df_review['split_text'].pipe(remove_apostrophes)
# Weighted vote total of each review.
df_review['votes_weight'] = df_review['votes'].map(sum_votes)
# Length of the raw review text in characters.
df_review['text_length'] = df_review['text'].map(len)

In [None]:
# Keep only the sentiment-bearing, non-stop-word tokens of each review.
df_review['evaluative_words'] = df_review['split_text'].map(evaluative_words)

# Corpus-wide frequency of every evaluative word, most frequent first.
df_words_counts = df_review['evaluative_words'].explode().value_counts()

# The 100 most frequent evaluative words.
top_words = set(df_words_counts.index[:100])

# Per review: how many of its evaluative tokens belong to that top set
# (0 when the cell does not hold a list).
df_review['top_words_count'] = df_review['evaluative_words'].map(
    lambda tokens: sum(1 for token in tokens if token in top_words)
    if isinstance(tokens, list) else 0
)

In [None]:
# Display the review frame, now carrying the derived columns added above
# (split_text, votes_weight, text_length, evaluative_words, top_words_count).
df_review

In [None]:
# Numeric copy of the `open` column, cast to int.
df_business['isOpen'] = df_business['open'].astype(int)

# Total check-ins per business: add up every value of its checkin_info dict.
df_checkin['checkin_nums'] = df_checkin['checkin_info'].map(
    lambda checkin_info: sum(checkin_info.values())
)

In [None]:
# Total votes a user has received, summed over every vote type.
df_user['votes_total'] = df_user['votes'].apply(lambda x: sum(x.values()))

# Average number of votes per review written by the user.
df_user['votes_per_review'] = df_user['votes_total'] / df_user['review_count']

# A review_count of 0 makes the division produce +/-inf (or NaN for 0/0).
# Map all of those to 0. The previous replace([pd.NaT, pd.NaT], 0) targeted
# the datetime missing-value marker, which never matches infinities, so the
# division-by-zero results were left in the column.
df_user['votes_per_review'] = (
    df_user['votes_per_review']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [None]:
# Slim, independent copies holding only the columns needed for the merge.
review_columns = ['user_id', 'business_id', 'stars', 'votes_weight', 'text_length', 'top_words_count']
temp_review = df_review[review_columns].copy()
temp_business = df_business[['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin[['business_id', 'checkin_nums']].copy()
temp_user = df_user[['user_id', 'votes_per_review']].copy()

# Inner joins: a review survives only if its business, its author and the
# business's check-in record are all present.
merged_df = (
    temp_review
    .merge(temp_business, on='business_id', how='inner')
    .merge(temp_user, on='user_id', how='inner')
    .merge(temp_checkin, on='business_id', how='inner')
)
# NOTE: no fillna is applied to the merged frame (a fillna(0) step was left
# disabled here).

In [1]:
def divide_series(series, n_groups):
    """Greedily split the entries of `series` into consecutive groups.

    Entries are visited in the order given by ``series.items()``. Each one is
    appended to the current group; as soon as that group's running sum reaches
    ``series.sum() / n_groups`` the next entry opens the following group. The
    final group absorbs everything that is left.

    Parameters
    ----------
    series : pandas.Series
        Maps a number (index label) to its count (value).
    n_groups : int
        How many groups to produce.

    Returns
    -------
    dict
        ``{"Group k": {"Numbers": [...], "Sum": ...}}`` for k = 1..n_groups.
        Groups that received no entry have an empty list and a sum of 0.
    """
    quota = series.sum() / n_groups

    members = dict((position, []) for position in range(n_groups))
    totals = [0 for _ in range(n_groups)]
    last_position = n_groups - 1
    position = 0

    for number, count in series.items():
        members[position].append(number)
        totals[position] += count
        # Advance once the quota is met, but never past the final group.
        if position < last_position and totals[position] >= quota:
            position += 1

    result = {}
    for index in range(n_groups):
        result[f"Group {index + 1}"] = {"Numbers": members[index], "Sum": totals[index]}
    return result

# How many reviews carry each distinct votes_weight, ordered by weight.
series = merged_df['votes_weight'].value_counts().sort_index()
# Partition the weights into 4 consecutive groups via divide_series.
groups = divide_series(series, 4)

for group_name, group_info in groups.items():
    print(f"{group_name}: Numbers: {group_info['Numbers']}, Sum = {group_info['Sum']}")

# Map every votes_weight value to the 1-based position of its group.
number_to_group = {
    number: category
    for category, info in enumerate(groups.values(), start=1)
    for number in info["Numbers"]
}
merged_df['votes_category'] = merged_df['votes_weight'].map(number_to_group.get)

NameError: name 'merged_df' is not defined