# Imports

In [30]:
import string
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

## Load dataset

In [34]:
# Only the review file is used here. The commented-out loads this replaces
# referred to sibling files named yelp_academic_dataset_<name>.json for
# <name> in business, checkin, tip and user, read with the same
# pd.read_json(..., lines=True) call.

# The file is read as line-delimited JSON (one record per line). Only the
# first 5000 reviews are kept later in the script, so stop parsing after that
# many lines instead of materialising the whole file in memory first.
df = pd.read_json('yelp_academic_dataset_review.json', lines=True, nrows=5000)

## Select x, y

In [35]:
# Initial selection: the 'text' column is the model input; the 'stars',
# 'useful', 'funny' and 'cool' columns are the prediction targets.
y = df.loc[:, ['stars', 'useful', 'funny', 'cool']]
x = df.loc[:, 'text']

## Missing Data

In [36]:
# Fill gaps per column type: a missing review becomes an empty string so the
# 'text' column stays purely textual for the string processing that follows
# (an integer 0 there becomes NaN under `.str.lower()` and then breaks
# `str.translate`); every other missing value becomes 0.
df = df.fillna({'text': ''}).fillna(0)

In [37]:
# Deletion table for str.translate, built once. It never changes, so
# rebuilding it on every call is wasted work when the function is applied
# row by row to a large text column.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    Everything else (letters, digits, whitespace, non-ASCII symbols) is
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [38]:
# Normalise the review text in one pass: lower-case first, then strip
# punctuation from each review.
df['text'] = df['text'].str.lower().apply(remove_punctuation)

In [8]:
# Restrict the experiment to the first 5000 reviews.
df = df.iloc[:5000]

# TF-IDF features over the cleaned text; max_features limits the vocabulary
# size and can be tuned.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so its statistics also reflect rows that end up in the
# test set.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
x = tfidf_vectorizer.fit_transform(df['text'])

In [23]:
# Targets are re-selected here so they line up row-for-row with the current
# (truncated) frame. A two-target variant would use only ['stars', 'useful'].
y = df.loc[:, ['stars', 'useful', 'funny', 'cool']]

## Train/Test split

In [40]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The split previously used test_size=0.8, which trained on only a fifth of
# the data and evaluated on the rest. random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

## Model

In [41]:
# CountVectorizer only accepts raw documents (strings). If x has already been
# turned into a numeric feature matrix before the split, running it through
# CountVectorizer raises AttributeError (matrix rows have no `.lower()`), so
# the existing features are passed through unchanged. Raw text is still
# vectorized here, with the vocabulary learned from the training rows only.
vectorizer = CountVectorizer()
if isinstance(x_train, pd.Series):
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)
else:
    x_train_vec, x_test_vec = x_train, x_test

## Prediction

In [None]:
# MultiOutputClassifier fits one copy of the random forest per target column.
rf_classifier = RandomForestClassifier(n_jobs=-1)
multi_output_classifier = MultiOutputClassifier(rf_classifier, n_jobs=-1)
multi_output_classifier.fit(x_train_vec, y_train)

# Make predictions: a 2-D array with one column per target, in the same
# column order as y_train.
predictions = multi_output_classifier.predict(x_test_vec)

# Evaluate the model.
# scikit-learn's classification metrics do not accept multiclass-multioutput
# targets, so accuracy_score / classification_report raise ValueError when
# given all target columns at once. The overall figure below is the
# exact-match rate (a row counts only if every target is predicted
# correctly); the per-target metrics follow, one column at a time.
y_true = np.asarray(y_test)
accuracy = float(np.mean(np.all(y_true == predictions, axis=1)))
print(f"Accuracy: {accuracy}")

# Display additional metrics (precision, recall, F1-score, etc.) per target.
for column_index, target_name in enumerate(y_test.columns):
    target_true = y_true[:, column_index]
    target_pred = predictions[:, column_index]
    print(f"--- {target_name} ---")
    print(f"Accuracy: {accuracy_score(target_true, target_pred)}")
    # zero_division=0 reports 0 (without a warning) for classes that were
    # never predicted.
    print(classification_report(target_true, target_pred, zero_division=0))