In [111]:
import json
import os
import re
from collections import Counter, defaultdict

import numpy as np

In [112]:
# Number of rating classes. Labels are treated as the integers 1..K and are
# shifted down by one wherever they are used as array indices.
K = 5
# Paths to the training and test splits. Each file is read line by line, with
# one JSON object per line.
TRAIN_DATA = "reviews_Digital_Music_5.json/Music_Review_train.json"
TEST_DATA = "reviews_Digital_Music_5.json/Music_Review_test.json"

In [113]:
def extract_data(file: str, split):
	"""Load a JSON-lines review file into bag-of-words features and labels.

	Each non-blank line of ``file`` must be a JSON object with an ``'overall'``
	rating and a ``'reviewText'`` string.

	Args:
		file: Path to the JSON-lines file.
		split: Callable mapping the review text to an iterable of tokens
			(for example ``str.split``).

	Returns:
		A pair ``(X, Y)``. ``X`` is a 1-D object array holding one
		``defaultdict(int)`` of token counts per review; ``Y`` is a 1-D
		integer array holding ``int(record['overall'])`` per review.

	Raises:
		json.JSONDecodeError: If a non-blank line is not valid JSON.
		KeyError: If a record lacks ``'overall'`` or ``'reviewText'``.
	"""
	X = []
	Y = []
	# JSON text is UTF-8; do not depend on the platform's default codec.
	with open(file, encoding="utf-8") as f:
		for line in f:
			# Tolerate blank lines (e.g. a trailing newline at end of file).
			if not line.strip():
				continue
			record = json.loads(line)
			Y.append(int(record['overall']))
			counts = defaultdict(int)
			for word in split(record['reviewText']):
				counts[word] += 1
			X.append(counts)
	# Explicit dtypes keep the result well-typed even when the file is empty
	# (np.array([]) would otherwise default to float64).
	return np.array(X, dtype=object), np.array(Y, dtype=np.int64)

In [122]:
def train_model(X: np.ndarray, Y: np.ndarray, num_classes: int = None):
	"""Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

	Args:
		X: Iterable of mappings ``token -> count``, one per document.
		Y: Iterable of integer class labels in ``1..num_classes``, aligned
			with ``X``.
		num_classes: Number of classes. Defaults to the module-level ``K``.

	Returns:
		A pair ``(vocab, log_prior)``. ``vocab`` maps every token seen in
		training to a length-``num_classes`` array of log-likelihoods
		``log((count + 1) / (class_tokens + |V|))``. ``log_prior`` is the
		length-``num_classes`` array of log class frequencies; a class with
		no training documents gets ``-inf``.

	Raises:
		ValueError: If a label lies outside ``1..num_classes`` or there are
			no training documents.
	"""
	k = K if num_classes is None else num_classes
	# Every token starts at one pseudo-count per class (add-one smoothing).
	vocab = defaultdict(lambda: np.ones(k))
	class_docs = np.zeros(k, np.float64)
	# int64 so that very large corpora cannot overflow the token totals.
	class_tokens = np.zeros(k, np.int64)
	for x, y in zip(X, Y):
		label = int(y)
		# Reject bad labels explicitly: a label of 0 would otherwise become
		# index -1 and be counted silently toward the last class.
		if not 1 <= label <= k:
			raise ValueError("label {} is outside the range 1..{}".format(label, k))
		cls = label - 1
		class_docs[cls] += 1
		for word, count in x.items():
			vocab[word][cls] += count
			class_tokens[cls] += count
	total_docs = class_docs.sum()
	if total_docs == 0:
		raise ValueError("cannot train on an empty data set")
	# Smoothing denominator: tokens observed in the class plus vocabulary size.
	denominators = class_tokens + len(vocab)
	for word in vocab:
		vocab[word] = np.log(vocab[word] / denominators)
	# Disable the training-time factory so that looking up an unseen token
	# raises KeyError instead of inserting a bogus all-ones vector.
	vocab.default_factory = None
	# Classes absent from the training data get a log-prior of -inf by design;
	# silence the divide-by-zero warning that np.log(0) would emit.
	with np.errstate(divide="ignore"):
		log_prior = np.log(class_docs / total_docs)
	return vocab, log_prior

In [125]:
def predict_nb(vocab: defaultdict, Phi: np.ndarray, x: defaultdict):
	"""Pick the 1-based class label with the highest log-posterior score.

	Args:
		vocab: Mapping ``token -> per-class log-likelihood array``.
		Phi: Per-class log-prior array.
		x: Mapping ``token -> count`` for the document being classified.

	Returns:
		The winning label (index in ``Phi`` plus one). Tokens missing from
		``vocab`` are skipped. Ties go to the lowest label, and ``-1`` is
		returned when no class score exceeds negative infinity.
	"""
	# One running score per class, seeded with the log-prior (copied so the
	# caller's array is never modified).
	scores = np.array(Phi, dtype=np.float64)
	for word in x:
		if word not in vocab:
			continue
		scores += vocab[word] * x[word]
	winner, top_score = -1, -float("inf")
	for label, score in enumerate(scores, start=1):
		# Strict comparison keeps the first (lowest) label on ties.
		if score > top_score:
			winner, top_score = label, score
	return winner

In [116]:
def accuracy(X: np.ndarray, Y: np.ndarray, prediction):
	"""Return the fraction of entries where ``prediction(X)`` equals ``Y``.

	Args:
		X: Inputs handed unchanged to ``prediction``.
		Y: Array of true labels; its first dimension is the denominator.
		prediction: Callable taking ``X`` and returning the predicted labels.
	"""
	predicted = prediction(X)
	hit_count = sum(predicted == Y)
	total = Y.shape[0]
	return hit_count / total

In [117]:
def confusion_matrix(Y: np.ndarray, pred_Y: np.ndarray, num_classes: int = None):
	"""Count (true label, predicted label) pairs.

	Args:
		Y: True labels, integers in ``1..num_classes``.
		pred_Y: Predicted labels, integers in ``1..num_classes``, aligned
			with ``Y``.
		num_classes: Number of classes. Defaults to the module-level ``K``.

	Returns:
		An int32 array of shape ``(num_classes, num_classes)`` whose entry
		``[i, j]`` is the number of samples with true label ``i + 1`` and
		predicted label ``j + 1``.

	Raises:
		ValueError: If the inputs differ in length or contain a label
			outside ``1..num_classes``.
	"""
	k = K if num_classes is None else num_classes
	actual = np.asarray(Y, dtype=np.intp).ravel()
	predicted = np.asarray(pred_Y, dtype=np.intp).ravel()
	if actual.shape != predicted.shape:
		raise ValueError(
			"Y and pred_Y must have the same length, got {} and {}".format(
				actual.size, predicted.size))
	# Reject out-of-range labels: zero or negative values would otherwise wrap
	# around via negative indexing and corrupt an unrelated cell.
	for name, labels in (("Y", actual), ("pred_Y", predicted)):
		if labels.size and (labels.min() < 1 or labels.max() > k):
			raise ValueError("{} contains labels outside 1..{}".format(name, k))
	confusion = np.zeros((k, k), np.int32)
	# np.add.at accumulates repeated index pairs, unlike fancy-index "+=".
	np.add.at(confusion, (actual - 1, predicted - 1), 1)
	return confusion

In [123]:
def nb_default(output="output/default"):
	"""Train Naive Bayes on whitespace-split tokens and write an evaluation report.

	Reads ``TRAIN_DATA`` and ``TEST_DATA``, fits the model on the training
	split, and writes to ``output``: training and test accuracy, a uniformly
	random baseline, a majority-class baseline, and the confusion matrices
	for the test and training splits.

	Args:
		output: Path of the report file. Its parent directory is created if
			it does not exist.
	"""
	train_X, train_Y = extract_data(TRAIN_DATA, str.split)
	test_X, test_Y = extract_data(TEST_DATA, str.split)
	vocab, Phi = train_model(train_X, train_Y)
	# otypes avoids np.vectorize's extra probing call on the first element and
	# lets it accept empty inputs.
	naive_bayes = np.vectorize(lambda x: predict_nb(vocab, Phi, x), otypes=[np.int64])
	training_pred = naive_bayes(train_X)
	test_pred = naive_bayes(test_X)

	m = test_Y.shape[0]
	# Baselines. The random guess covers every label 1..K, and the majority
	# class comes from the TRAINING labels so no test information leaks in.
	random_pred = np.random.randint(1, K + 1, m)
	majority_label = Counter(train_Y.tolist()).most_common(1)[0][0]
	majority_pred = np.full(m, majority_label)

	# Create the target directory up front so the write cannot fail on a
	# missing folder.
	out_dir = os.path.dirname(output)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	with open(output, 'w', encoding="utf-8") as f:
		f.write("train_accuracy   = {}\n".format(accuracy(train_X, train_Y, lambda X: training_pred)))
		f.write("test_accuracy    = {}\n".format(accuracy(test_X, test_Y, lambda X: test_pred)))
		f.write("random_accuracy  = {}\n".format(accuracy(test_X, test_Y, lambda X: random_pred)))
		f.write("mode_accuracy    = {}\n".format(accuracy(test_X, test_Y, lambda X: majority_pred)))
		# Each heading now matches the split it reports (they were swapped).
		f.write("confusion_matrix (test) =\n{}\n".format(confusion_matrix(test_Y, test_pred)))
		f.write("confusion_matrix (training) =\n{}".format(confusion_matrix(train_Y, training_pred)))


In [126]:
# Run the full train/evaluate pipeline and write the report to the default
# path ("output/default").
nb_default()