In [1]:
import numpy as np
import json
import re
from collections import defaultdict

In [2]:
# Number of rating classes; a rating r is stored at class index r - 1.
K = 5

# Both splits live in the same directory; paths are relative to the notebook's working directory.
_DATA_DIR = "reviews_Digital_Music_5.json"
TRAIN_DATA = f"{_DATA_DIR}/Music_Review_train.json"
TEST_DATA = f"{_DATA_DIR}/Music_Review_test.json"

In [3]:
def generate_vocabulary(file: str):
	"""Collect add-one-smoothed word counts and log class priors from a JSON-lines review file.

	Every non-blank line of ``file`` is parsed as one JSON object. Its ``overall``
	field (a rating in 1..K) selects the class index ``rating - 1`` and its
	``reviewText`` field is split on whitespace into tokens.

	Args:
		file: path to the JSON-lines file, read as UTF-8.

	Returns:
		A tuple ``(vocab, n, log_prior)``:
		vocab -- defaultdict mapping word -> list of K counts; every count starts
			at 1, so the stored value is (occurrences in that class) + 1.
		n -- int32 array of length K: tokens seen in each class plus the
			vocabulary size (the denominator that matches the +1 counts).
		log_prior -- float64 array of length K: log of the fraction of reviews
			in each class; ``-inf`` for a class with no reviews.

	Raises:
		ValueError: if ``file`` contains no reviews.
	"""
	vocab = defaultdict(lambda: [1] * K)
	class_docs = np.zeros(K, np.float64)
	n = np.zeros(K, np.int32)
	# JSON text is UTF-8; do not depend on the platform's locale encoding.
	with open(file, encoding="utf-8") as f:
		for line in f:
			if not line.strip():
				# Tolerate blank lines instead of crashing in json.loads.
				continue
			x = json.loads(line)
			rating = int(x['overall']) - 1
			class_docs[rating] += 1
			tokens = x['reviewText'].split()
			for word in tokens:
				vocab[word][rating] += 1
			n[rating] += len(tokens)
	total_docs = class_docs.sum()
	if total_docs == 0:
		# Without this guard the priors would silently come back as NaN.
		raise ValueError(f"no reviews found in {file!r}")
	n += len(vocab)
	# A class with no reviews legitimately gets log(0) = -inf; silence the warning.
	with np.errstate(divide="ignore"):
		log_prior = np.log(class_docs / total_docs)
	return vocab, n, log_prior

In [4]:
# Memoised per-class log-likelihoods: log_prob[c][word] -> cached result of get_log_prob.
log_prob = [dict() for _ in range(K)]
# The (vocab, n) objects the cache was filled from. Strong references are kept
# (rather than id()s) so the identity check cannot be fooled by a recycled id.
_log_prob_source = [None, None]

def get_log_prob(vocab: defaultdict, n: np.ndarray, pred: int, word: str):
	"""Return the cached log-likelihood term of ``word`` for class index ``pred``.

	Args:
		vocab: mapping word -> per-class counts (indexed by class index).
		n: per-class denominators, indexed by class index.
		pred: zero-based class index.
		word: token to look up.

	Returns:
		``log(vocab[word][pred] / n[pred])`` when ``word`` is in ``vocab``;
		``0.0`` otherwise, so an unknown word leaves every class score unchanged.

	The cache is module-level. It is emptied whenever a different ``vocab`` or
	``n`` object is passed, so rebuilding the model in the same kernel cannot
	return values computed from the previous model. In-place mutation of the
	same objects is not detected.
	"""
	if _log_prob_source[0] is not vocab or _log_prob_source[1] is not n:
		for per_class in log_prob:
			per_class.clear()
		_log_prob_source[0], _log_prob_source[1] = vocab, n
	cache = log_prob[pred]
	# Membership test, not truthiness: a cached 0.0 for an unknown word is a valid hit.
	if word in cache:
		return cache[word]
	# .get() does not trigger the defaultdict factory, so lookups never grow vocab.
	counts = vocab.get(word)
	value = 0.0 if counts is None else np.log(counts[pred] / n[pred])
	cache[word] = value
	return value

In [5]:
def predict(vocab: defaultdict, n: np.ndarray, x: list[str], Phi: np.ndarray):
	"""Pick the 1-based class label whose score is highest for the tokens ``x``.

	The score of a class is its entry in ``Phi`` plus ``get_log_prob`` of every
	token in ``x`` for that class, accumulated in token order. On ties the lowest
	label wins. Returns -1 when no class scores strictly above ``-inf``
	(for example when ``Phi`` is empty).
	"""
	top_label = -1
	top_score = float("-inf")
	for label, log_prior in enumerate(Phi, start=1):
		score = log_prior
		for token in x:
			score += get_log_prob(vocab, n, label - 1, token)
		# Written as a negated '>' so that a NaN score never replaces the current best.
		if not (score > top_score):
			continue
		top_score, top_label = score, label
	return top_label

In [6]:
def accuracy(vocab: defaultdict, n: np.ndarray, Phi: np.ndarray, file: str):
	"""Return the fraction of reviews in ``file`` whose predicted rating is correct.

	Every non-blank line of ``file`` is parsed as one JSON object. Its
	``reviewText`` is split on whitespace and passed to ``predict``; the result
	is compared with ``int(overall)``.

	Args:
		vocab, n, Phi: model parameters, forwarded unchanged to ``predict``.
		file: path to the JSON-lines file, read as UTF-8.

	Returns:
		correct / total as a float.

	Raises:
		ZeroDivisionError: if ``file`` contains no reviews.
	"""
	total = 0
	correct = 0
	# JSON text is UTF-8; do not depend on the platform's locale encoding.
	with open(file, encoding="utf-8") as f:
		for line in f:
			if not line.strip():
				# Tolerate blank lines instead of crashing in json.loads.
				continue
			x = json.loads(line)
			pred = predict(vocab, n, x['reviewText'].split(), Phi)
			correct += int(pred == int(x['overall']))
			total += 1
	return correct / total

In [7]:
def confusion_matrix(vocab: defaultdict, n: np.ndarray, Phi: np.ndarray, file: str):
	"""Count (actual, predicted) rating pairs over the reviews in ``file``.

	Every non-blank line of ``file`` is parsed as one JSON object. Its
	``reviewText`` is split on whitespace and passed to ``predict``.

	Args:
		vocab, n, Phi: model parameters, forwarded unchanged to ``predict``.
		file: path to the JSON-lines file, read as UTF-8.

	Returns:
		A (K, K) int32 array where entry [a, p] is the number of reviews whose
		recorded rating is a + 1 and whose predicted rating is p + 1.
	"""
	confusion = np.zeros((K, K), np.int32)
	# JSON text is UTF-8; do not depend on the platform's locale encoding.
	with open(file, encoding="utf-8") as f:
		for line in f:
			if not line.strip():
				# Tolerate blank lines instead of crashing in json.loads.
				continue
			x = json.loads(line)
			actual = int(x['overall']) - 1
			predicted = predict(vocab, n, x['reviewText'].split(), Phi) - 1
			confusion[actual, predicted] += 1
	return confusion

In [8]:
# Build the model from the training split: per-class word counts (vocab),
# per-class denominators (n) and log class priors (Phi).
vocab, n, Phi = generate_vocabulary(TRAIN_DATA)

In [9]:
# Accuracy on the training split first, then on the held-out test split.
for _eval_file in (TRAIN_DATA, TEST_DATA):
	print(accuracy(vocab, n, Phi, _eval_file))

0.72086
0.6656428571428571


In [10]:
# Rows are recorded ratings, columns are predicted ratings; training split first, then test.
for _eval_file in (TRAIN_DATA, TEST_DATA):
	print(confusion_matrix(vocab, n, Phi, _eval_file))

[[  261     0    99   484  1685]
 [   18    85    54   989  1492]
 [   35     0  1193  1663  2743]
 [  107     1     2  9008  4149]
 [  135    12    29   260 25496]]
[[   2    0    0   21  205]
 [   0    0    0   63  263]
 [   2    0    1  231  852]
 [   3    0    0  280 2825]
 [  14    0    0  202 9036]]
