In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gzip

In [2]:
def parseData(fname):
    for l in gzip.open(fname):
        yield eval(l)

data = list(parseData("C:/Users/iisab/OneDrive/Documents/School/Fall2023/CSE158/Assignment2/australian_user_reviews.json.gz"))

dm = [[0,0],[0,0]]

users = set()
games = set()

nodate = 0

reviews = []

for user in data:
    if user["user_id"] in users:
        #print(f"ducplicate user skipped: {user['user_id']}")
        pass
    else:
        users.add(user["user_id"])
        for review in user["reviews"]:
            games.add(review["item_id"])
            funny = review["funny"]
            hasfunny = int(funny != "")
            if funny == "":
                review["funny"] = 0
            else:
                review["funny"] = int(re.findall("\d+", funny)[0])
                
            helpful = review["helpful"]
            hashelpful = int(helpful != "No ratings yet")
            if helpful == "No ratings yet":
                review["helpful_n"] = 0
                review["helpful_total"] = 0
                review["helpful"] = 0
            else:
                nums = re.findall("\d+", helpful.replace(",", ""))
                helpfulness = float(nums[0]) / float(nums[1])
                review["helpful"] = float(nums[0]) / float(nums[1])
                review["helpful_n"] = float(nums[0])
                review["helpful_total"] = float(nums[1])
            
            dm[hasfunny][hashelpful] += 1

            try:
                post_datetime = datetime.strptime(review["posted"],'Posted %B %d, %Y.')
                review["posted"] = post_datetime
            except:
                nodate += 1

            review["user_id"] = user["user_id"]
            review["user_url"] = user["user_url"]
            reviews.append(review)

In [3]:
from collections import defaultdict
import string
from nltk.stem.porter import *

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for review in reviews:
	r = ''.join([c for c in review['review'].lower() if not c in punctuation])
	for w in r.split():
		w = stemmer.stem(w)
		wordCount[w] += 1
		
len(wordCount)

97248

In [5]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
words[:20]

['the',
 'game',
 'and',
 'a',
 'to',
 'it',
 'is',
 'i',
 'of',
 'you',
 'thi',
 'in',
 'for',
 'that',
 'play',
 'with',
 'but',
 'have',
 'on',
 'be']

In [None]:
# NOT GONNA USE DIDN'T PERFORM WELL AT ALL
# wordCount = defaultdict(int)
# punctuation = set(string.punctuation)
# def feature(datum):
#     feat = [0]*len(words)
#     r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
#     ws = r.split()
#     ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
#     ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
#     ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
#     ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
#     for w in ws + ws2 + ws3 + ws4 + ws5:
#         if w in words:
#             feat[wordId[w]] += 1
#     feat.append(1) #offset
#     return feat

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
import math
df = defaultdict(int)

train, test = train_test_split(reviews, test_size=0.25, random_state=0)

for d in train:
    r = ''.join([c for c in d['review'].lower() if not c in punctuation])
    # for each word in the text
    for w in set(r.split()):
        # if this particular word is one we are looking for add 1 to document frequency
        if w in words:
            df[w] += 1

In [9]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
    for word in r.split():
        if word in words:
            feat[wordId[word]] += 1
    # offset
    
    return [1] + [feat[wordId[w]] * math.log10(len(train) / df[w]) for w in words if df[w] > 0]

In [25]:
threshold = 1

baseline_accuracy = np.sum([0 if review["helpful_n"] < threshold else 1 for review in reviews]) / len(reviews) # accuracy
baseline_error_rate = 1 - np.sum([0 if review["helpful_n"] < threshold else 1 for review in reviews]) / len(reviews) # error rate
print("baseline accuracy: " + str(baseline_accuracy) + "\nbaseline error rate: " + str(baseline_error_rate))

baseline accuracy: 0.3884819442067431
baseline error rate: 0.6115180557932569


In [14]:
X_train = [feature(review) for review in train]
#Y_funny_train = [1 if review["funny"] > 25 else 0 for review in train]
Y_helpful_train = [1 if review["helpful_n"] > 1 else 0 for review in train] # try with higher threshold 5, 10

X_test = [feature(review) for review in test]
#Y_funny_test = [1 if review["funny"] > 25 else 0 for review in test]
Y_helpful_test = [1 if review["helpful_n"] > 1 else 0 for review in test] # try with higher threshold 5, 10


In [15]:
from sklearn import linear_model
#mod_funny = linear_model.LogisticRegression(C=1, max_iter=1000)
mod_helpful = linear_model.LogisticRegression(C=1, max_iter=1000)

In [16]:
#mod_funny.fit(X_train, Y_funny_train)
mod_helpful.fit(X_train, Y_helpful_train)

#predictions_funny = mod_funny.predict(X_test)
predictions_helpful = mod_helpful.predict(X_test)

#correct_funny = predictions_funny == Y_funny_test
correct_helpful = predictions_helpful == Y_helpful_test

In [26]:
#accuracy_funny = sum(correct_funny) / len(correct_funny)
accuracy_helpful = sum(correct_helpful) / len(correct_helpful)
#print("accuracy_funny: " + str(accuracy_funny) + ", accuracy_helpful: " + str(accuracy_helpful))
error_rate_helpful = 1 - accuracy_helpful
print("accuracy_helpful: " + str(accuracy_helpful) + "\nerror_rate: " + str(error_rate_helpful))

accuracy_helpful: 0.7983296823658269
error_rate: 0.2016703176341731
