In [1]:
import gzip
import math
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import pandas as pd

import html
import random
import json

In [2]:
def parse(path):
  """Yield one evaluated record for each line of a gzip-compressed file.

  Args:
    path: Location of the gzip file to read.

  Yields:
    The Python object obtained by evaluating each line in turn.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted, closed, or garbage-collected; the original never closed it.
  with gzip.open(path, 'r') as g:
    for l in g:
      # SECURITY NOTE(review): eval() executes whatever code the line contains.
      # Only use this on trusted files; consider ast.literal_eval or json.loads
      # if the file format allows it.
      yield eval(l)

This takes about 1.5 minutes to run.

In [60]:
# allRatings = [l for l in parse("ratebeer.json.gz")]
# Parse every line of the sampled file (300,000 reviews) and keep the first parsed value.
rawRatings = list(parse("sampled_ratebeer.json.gz"))[0]

# Sample and compress dataset (ALREADY RUN — DO NOT RE-RUN)

In [None]:
# # Random sample the dataset because it's too big

# sampleSize = 300000
# sampledAllRatings = random.sample(allRatings, sampleSize)

In [None]:
# import json

# with open("sampled_ratebeer.json", "w") as f:
#     json.dump(sampledAllRatings, f)

In [None]:
# import gzip
# import shutil

# # File paths
# input_file = 'sampled_ratebeer.json'      # Your existing JSON file
# output_file = 'sampled_ratebeer.json.gz'  # Desired compressed file

# # Compress the JSON file
# with open(input_file, 'rb') as f_in:
#     with gzip.open(output_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

# print(f"Compressed {input_file} to {output_file}")


Compressed sampled_ratebeer.json to sampled_ratebeer.json.gz


# Clean data

Check missing labels (beer/style)

In [None]:
# Decode HTML character references in the style labels, updating each rating in place
for rating in rawRatings:
    if 'beer/style' not in rating:
        continue  # nothing to decode for ratings without a style label
    rating['beer/style'] = html.unescape(rating['beer/style'])

In [62]:
# Distinct style labels seen across all ratings
styles = {rating['beer/style'] for rating in rawRatings if 'beer/style' in rating}
# Ratings that carry no style label at all
no_style = [rating for rating in rawRatings if 'beer/style' not in rating]

len(styles), len(no_style)

(89, 0)

In [63]:
# Field names present on the first rating (iterating a dict yields its keys)
allKeys = set(rawRatings[0])

Check missing features

In [76]:
def checkMissingFeatures(rating):
    """Return True when any feature used by this notebook is missing from ``rating``.

    A feature counts as missing when its key is absent, its value is None, or
    its value is the empty string. ``beer/ABV`` is additionally treated as
    missing when it equals "-", a placeholder that occurs in the data.

    Args:
        rating: Mapping holding the fields of one review.

    Returns:
        bool: True if 'beer/name', 'beer/ABV' or 'review/text' is missing.
    """
    # .get() reports a record that lacks a key as incomplete instead of raising KeyError.
    name = rating.get('beer/name')
    abv = rating.get('beer/ABV')
    text = rating.get('review/text')
    return (
        name is None or name == "" or
        abv is None or abv == "" or abv == "-" or
        text is None or text == ""
    )
    
# Keep only the ratings that have every feature we need
cleanedRatings = [
    rating for rating in rawRatings
    if not checkMissingFeatures(rating)
]

# Get features
We keep only the fields we want to use: the features ('beer/name', 'beer/ABV', 'review/text') and the label ('beer/style').

In [84]:
# Fields carried forward from each cleaned rating, in the order they are stored
usedFields = ('beer/name', 'beer/ABV', 'review/text', 'beer/style')

# One new dict per rating, restricted to the fields above
filteredRatings = [
    {field: rating[field] for field in usedFields}
    for rating in cleanedRatings
]

In [85]:
# Number of ratings that remain after dropping those with missing features
len(filteredRatings)

285215

## Prepare the features
- Remove escape characters from `review/text`
- Convert `beer/ABV` to float

In [86]:
# https://stackoverflow.com/questions/8115261/how-to-remove-all-the-escape-sequences-from-a-list-of-strings
# Characters with code points 1-31, and a translation table that deletes each of them
escapeChars = ''.join(map(chr, range(1, 32)))
translator = str.maketrans(dict.fromkeys(escapeChars))

# Remove escape characters from `review/text`
def removeEscapeChars(review: str) -> str:
    """Return ``review`` with every character in ``escapeChars`` deleted."""
    cleaned = review.translate(translator)
    return cleaned

# Convert `beer/ABV` to float
def convertABVFloat(abv: str) -> float:
    """Parse an ABV value such as "5.5" into a float.

    Raises:
        ValueError: If ``abv`` is a string that float() cannot parse.
    """
    abvValue = float(abv)
    return abvValue

# Apply each field's converter in place, text first and then ABV
for record in filteredRatings:
    for field, convert in (('review/text', removeEscapeChars), ('beer/ABV', convertABVFloat)):
        record[field] = convert(record[field])

# TODO: Do some analysis on our data
Ideas
- Statistics on length of review/text
- Statistics on ABV
- Statistics on how many times the name contains the beer style

In [97]:
# Length of each review text, in characters
reviewLens = [len(rating['review/text']) for rating in filteredRatings]
# ABV of the beer in each rating
beerABVs = [rating['beer/ABV'] for rating in filteredRatings]
# How many ratings have the style label as a (case-sensitive) substring of the beer name
nameContainsStyleCount = sum(
    1 for rating in filteredRatings
    if rating['beer/style'] in rating['beer/name']
)

In [None]:
import numpy as np

def getStats(data: list[float]) -> tuple:
    """Summarise a list of numbers.

    Args:
        data: Non-empty list of ints or floats.

    Returns:
        tuple: ``(min, max, mean, median, std)``. The minimum and maximum come
        from the built-in ``min``/``max``; the mean, median and standard
        deviation come from NumPy (``np.std`` with its default arguments).

    Raises:
        ValueError: If ``data`` is empty (raised by ``min``).
    """
    dataMin = min(data)
    dataMax = max(data)

    # Convert once so the three NumPy reductions do not each rebuild an array from the list.
    values = np.asarray(data)
    dataMean = np.mean(values)
    dataMedian = np.median(values)
    dataSTD = np.std(values)

    return dataMin, dataMax, dataMean, dataMedian, dataSTD

(3, 8449, 308.17741353014395, 254.0, 215.11712787742402)

In [102]:
# Summary statistics of the review lengths (in characters)
reviewLenStats = getStats(reviewLens)
reviewLenMin, reviewLenMax, reviewLenMean, reviewLenMedian, reviewLenSTD = reviewLenStats
reviewLenStats

(3, 8449, 308.17741353014395, 254.0, 215.11712787742402)

In [103]:
# Summary statistics of the beer ABV values
abvStats = getStats(beerABVs)
abvMin, abvMax, abvMean, abvMedian, abvSTD = abvStats
abvStats

(0.01, 57.7, 6.645068281822484, 6.0, 2.3217380289064864)

# Split dataset

In [82]:
# Fraction of the ratings, taken from the front of the list, used for training
trainFraction = 0.95
splitIndex = int(len(filteredRatings) * trainFraction)

# Everything before the split index trains; the remainder is held out for testing
ratingsTrain, ratingsTest = filteredRatings[:splitIndex], filteredRatings[splitIndex:]

len(ratingsTrain), len(ratingsTest)

(270954, 14261)

# Feature encoding

# Feature engineering for flavor labels

The `beer_mapping.json` file contains a ChatGPT-generated mapping between beer types and flavor labels, which we use to group beers by flavor. A beer can belong to multiple flavors.

In [None]:
import json

# Read the mapping as UTF-8 explicitly rather than relying on the platform's default encoding
with open('beer_mapping.json', encoding='utf-8') as f:
    beerFlavorMapping = json.load(f)

# Show the top-level keys of the loaded mapping
beerFlavorMapping.keys()

dict_keys(['crisp', 'malty', 'hoppy', 'roasty', 'fruity/spicy', 'sour', 'smoky', 'specialty', 'rich/hearty'])