In [1]:
%load_ext autoreload
%autoreload 2

# MISC
import gzip
import multiprocessing
import time
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from shared import *
cpu_cores = multiprocessing.cpu_count()
data_path_50MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Gift_Cards.json'
data_path_263MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Appliances.json'
data_path_755MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Digital_Music.json'
data_path_2GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Video_Games.json'
data_path_10GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/reviews_Books_1.json'
data_path_38GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Books_big.json'

# NLTK
# Note, you may have to download 'Punkt' and other things through NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

## Pandas Operations

In [2]:
begin = time.time()
start = begin

reviews_df = pd.read_json(data_path_50MB, orient='records', lines=True)

## Drop unwanted columns / keep good ones
columns = ['reviewerID', 'reviewText']
reviews_df = reviews_df[columns]
print_elapsed(start)

## Drop empty (NaN) rows
start = time.time()
reviews_df = reviews_df.dropna()
print_elapsed(start)

## Tokenize words in review
start = time.time()
reviews_df['tokenized_dirty'] = reviews_df['reviewText'].apply(nltk.word_tokenize)
print_elapsed(start)

## Remove stop words
start = time.time()
reviews_df['tokenized_review'] = reviews_df['tokenized_dirty'].apply(lambda tokens: [w for w in tokens if w.lower() not in stop_words])

## Perform Part-Of-Speech (POS) tagging on tokens
start = time.time()
reviews_df['tok/pos'] = reviews_df['tokenized_review'].apply(my_pos_tag)
print_elapsed(start)

## Stem words in tokenized review
start = time.time()
reviews_df['stemmed_review'] = reviews_df['tokenized_review'].apply(lambda tokens: [stemmer.stem(t) for t in tokens])
print_elapsed(start)

## Lemmatize words in tokenized review
start = time.time()
reviews_df['lemmatized_review'] = reviews_df['tok/pos'].apply(lambda tokens: [lemmatizer.lemmatize(t, pos) for (t, pos) in tokens])
print_elapsed(start)

print(f'Total execution time: {time.time() - begin}')


Elapsed time for operation: 0.5522429943084717
Elapsed time for operation: 0.02384471893310547
Elapsed time for operation: 13.681278944015503
Elapsed time for operation: 48.012821197509766
Elapsed time for operation: 14.896658897399902
Elapsed time for operation: 3.067193031311035
Total execution time: 80.7833411693573


In [3]:
reviews_df

Unnamed: 0,reviewerID,reviewText,tokenized_dirty,tokenized_review,tok/pos,stemmed_review,lemmatized_review
0,APV13CM0919JD,"Amazon,\nI am shopping for Amazon.com gift car...","[Amazon, ,, I, am, shopping, for, Amazon.com, ...","[Amazon, ,, shopping, Amazon.com, gift, cards,...","[(Amazon, n), (,, n), (shopping, v), (Amazon.c...","[amazon, ,, shop, amazon.com, gift, card, chri...","[Amazon, ,, shop, Amazon.com, gift, card, Chri..."
1,A3G8U1G1V082SN,"I got this gift card from a friend, and it was...","[I, got, this, gift, card, from, a, friend, ,,...","[got, gift, card, friend, ,, best, !, site, mu...","[(got, v), (gift, a), (card, n), (friend, n), ...","[got, gift, card, friend, ,, best, !, site, mu...","[get, gift, card, friend, ,, best, !, site, mu..."
2,A11T2Q0EVTUWP,aren't we going to save trees?! :) People who ...,"[are, n't, we, going, to, save, trees, ?, !, :...","[n't, going, save, trees, ?, !, :, ), People, ...","[(n't, r), (going, v), (save, v), (trees, n), ...","[n't, go, save, tree, ?, !, :, ), peopl, compl...","[n't, go, save, tree, ?, !, :, ), People, comp..."
3,A9YKGBH3SV22C,You can always get someone something from Amaz...,"[You, can, always, get, someone, something, fr...","[always, get, someone, something, Amazon, safe...","[(always, r), (get, v), (someone, n), (somethi...","[alway, get, someon, someth, amazon, safeti, n...","[always, get, someone, something, Amazon, safe..."
4,A34WZIHVF3OKOL,Why take 50 dollars of good money with no limi...,"[Why, take, 50, dollars, of, good, money, with...","[take, 50, dollars, good, money, limitations, ...","[(take, v), (50, n), (dollars, n), (good, a), ...","[take, 50, dollar, good, money, limit, ,, turn...","[take, 50, dollar, good, money, limitation, ,,..."
...,...,...,...,...,...,...,...
147189,A2K9WVQW9TLWNK,I always enjoy getting these Gift cards via em...,"[I, always, enjoy, getting, these, Gift, cards...","[always, enjoy, getting, Gift, cards, via, ema...","[(always, r), (enjoy, v), (getting, v), (Gift,...","[alway, enjoy, get, gift, card, via, email, lo...","[always, enjoy, get, Gift, card, via, email, L..."
147190,A149ALSR6TPGF7,Worked great,"[Worked, great]","[Worked, great]","[(Worked, v), (great, a)]","[work, great]","[Worked, great]"
147191,A2Q066NZCQSCOR,Gift card,"[Gift, card]","[Gift, card]","[(Gift, n), (card, n)]","[gift, card]","[Gift, card]"
147192,A1KJLWCW7XBS8I,"What is there to say, It's a gift card.","[What, is, there, to, say, ,, It, 's, a, gift,...","[say, ,, 's, gift, card, .]","[(say, v), (,, n), ('s, n), (gift, n), (card, ...","[say, ,, 's, gift, card, .]","[say, ,, 's, gift, card, .]"
