In [4]:
%load_ext autoreload
%autoreload 2

# MISC
import gzip
import multiprocessing
import time
import json
from collections import defaultdict
from shared import *
cpu_cores = multiprocessing.cpu_count()
data_path_50MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Gift_Cards.json'
data_path_263MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Appliances.json'
data_path_755MB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Digital_Music.json'
data_path_2GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Video_Games.json'
data_path_10GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/reviews_Books_1.json'
data_path_38GB = '/Users/angelloparr/Documents/csc369/project.nosync/data_uncompressed/Books_big.json'


# NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Dask
import pandas as pd
import numpy as np
import dask
from dask import dataframe as dd
import dask.bag as db
dask.config.set(scheduler='processes', num_workers=cpu_cores)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<dask.config.set at 0x11ff2f400>

## Dask Operations

In [5]:
begin = time.time()
start = begin

reviews_dd = db.read_text(data_path_50MB).map(json.loads).to_dataframe()
reviews_dd = reviews_dd.repartition(npartitions=8)

print_elapsed(start)

## Drop unwanted columns / keep good ones
start = time.time()
columns = ['reviewerID', 'reviewText']
reviews_dd = reviews_dd[columns]
print_elapsed(start)

## Drop empty (NaN) rows
start = time.time()
reviews_dd = reviews_dd.dropna()
print_elapsed(start)

## Tokenize words in review
start = time.time()
reviews_dd['tokenized_dirty'] = reviews_dd['reviewText'].apply(nltk.word_tokenize, meta=('reviewText', 'object'))
print_elapsed(start)

## Remove stop words
start = time.time()
reviews_dd['tokenized_review'] = reviews_dd['tokenized_dirty'].apply(lambda tokens: [w for w in tokens if w.lower() not in stop_words], meta=('tokenized_dirty', 'object'))

## Perform Part-Of-Speech (POS) tagging on tokens
start = time.time()
reviews_dd['tok/pos'] = reviews_dd['tokenized_review'].apply(my_pos_tag, meta=('tok/pos', 'object'))
print_elapsed(start)

## Stem words in tokenized review
start = time.time()
reviews_dd['stemmed_review'] = reviews_dd['tokenized_review'].apply(lambda tokens: [stemmer.stem(t) for t in tokens], meta=('tokenized_review', 'object'))
print_elapsed(start)

## Lemmatize words in tokenized review
start = time.time()
reviews_dd['lemmatized_review'] = reviews_dd['tok/pos'].apply(lambda tokens: [lemmatizer.lemmatize(t, pos) for (t, pos) in tokens], meta=('tok/pos', 'object'))
print_elapsed(start)

final_dd = reviews_dd.head(5)

print(f'Total execution time: {time.time() - begin}')

Elapsed time for operation: 0.13904285430908203
Elapsed time for operation: 0.0007040500640869141
Elapsed time for operation: 0.0018210411071777344
Elapsed time for operation: 0.0014040470123291016
Elapsed time for operation: 0.0013570785522460938
Elapsed time for operation: 0.0017154216766357422
Elapsed time for operation: 0.001486063003540039
Total execution time: 14.231302976608276


In [6]:
final_dd

Unnamed: 0,reviewerID,reviewText,tokenized_dirty,tokenized_review,tok/pos,stemmed_review,lemmatized_review
0,APV13CM0919JD,"Amazon,\nI am shopping for Amazon.com gift car...","[Amazon, ,, I, am, shopping, for, Amazon.com, ...","[Amazon, ,, shopping, Amazon.com, gift, cards,...","[(Amazon, n), (,, n), (shopping, v), (Amazon.c...","[amazon, ,, shop, amazon.com, gift, card, chri...","[Amazon, ,, shop, Amazon.com, gift, card, Chri..."
1,A3G8U1G1V082SN,"I got this gift card from a friend, and it was...","[I, got, this, gift, card, from, a, friend, ,,...","[got, gift, card, friend, ,, best, !, site, mu...","[(got, v), (gift, a), (card, n), (friend, n), ...","[got, gift, card, friend, ,, best, !, site, mu...","[get, gift, card, friend, ,, best, !, site, mu..."
2,A11T2Q0EVTUWP,aren't we going to save trees?! :) People who ...,"[are, n't, we, going, to, save, trees, ?, !, :...","[n't, going, save, trees, ?, !, :, ), People, ...","[(n't, r), (going, v), (save, v), (trees, n), ...","[n't, go, save, tree, ?, !, :, ), peopl, compl...","[n't, go, save, tree, ?, !, :, ), People, comp..."
3,A9YKGBH3SV22C,You can always get someone something from Amaz...,"[You, can, always, get, someone, something, fr...","[always, get, someone, something, Amazon, safe...","[(always, r), (get, v), (someone, n), (somethi...","[alway, get, someon, someth, amazon, safeti, n...","[always, get, someone, something, Amazon, safe..."
4,A34WZIHVF3OKOL,Why take 50 dollars of good money with no limi...,"[Why, take, 50, dollars, of, good, money, with...","[take, 50, dollars, good, money, limitations, ...","[(take, v), (50, n), (dollars, n), (good, a), ...","[take, 50, dollar, good, money, limit, ,, turn...","[take, 50, dollar, good, money, limitation, ,,..."
