In [1]:
import pandas as pd
from time import perf_counter
import numpy as np

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ivy_x\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Single Process

## load in file

In [3]:
start=perf_counter()
review20=pd.read_json("sampled30k.json",lines=True)
elapsed_time=perf_counter()-start
print(f'Finished in {elapsed_time:.2f} sec')

Finished in 0.37 sec


In [4]:
review20.head()

Unnamed: 0,stars,text
0,5,I went here today to upgrade to an iPhone X. T...
1,5,I went here yesterday and I got the best servi...
2,5,These are sooo good! I'm having Bulgogi dog c...
3,1,"First time buyer,\nI was excited to go in buy ..."
4,5,Oh my gosh.. BOMBBBBBBBB!!! This place is so a...


In [27]:
review20.shape
# 360965 90%


(30163, 2)

In [5]:
review20['sentiment'] = np.where(review20['stars']<5, 0, 1)

In [6]:
review20.drop(columns=['stars'],inplace=True)

## pre-processing

In [7]:
# remove stop words
# lemmatize
import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']


def normalize(text):
    tokens = re.findall('[a-zA-Z\']+', text)
    tokens_without_stop_words = [lemmatizer.lemmatize(i.lower(), pos='v') for i in tokens if i.lower() not in stop_words]
    return tokens_without_stop_words



In [8]:
# sample_size=30000
# review20_sampled=review20.truncate(before=0,after=sample_size-1)

In [8]:
sample_size=30000
review20_sampled=review20.copy()

In [9]:
review20_sampled.head()

Unnamed: 0,text,sentiment
0,I went here today to upgrade to an iPhone X. T...,1
1,I went here yesterday and I got the best servi...,1
2,These are sooo good! I'm having Bulgogi dog c...,1
3,"First time buyer,\nI was excited to go in buy ...",0
4,Oh my gosh.. BOMBBBBBBBB!!! This place is so a...,1


In [10]:
def pre_process(data):
    data.text=data.text.apply(lambda x: normalize(x))
    return(data)

In [11]:
start=perf_counter()
# review20.text=review20.text.apply(lambda x: normalize(x))
review20_sampled=pre_process(review20_sampled)
elapsed_time=perf_counter()-start
print(f'Finished in {elapsed_time:.2f} sec')

Finished in 15.28 sec


## training

In [12]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [13]:
# from nltk.corpus import movie_reviews

from itertools import chain
from math import floor

start=perf_counter()

documents=[tuple(x) for x in review20_sampled.values]

print('finish truncating')

all_word=review20_sampled.text.values.tolist()

all_text_list=list(chain(*all_word))

all_words = nltk.FreqDist(all_text_list)

word_features = list(all_words)[:2000]

featuresets = [(document_features(d), c) for (d,c) in documents]
print("finish feature engineering")
train_set, test_set = featuresets[:floor(0.9*sample_size)], featuresets[floor(0.9*sample_size):]
print('finish data split')
classifier = nltk.NaiveBayesClassifier.train(train_set)
print('finish training')
elapsed_time=perf_counter()-start
print(f'Finished in {elapsed_time:.2f} sec')

finish truncating
finish feature engineering
finish data split
finish training
Finished in 158.84 sec


# Test

In [14]:
# Test the classifier
start=perf_counter()
print(nltk.classify.accuracy(classifier, test_set))
elapsed_time=perf_counter()-start
print(f'Finished in {elapsed_time:.2f} sec')

0.7944988934555801
Finished in 24.24 sec
