# Dirty Exploration

In [1]:
import pandas as pd
import os
import numpy as np
from datetime import date as dt
import re

In [2]:
np.random.seed(103)

## Data Load

In [3]:
# Map the camelCase field names found in the review file to the snake_case
# column names used in the rest of this notebook (reviewerID becomes "_id").
colname = {
    "reviewerID": "_id", 
    "asin": "asin", 
    "reviewerName": "reviewer_name", 
    "helpful": "helpful",
    "reviewText": "review_text",
    "overall": "overall",
    "summary": "summary",
    "unixReviewTime": "unix_review_time",
    "reviewTime": "review_time"
}
# Read the processed snapshot and rename its columns in a single step.
df = pd.read_json("../datalake/processed/version_2020-04-04.json").rename(columns=colname)

In [4]:
df.head()

Unnamed: 0,_id,asin,reviewer_name,helpful,review_text,overall,summary,unix_review_time,review_time
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5,No more pops when I record my vocals.,1392940800,"02 21, 2014"


## Null Check and Length Check

In [5]:
df["review_text"].isnull().any()

False

In [6]:
# Character length of every raw review. The vectorised .str accessor replaces
# the per-row Python lambda that previously wrapped len().
df["text_len"] = df["review_text"].str.len()

In [7]:
# Drop reviews with no text. The explicit copy turns the filtered result into
# an independent frame, so the column assignments made in later cells do not
# write into a slice of the original (the SettingWithCopyWarning case).
df = df[df["text_len"] != 0].copy()

In [8]:
df["text_len"].describe()

count    10254.000000
mean       486.260776
std        613.514024
min          9.000000
25%        163.000000
50%        285.000000
75%        552.000000
max      11310.000000
Name: text_len, dtype: float64

In [9]:
def preprocessor(text):
    """Normalise a raw review string into lower-case, space-separated words.

    Steps, in order:
      1. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` and strip them
         from the text.
      2. Remove ``http://`` / ``https://`` links.
      3. Lower-case what is left and collapse every run of non-word
         characters into a single space.
      4. Append the collected emoticons (with any ``-`` nose removed),
         separated from the words by a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in the input.
    """
    # Raw strings: the backslash escapes belong to the regex engine, not to
    # Python's string-literal parser.
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    # Emoticons are captured before lower-casing so ":D" / ":P" keep their case.
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\W+', ' ', text.lower())
    if emoticons:
        # Keep the emoticons from being glued onto the last word when the
        # cleaned text does not already end with a space.
        if text and not text.endswith(' '):
            text += ' '
        text += ' '.join(emoticons).replace('-', '')
    return text

In [10]:
# Replace review_text with its cleaned form. text_len was computed from the
# raw text in an earlier cell and is not refreshed here.
df["review_text"] = df["review_text"].apply(preprocessor)

In [11]:
df.head()

Unnamed: 0,_id,asin,reviewer_name,helpful,review_text,overall,summary,unix_review_time,review_time,text_len
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",not much to write about here but it does exact...,5,good,1393545600,"02 28, 2014",268
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",the product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013",544
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",the primary job of this device is to block the...,5,It Does The Job Well,1377648000,"08 28, 2013",436
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",nice windscreen protects my mxl mic and preven...,5,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",206
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",this pop filter is great it looks and performs...,5,No more pops when I record my vocals.,1392940800,"02 21, 2014",159


## Frequency Check and Upsampling the Data

In [12]:
def prune_rating(rating):
    """Collapse a numeric rating into three classes around the midpoint 3.

    Returns 2 for ratings above 3, 0 for ratings below 3, and 1 for anything
    else.
    """
    if rating > 3:
        return 2
    if rating < 3:
        return 0
    return 1
# Derive the 3-class label from the star rating. prune_rating is passed
# directly; wrapping it in a lambda added nothing.
df["pruned_rating"] = df["overall"].apply(prune_rating)

In [13]:
# Number of rows per pruned_rating class, as a plain dict.
frequency = df.groupby(["pruned_rating"]).size().to_dict()
# Pick the most frequent class together with its row count in one pass.
max_class, max_value = max(frequency.items(), key=lambda item: item[1])

In [12]:
frequency

{0: 467, 1: 772, 2: 9015}

In [14]:
# Balance the classes by oversampling: the largest class is kept unchanged and
# every other class is sampled with replacement up to max_value rows.
# NOTE(review): if a train/test split is made from this frame, copies of the
# same review can land on both sides — confirm where the split happens.
upsample_list = []
for key, value in frequency.items():
    if key == max_class:
        upsample_list.append(df[df["pruned_rating"] == key])
    else:
        # A fixed random_state keeps the drawn rows the same on every run.
        upsample_list.append(
            df[df["pruned_rating"] == key].sample(max_value, replace=True, random_state=103)
        )
upsampled_df = pd.concat(upsample_list)
# Renumber the rows 0..n-1; sampling with replacement repeats source index labels.
upsampled_df = upsampled_df.reset_index(drop=True)
# Shuffle the row order. The permutation is drawn from NumPy's global RNG, so
# it depends on the seed set at the top of the notebook and on any draws made
# since then.
upsampled_df = upsampled_df.reindex(np.random.permutation(upsampled_df.index))

In [15]:
upsampled_df.head(10)

Unnamed: 0,_id,asin,reviewer_name,helpful,review_text,overall,summary,unix_review_time,review_time,text_len,pruned_rating
26115,A1ZRXGT8QJXGET,B005FKF3L6,"Bob Denton ""BDAZ""","[0, 0]",easily installed on my parlor guitar the oval ...,5,Well made and good price,1382313600,"10 21, 2013",127,2
21097,AOY459LVUBKLO,B0002OOMW6,Confidential Name,"[2, 2]",so far still made in america i have been using...,5,Great Product!,1390521600,"01 24, 2014",584,2
1924,A34WEXT7SIRFE4,B000ULAP4U,@ciscoza,"[9, 14]",these cans were decent back when they were jus...,2,Really harsh and uncomfortable.,1328227200,"02 3, 2012",1158,0
20106,A2N7F3MVCTAOYP,B0002GLCRC,Ronnie Roper,"[0, 0]",i had purchased one a while back for my electr...,5,Great sturdy guitar hanger,1346284800,"08 30, 2012",202,2
26836,A1LH6RF4UN9VI6,B00B1N06PO,"E. D. Garcia ""Biggie-E""","[6, 7]",i ve tried these out in head to head excuse th...,5,A Good Value!,1365897600,"04 14, 2013",717,2
16461,A3MMISYL1GBXH4,B0002GYW4C,D.T.,"[0, 0]",compared to ernie ball s cloth it s good enoug...,3,Nice cloth,1395273600,"03 20, 2014",226,1
16767,A3RFWANBUVX4RA,B000JRPYGE,Arik Burke,"[1, 5]",when given a budget for cheap microphones the ...,3,"its pretty, but not great",1364774400,"04 1, 2013",673,1
5053,AKHWZ3S1UVZAO,B0002D0CEO,Hagen LeBray,"[0, 0]",i ve been playing guitar for over 40 years and...,2,"Not the Worst, but Close",1402358400,"06 10, 2014",232,0
1610,A2F92AOWTIUIB,B0002H04NE,StudioDude,"[0, 0]",through my ampeg bass amp the low e string get...,2,"Low E string gets fuzzy fast, Other strings bl...",1394755200,"03 14, 2014",143,0
20590,A3G40OIPT4I040,B0002H0A3S,Hyun,"[0, 0]",far as i can tell good strings they do the job...,5,good strings,1396483200,"04 3, 2014",118,2


In [16]:
upsampled_df.groupby(["pruned_rating"]).size().to_dict()

{0: 9015, 1: 9015, 2: 9015}

In [16]:
# Write the balanced reviews out as the model feed. The file name embeds
# today's date, so running this on another day produces a different file.
# NOTE(review): the classes were balanced on pruned_rating, yet the raw
# `overall` score is the column exported — confirm that is what the consumer
# of this file expects.
upsampled_df.loc[:, ["_id", "review_text", "overall"]].to_csv(
    "../datalake/feed/version_{}.csv".format(str(dt.today())),
    index=False,
    encoding='utf-8',
)

# Model Training

In [18]:
import spacy

In [19]:
nlp = spacy.load('en_core_web_md')

In [20]:
def tokenizer_lemma(sentence):
    """Run ``sentence`` through the loaded spaCy pipeline and return the
    lemma of every token as a list of strings."""
    parsed = nlp(sentence)
    lemmas = []
    for tok in parsed:
        lemmas.append(tok.lemma_)
    return lemmas

In [21]:
def tokenizer(sentence):
    """Split ``sentence`` on whitespace and return the resulting tokens.

    Parameters
    ----------
    sentence : str
        Text to tokenise.

    Returns
    -------
    list of str
        Whitespace-separated tokens; an empty list for empty or blank input.
    """
    # The body previously referenced an undefined name `text`, which raised
    # NameError on every call; the parameter is `sentence`.
    return sentence.split()

In [24]:
# English stop words from spaCy as a list. sorted() gives the list the same
# order on every run; iterating the underlying collection directly does not
# guarantee a stable order.
stop = sorted(spacy.lang.en.stop_words.STOP_WORDS)

## Load the Data

# Test Code

#### Bag Of Words

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
count = CountVectorizer()

In [20]:
# Three toy documents. The third is one string written across two source
# lines; the parentheses make that concatenation explicit so it is not read as
# a missing comma between two array elements, and the space after "sweet,"
# keeps the joined sentence well-formed.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    ('The sun is shining, the weather is sweet, '
     'and one and one is two'),
])

In [22]:
bag = count.fit_transform(docs)

In [24]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [25]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]])

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

In [28]:
np.set_printoptions(precision=2)

In [29]:
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [30]:
## Token and Lemma

In [32]:
import spacy
nlp = spacy.load('en_core_web_md')

In [45]:
# Sample text for the sentence-segmentation demo. The last word is singular
# ("a rockstar") so the input agrees with the sentence list displayed below.
about_text = 'Gus Proto was a Python developer while ago. But now going to be a rockstar'

In [46]:
about_doc = nlp(about_text)

In [43]:
list(about_doc.sents)

[Gus Proto was a Python developer while ago., But now going to be a rockstar]

In [48]:
def tokenizer(sentence):
    """Return the spaCy lemma of each token in ``sentence``, punctuation
    included, as a list of strings."""
    return [tok.lemma_ for tok in nlp(sentence)]

In [50]:
tokenizer("A letter has been written, asking him to be released!")

['a',
 'letter',
 'have',
 'be',
 'write',
 ',',
 'ask',
 '-PRON-',
 'to',
 'be',
 'release',
 '!']