In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.utils import resample
from string import punctuation
import spacy
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
reviews_raw = pd.read_json('reviews_Sports_And_Outdoors_5.JSON', lines=True)
pd.DataFrame.from_dict(reviews_raw, orient='columns')
reviews_raw.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1881509818,"[0, 0]",5,This came in on time and I am veru happy with ...,"01 26, 2014",AIXZKN4ACSKI,David Briner,Woks very good,1390694400
1,1881509818,"[1, 1]",5,I had a factory Glock tool that I was using fo...,"02 2, 2012",A1L5P841VIO02V,Jason A. Kramer,Works as well as the factory tool,1328140800
2,1881509818,"[2, 2]",4,If you don't have a 3/32 punch or would like t...,"02 28, 2012",AB2W04NI4OEAD,J. Fernald,"It's a punch, that's all.",1330387200
3,1881509818,"[0, 0]",4,This works no better than any 3/32 punch you w...,"02 5, 2012",A148SVSWKTJKU6,"Jusitn A. Watts ""Maverick9614""",It's a punch with a Glock logo.,1328400000
4,1881509818,"[0, 0]",4,I purchased this thinking maybe I need a speci...,"04 23, 2013",AAAWJ6LW9WMOO,Material Man,"Ok,tool does what a regular punch does.",1366675200
5,1881509818,"[0, 0]",5,"Needed this tool to really break down my G22, ...","11 2, 2012",A2XX2A4OJCDNLZ,RatherLiveInKeyWest,Glock punch tool - needed for your Glock and o...,1351814400
6,1881509818,"[0, 0]",5,If u don't have it .. Get it. All you need to ...,"06 10, 2014",A283UOBQRUNM4Q,Thomas Dragon,Great tool,1402358400
7,2094869245,"[0, 0]",4,This light will no doubt capture the attention...,"08 31, 2013",AWG3H90WVZ0Z1,Alec Nelson,Bright!,1377907200
8,2094869245,"[0, 1]",5,"Light and laser torch work well, very bright. ...","05 27, 2013",A3V52OTJHKIJZX,"A. Saenz Jr. ""Bettering self""",Be seen,1369612800
9,2094869245,"[0, 0]",5,Does everything it says it will do. I would li...,"11 2, 2013",A3SZBE5F3UQ9EC,"ChasRat ""ChasRat""",Bicycle rear tail light,1383350400


In [3]:
null_count = reviews_raw.isnull().sum()
null_count[null_count>0]

reviewerName    1402
dtype: int64

In [4]:
df2 = reviews_raw.drop(['reviewerID', 'reviewerName', 'asin', 'helpful', 'reviewTime', 'summary', 
                       'unixReviewTime'], 1)
threshold = 4
df2['Sentiment'] = np.where(df2['overall'] >= threshold, 1,0)
df2 = df2.drop(['overall'], 1)

df2.head(10)

Unnamed: 0,reviewText,Sentiment
0,This came in on time and I am veru happy with ...,1
1,I had a factory Glock tool that I was using fo...,1
2,If you don't have a 3/32 punch or would like t...,1
3,This works no better than any 3/32 punch you w...,1
4,I purchased this thinking maybe I need a speci...,1
5,"Needed this tool to really break down my G22, ...",1
6,If u don't have it .. Get it. All you need to ...,1
7,This light will no doubt capture the attention...,1
8,"Light and laser torch work well, very bright. ...",1
9,Does everything it says it will do. I would li...,1


In [5]:
df2['Sentiment'].value_counts()

1    253017
0     43320
Name: Sentiment, dtype: int64

In [6]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

for review in df2['reviewText']:
    review = text_cleaner(review)

df2.head(10)

Unnamed: 0,reviewText,Sentiment
0,This came in on time and I am veru happy with ...,1
1,I had a factory Glock tool that I was using fo...,1
2,If you don't have a 3/32 punch or would like t...,1
3,This works no better than any 3/32 punch you w...,1
4,I purchased this thinking maybe I need a speci...,1
5,"Needed this tool to really break down my G22, ...",1
6,If u don't have it .. Get it. All you need to ...,1
7,This light will no doubt capture the attention...,1
8,"Light and laser torch work well, very bright. ...",1
9,Does everything it says it will do. I would li...,1


In [7]:
positive = df2.loc[df2['Sentiment'] == 1]
negative = df2.loc[df2['Sentiment'] == 0]
df3 = resample(positive, n_samples=433)
negative = resample(negative, n_samples=433)
df3 = df3.append(negative)
df3['Sentiment'].value_counts()

1    433
0    433
Name: Sentiment, dtype: int64

In [8]:
#Turn the data sets into massive lists of words
positivelist = []
negativelist = []

positivelist = positive['reviewText'].astype(str).values.tolist()
negativelist = negative['reviewText'].astype(str).values.tolist()
print(positivelist[0:4])

['This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy', "I had a factory Glock tool that I was using for my Glock 26, 27, and 17.  I've since lost it and had needed another.  Since I've used Ghost products prior, and know that they are reliable, I had decided to order this one.  Sure enough, this is just as good as a factory tool.", "If you don't have a 3/32 punch or would like to have one in your Glock bag, this is okay.  The butt end of it is handy for pushing pins back in place.  If you already have a 3/32 punch and don't need another, don't both with this one.", 'This works no better than any 3/32 punch you would find at the hardware store. Actually, I think you would be better with a regular punch as it has more to hold on to.']


In [9]:
X_trainp, X_testp = train_test_split(positivelist, test_size=0.4)
X_trainn, X_testn = train_test_split(negativelist, test_size=0.4)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
positive_tfidf=vectorizer.fit_transform(positivelist)
print("Number of features: %d" % positive_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(positive_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
print(n)
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_trainp[0:1])
print('Tf_idf vector:', tfidf_bypara[0:1])

Number of features: 51833
151810
Original sentence: ['I have not used it against anyone but I tested it a few times and it scared the heck out of my wife and dogs.  Maybe just the sound would keep someone away.  I thought of using it while bike riding to deter dogs from chasing me... maybe the sound would stop them.  (I would not use it on an animal.)']
Tf_idf vector: [{'certian': 0.54241160670552235, 'airsoft': 0.28209902523397412, 'jam': 0.33101570999819041, 'loader': 0.28300663621469097, 'loads': 0.30044673643482611, 'speed': 0.23459692700268572, 'guns': 0.22930529222487431, 'dont': 0.2469914447103751, 'easier': 0.20354723863647481, 'hand': 0.17908549224481587, 'way': 0.1580494137556413, 'hold': 0.17280755705739245, 'works': 0.13726697996021148, 'know': 0.17455256351304402}]


In [None]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(260)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
positive_by_component=pd.DataFrame(X_train_lsa,index=X_trainp)
for i in range(5):
    print('Component {}:'.format(i))
    print(positive_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 26.4931487158
Component 0:
I bought this from a local Army-Navy store to take on a canoeing/camping trip in Arkansas.  I read the reviews on Amazon, but bought it at the store cause it was a couple bucks cheaper.  The main reason I bought this set is because it's stainless, not aluminum.  Several of my camping buddies used aluminum and had a hard time cleaning it, especially cooking over open flames.  This set is also compact, and includes a cup with measurements so you know how much water to add to hot cocoa, oatmeal, etc.  Folds up nice, and fits in my pack.  There's enough room to put my utensils in it as well, so it's all together.  Lightweight, but also durable.  Some other sets have really weak handles, but this one is pretty strong.  The only thing I wish would be better is that when it's put up it would fit tighter and not bang around as much.  I didn't hike with it, so it wasn't that big of an issue.  All in all, a really nice set. 

Not only is this lantern darling, but BOY does it ever put out the lighting for you. I could not believe it.This little tiny LED light is a Power House. No kidding! In pitch dark, this little thing lights up the entire room.AND, it's a very big room too! No, not like a light bulb, but enough to see everything, I was going to put itin my purse, but now it's on my nightstand, and it's going to stay there. I will get another one too. I like it,You will too.    0.541735
extremely light weight and durable with a rubber mechanism to help secure your bottle in place over those pot holes.Easy in-out for bottles.                                                                                                                                                                                                                                                                                                                                  0.541735
The screw on chuck is the highlight of this pump. While it

In [None]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])


In [None]:
"""nlp = spacy.load('en')
positive = df3.loc[df3['Sentiment'] == 1]
negative = df3.loc[df3['Sentiment'] == 0]
positive.reviewText.to_csv('positive.txt', sep='\t', index=None)
positive = open('positive.txt', 'w')

negative.reviewText.to_csv('negative.txt', sep='\t', index=None)
negative = open('negative.txt', 'w')"""


In [None]:
"""
l = len(df3)
df3.index = range(l)
print(df3)
nlp = spacy.load('en')
def parsing_func(review):
    l = len(review)
    for x in range(l):
        review.loc[x] = nlp(review.loc[x])
parsing_func(df3['reviewText'])

def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
allwords = [word_frequencies(x) for x in df3['reviewText']]
allwords = [item[0] for item in Counter(allwords).most_common(2000)]
print(allwords)"""

In [None]:
"""# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    allwords = ''.join([c for c in text if c not in punctuation])
    reviews = allwords.split('\n')
    
    allwords = ' '.join(reviews)
    allwords = allwords.split()
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    print(allwords)
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
positivewords = bag_of_words(positive)
negativewords = bag_of_words(negative)

# Combine bags to create a set of unique words.
common_words = set(positivewords + negativewords)
print(common_words)"""

In [None]:
"""# Create our data frame with features. This can take a while to run.
word_counts = bow_features(alltexts, common_words)
word_counts.head()"""