In [24]:
# This code is for my NLP Udemy class, which can be found at:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python
# It is written in such a way that tells a story.
# i.e. So you can follow a thought process of starting from a
# simple idea, hitting an obstacle, overcoming it, etc.
# i.e. It is not optimized for anything.

# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

#nltk.download('wordnet')
#nltk.download('punkt')
wordnet_lemmatizer = WordNetLemmatizer()

# from http://www.lextek.com/manuals/onix/stopwords1.html
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

# note: an alternative source of stopwords
# from nltk.corpus import stopwords
# stopwords.words('english')

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('electronics/positive.review').read(), features="html5lib")
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read(), features="html5lib")
negative_reviews = negative_reviews.findAll('review_text')



# first let's just try to tokenize the text using nltk's tokenizer
# let's take the first review for example:
# t = positive_reviews[0]
# nltk.tokenize.word_tokenize(t.text)
#
# notice how it doesn't downcase, so It != it
# not only that, but do we really want to include the word "it" anyway?
# you can imagine it wouldn't be any more common in a positive review than a negative review
# so it might only add noise to our model.
# so let's create a function that does all this pre-processing for us

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t.strip() for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens


# create a word-to-index map so that we can create our word-frequency vectors later
# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

# shuffle the data and create train/test splits
# try it multiple times!
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))


# let's look at the weights for each word
# try it with different threshold values!
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)


# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)



len(word_index_map): 10950
Train accuracy: 0.7810526315789473
Test accuracy: 0.78
unit -0.7383481336589115
bad -0.7877322804017851
cable 0.6386369703702667
time -0.6740185682653058
've 0.8824088083069019
month -0.7478166531317771
pro 0.5036887443666799
sound 0.9850423940500005
lot 0.6955088202129139
you 0.9644546487128478
n't -2.0234639738923597
easy 1.7361451696128114
quality 1.4807461752637985
company -0.5662423179919792
card -0.6765186326910336
item -0.973797457083548
wa -1.3798095607528533
perfect 0.9801152999885936
fast 0.9051039105582919
ha 0.7325330363125054
price 2.7196411311275184
value 0.5664022882754983
money -1.0024221001766995
memory 1.000372700386747
buy -0.9085934234621497
bit 0.6380038583672167
happy 0.6261369750196814
pretty 0.7037870093441773
doe -1.2402559341728723
highly 1.0540580041388727
recommend 0.6659907525484348
customer -0.6675459071894297
support -0.9417748045072738
little 0.988410256696512
returned -0.7907801047397637
excellent 1.3901758671072668
love 1.190

In [4]:
positive_reviews = BeautifulSoup(open('electronics/positive.review').read(), features="html5lib")
positive_reviews

<html><head></head><body><review>
<unique_id>
B00006HYUB:everyone_should_own_one:d._john_"looser"
</unique_id>
<asin>
B00006HYUB
</asin>
<product_name>
APC Back-UPS ES 500 Backup Battery and Surge Protector: Electronics
</product_name>
<product_type>
electronics
</product_type>
<helpful>
3 of 3
</helpful>
<rating>
5.0
</rating>
<title>
Everyone should own one
</title>
<date>
July 31, 2006
</date>
<reviewer>
D. John "Looser"
</reviewer>
<reviewer_location>
PA
</reviewer_location>
<review_text>
I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.

As always, Amazon had it to me in &lt;2 business 

In [5]:
positive_reviews = positive_reviews.findAll('review_text')
positive_reviews

[<review_text>
 I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.
 
 I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.
 
 As always, Amazon had it to me in &lt;2 business days
 </review_text>,
 <review_text>
 I ordered 3 APC Back-UPS ES 500s on the recommendation of an employee of mine who used to work at APC. I've had them for about a month now without any problems. They've functioned properly through a few unexpected power interruptions. I'll gladly order more if the need arises.
 
 Pros:
  - Large plug spacing, good for power adapters
  - Simple design
  - Long cord
 
 Cons:
  - No line conditioning (usually an expensive option
 </review_

In [7]:
positive_tokenized

[['purchased',
  'this',
  'unit',
  'due',
  'frequent',
  'blackout',
  'power',
  'supply',
  'bad',
  'run',
  'cable',
  'modem',
  'router',
  'lcd',
  'monitor',
  'minute',
  'this',
  'time',
  'save',
  'shut',
  'equally',
  'electronics',
  'receiving',
  'clean',
  'power',
  'feel',
  'this',
  'investment',
  'minor',
  'compared',
  'loss',
  'valuable',
  'data',
  'failure',
  'equipment',
  'due',
  'power',
  'spike',
  'irregular',
  'power',
  'supply',
  'amazon',
  'business',
  'day'],
 ['apc',
  'back-ups',
  '500',
  'recommendation',
  'employee',
  'mine',
  'apc',
  "'ve",
  'month',
  "'ve",
  'functioned',
  'properly',
  'unexpected',
  'power',
  'interruption',
  "'ll",
  'gladly',
  'arises',
  'pro',
  'plug',
  'spacing',
  'power',
  'adapter',
  'simple',
  'design',
  'cord',
  'con',
  'line',
  'conditioning',
  'usually',
  'expensive',
  'option'],
 ['wish',
  'unit',
  'separate',
  'online/offline',
  'light',
  'power',
  'unit',
  'missi

In [18]:
tokens

['bought',
 'this',
 'easy',
 'transfer',
 'picture',
 'digital',
 'camera',
 'memory',
 'card',
 'home',
 'sometimes',
 'people',
 'memory',
 'card',
 'memory',
 'stick',
 'wa',
 'disappointed',
 'flimsy',
 'plastic',
 'design',
 'size',
 'doesnt',
 'read',
 'card',
 'menetioned',
 'people',
 'review',
 'hard',
 'insert',
 'card',
 'scared',
 'card',
 'scratch',
 'ruined',
 'whenever',
 'wish',
 'bought',
 'this',
 'reading',
 'amazon',
 'review',
 '...',
 'useless',
 'lost',
 'card',
 'this',
 'scared',
 'frequent',
 'flimsy',
 'design']

In [6]:
word_index_map

{'purchased': 0,
 'this': 1,
 'unit': 2,
 'due': 3,
 'frequent': 4,
 'blackout': 5,
 'power': 6,
 'supply': 7,
 'bad': 8,
 'run': 9,
 'cable': 10,
 'modem': 11,
 'router': 12,
 'lcd': 13,
 'monitor': 14,
 'minute': 15,
 'time': 16,
 'save': 17,
 'shut': 18,
 'equally': 19,
 'electronics': 20,
 'receiving': 21,
 'clean': 22,
 'feel': 23,
 'investment': 24,
 'minor': 25,
 'compared': 26,
 'loss': 27,
 'valuable': 28,
 'data': 29,
 'failure': 30,
 'equipment': 31,
 'spike': 32,
 'irregular': 33,
 'amazon': 34,
 'business': 35,
 'day': 36,
 'apc': 37,
 'back-ups': 38,
 '500': 39,
 'recommendation': 40,
 'employee': 41,
 'mine': 42,
 "'ve": 43,
 'month': 44,
 'functioned': 45,
 'properly': 46,
 'unexpected': 47,
 'interruption': 48,
 "'ll": 49,
 'gladly': 50,
 'arises': 51,
 'pro': 52,
 'plug': 53,
 'spacing': 54,
 'adapter': 55,
 'simple': 56,
 'design': 57,
 'cord': 58,
 'con': 59,
 'line': 60,
 'conditioning': 61,
 'usually': 62,
 'expensive': 63,
 'option': 64,
 'wish': 65,
 'separate':

In [9]:
orig_reviews

['\nNOTE TO EVERYONE WHO IS HAVING PROBLEMS WITH THIS CARD--I reformatted it in my camera and it works fine now!!\nInitially I wrote:\nI was very disappointed in the shot-to-shot speed with this card (using a Minolta Xg).  The Panasonic card that came with the camera, which I thought would be cheap, is THREE TIMES FASTER than this "upgrade".  This card is useless to me as it cripples the camera I bought it to work in.  It takes FOURTEEN SECONDS from one shot to the next with this card.  With the original card, it\'s less than four seconds.  The claim that this is a "high speed" card is dishonest.  This is the slowest card I\'ve ever used.  I will throw this card away rather than curse someone else with it. \n',
 "\nI have been going nuts trying to find a decent set of headphones for my new iPod.  All I wanted was decent sound for a decent price, but after trying various models of ear-buds and standard phones and returning them all I figured I was going to have to shell out some big buc

In [13]:
rrr=np.zeros((5,6))
rrr

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [14]:
data

array([[0.        , 0.15686275, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.03333333, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.05154639, 0.01030928, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.025     , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.04      , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [15]:
X

array([[0.        , 0.15686275, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03333333, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.05154639, 0.01030928, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.025     , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04      , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [16]:
Ytrain

array([0., 1., 0., ..., 1., 1., 1.])

In [23]:
s='then '
s.strip()

'then'

In [25]:
KS=my_tokenizer("Don't Do This if you like it. Don't ever Don't eat because Do Not is not a Do and DO because Don't")
KS

["n't", 'this', 'you', "n't", "n't", 'eat', "n't"]

In [26]:
kd=nltk.tokenize.word_tokenize("Don't Do This if you like it. Don't ever Don't eat because Do Not is not a Do and DO because Don't")
kd

['Do',
 "n't",
 'Do',
 'This',
 'if',
 'you',
 'like',
 'it',
 '.',
 'Do',
 "n't",
 'ever',
 'Do',
 "n't",
 'eat',
 'because',
 'Do',
 'Not',
 'is',
 'not',
 'a',
 'Do',
 'and',
 'DO',
 'because',
 'Do',
 "n't"]

In [204]:
### My Code 

In [249]:

import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#nltk.download('wordnet')
#nltk.download('punkt')
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

#corpus = []

# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('electronics/positive.review').read(), features="html5lib")
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read(), features="html5lib")
negative_reviews = negative_reviews.findAll('review_text')
stopwords=set(stopwords.words('english'))

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t.strip() for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    #tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens=set(tokens)
    tokens = [ps.stem(word) for word in tokens ]
    tokens = [t for t  in tokens if not t in stopwords]
    #tokens = [ps.stem(word) for word in tokens]
    #tokens = [t for t  in tokens if not t in set(stopwords.words('english'))]
    
    tokens = ' '.join(tokens)
    #corpus.append(tokens)
    return tokens

#word_index_map = {}
current_index = 0

import pandas as pd
dataset = pd.DataFrame(columns=['review', 'token'])
orig_reviews = []
counter=0

for review in positive_reviews:
    
    tokens = my_tokenizer(review.text)
    orig_reviews.append([tokens,1])
    #dataset.loc[counter]=[tokens,1]
    #counter=counter+1


for review in negative_reviews:
    
    tokens = my_tokenizer(review.text)
    orig_reviews.append([tokens,0])
    #dataset.loc[counter]=[tokens,0]
    #counter=counter+1

dataset=pd.DataFrame(orig_reviews, columns=['review', 'token'])
dataset = shuffle(dataset)
dataset






Unnamed: 0,review,token
1808,belkin come listen like good transmitt never a...,0
916,would broken advert bit like good thi back hea...,1
1469,buy devic would devic seem charger thi better ...,0
942,applic well would purchas tri perfectli sole f...,1
1883,steer game good slightest call clear subwoof p...,0
...,...,...
1379,onli becaus wear day anyway real walk caus lik...,0
112,blank need veri anyon product arriv good condi...,1
46,onli digit would len held buckl like think sec...,1
1732,oper would differ peopl special conveni confus...,0


In [256]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(dataset.iloc[:, 0].values).toarray()
y = dataset.iloc[:, 1].values
y=y.astype('int')


#from sklearn.decomposition import TruncatedSVD
#svd = TruncatedSVD()
#X = svd.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
    
model = LogisticRegression()
model.fit(X_train, y_train)
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


TypeError: 'module' object is not callable

In [251]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:11<00:00,  2.38s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.83,0.83,0.83,0.83,0.3
NearestCentroid,0.82,0.82,0.82,0.82,0.21
XGBClassifier,0.81,0.81,0.81,0.81,7.33
NuSVC,0.81,0.81,0.81,0.81,8.07
ExtraTreesClassifier,0.81,0.81,0.81,0.81,2.51
RandomForestClassifier,0.8,0.8,0.8,0.8,1.36
LGBMClassifier,0.79,0.79,0.79,0.79,0.78
SVC,0.79,0.79,0.79,0.79,7.53
LogisticRegression,0.78,0.78,0.78,0.78,0.38
CalibratedClassifierCV,0.78,0.78,0.78,0.78,17.51


In [206]:
import pandas as pd
da = pd.DataFrame(columns=['review', 'token'])
da.loc[0]=[1,2]
da

Unnamed: 0,review,token
0,1,2
