In [1]:
# importing the libraries 

import nltk
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
# converting the words into the their base form by using the WordNetLemmatizer from nltk
word_lemmatizer = WordNetLemmatizer()
# removing the useless words which give us no inference, the words are present in th stopwords.txt
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

#using BeautifulSoup to parse through the Xml data and creating the positive reviews section
postive_reviews = BeautifulSoup(open('electronics/positive.review').read())
# only using the review text from the postive_reviews file
postive_reviews = postive_reviews.findAll('review_text')

# using BeautifulSoup to parse through the XML data and creating the negative_reviews
negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
# only using the review text from the negative_reviews file 
negative_reviews = negative_reviews.findAll('review_text')

In [3]:
# we need to ensure that there are equal no. of positive and negative reviews 
# so we need to shuffle the postive reviews and cut the extra 

# shuffling the positive reviews 
np.random.shuffle(postive_reviews)
# ensuring that the size of postive_reviews and negative_reviews is the same 
postive_reviews = postive_reviews[: len(negative_reviews)]


In [4]:
# finding out the number of words and what all words we have in our document 

# building our tokenizer 
def my_tokenizer(s):
    # firstly lower case the entire string so that capitlized and non capitalized words are not different
    s = s.lower()
    # seperating the words in the string and putting them in an array
    tokens = nltk.tokenize.word_tokenize(s)
    # removing all the words which are shorter than 2 letters because they obviously don't make any meaning
    tokens = [t for t in tokens if len(t)>2]
    # lemmatizing the words
    tokens = [word_lemmatizer.lemmatize(t) for t in tokens]
    # removing the stopwords from our dictionary 
    tokens = [t for t in tokens if t not in stopwords]
    # returning the final array
    return tokens 
    
    
word_index_map = {} # tells us about what all words we have 
current_index = 0 # tells us about the number of words we have 

# storing positive and negative tokenized strings in arrays 
postive_tokenized = []
negative_tokenized = []

# now we need to tokenize the string
# that is convert words in a string to words in an array 

# tokenizing the positive reviews and adding the words into the dictionary 
for review in postive_reviews:
    tokens = my_tokenizer(review.text)
    postive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            
# tokenizing the negative reviews and adding the words into the dictionary 
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
print(word_index_map)



In [5]:
# now we have to convert the tokens into vectors
# so that we can run we can use machine learning over it 

def tokens_to_vectors(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x/ x.sum()
    x[-1] = label
    return x 
N = len(postive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_index_map)+1))
i = 0
for tokens in postive_tokenized:
    xy = tokens_to_vectors(tokens, 1)
    data[i,:] = xy 
    i+=1
    
for tokens in negative_tokenized:
    xy = tokens_to_vectors(tokens, 0)
    data[i, :] = xy
    i+=1
    
np.random.shuffle(data

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [6]:
# making the machine learning model 

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))



ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0