# NLP Sentiment Analysis Practice

In [6]:
# nltk imports
import nltk

In [7]:
import string
import numpy as np
import os

basePath = os.path.abspath('') + "\\sentiment-analysis-nlp\\"
basePath

'D:\\projects\\sentiment-analysis-nlp\\'

In [8]:
file = basePath + "NLP\\sentiment_labelled_sentences\\full_set.txt"
with open(file) as f:
    content = f.readlines()

content[0:10]

['So there is no way for me to plug it in here in the US unless I go by a converter.\t0\n',
 'Good case, Excellent value.\t1\n',
 'Great for the jawbone.\t1\n',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!\t0\n',
 'The mic is great.\t1\n',
 'I have to jiggle the plug to get it to line up right to get decent volume.\t0\n',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.\t0\n',
 'If you are Razr owner...you must have this!\t1\n',
 'Needless to say, I wasted my money.\t0\n',
 'What a waste of money and time!.\t0\n']

## Preprocessing

### 1. Extracting sentences and creating labels from dataset

In [9]:
# Removing white spaces
content = [x.strip() for x in content]

# Separating sentences from labels
sentences = [x.split("\t")[0] for x in content]
labels = [x.split("\t")[1] for x in content]

In [10]:
sentences[0:10]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.']

In [11]:
labels[0:10]

['0', '1', '1', '0', '1', '0', '0', '1', '0', '0']

In [12]:
'''
Transforming the labels to go from -1 to 1, instead of 0 to 1
-1 represents negative, +1 represents positive
'''

y = np.array(labels, dtype='int8')
y = 2*y - 1
y

array([-1,  1,  1, ..., -1, -1, -1], dtype=int8)

### 2. Removing extras - stopwords, digits, punctuation

In [13]:
def remove_elements(x, removal_list):
    for z in removal_list:
        x = x.replace(z, ' ')
    return x

# Removing digits
digit_list = [str(x) for x in range(10)]
digits_removed = [remove_elements(x, digit_list) for x in sentences]

# Removing punctuations
punctuations_removed = [remove_elements(x, string.punctuation) for x in digits_removed]

# Converting to lower case and removing whitespaces
sentences = [x.lower() for x in punctuations_removed]
sentences = [x.strip() for x in sentences]

# Removing stopwords
def remove_stopwords(stopword, text):
    new_text = ' '.join([word for word in text.split() if word not in stopword])
    return new_text

# Defining our own set of stopwords
stop_set = ['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from']
preprocessed = [remove_stopwords(stop_set, x) for x in sentences]
preprocessed[0:10]

['so there is no way for me plug in here in us unless go by converter',
 'good case excellent value',
 'great for jawbone',
 'tied charger for conversations lasting more than minutes major problems',
 'mic is great',
 'have jiggle plug get line up right get decent volume',
 'if you have several dozen or several hundred contacts then imagine fun sending each them one by one',
 'if you are razr owner you must have this',
 'needless say wasted my money',
 'what waste money and time']

### Stemming (optional)

In [14]:
def porter_stemmer(words):
    porter = nltk.PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    return stemmed

stemmed_sentences = [porter_stemmer(words.split()) for words in preprocessed]
stemmed_sentences = [" ".join(i) for i in stemmed_sentences]
stemmed_sentences[0:10]

['so there is no way for me plug in here in us unless go by convert',
 'good case excel valu',
 'great for jawbon',
 'tie charger for convers last more than minut major problem',
 'mic is great',
 'have jiggl plug get line up right get decent volum',
 'if you have sever dozen or sever hundr contact then imagin fun send each them one by one',
 'if you are razr owner you must have thi',
 'needless say wast my money',
 'what wast money and time']

## 2. TD/IDF

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer(analyzer="word",
                             preprocessor=None,
                             stop_words='english',
                             max_features=6000,
                             ngram_range=(1,5))

date_features = vectorizer.fit_transform(preprocessed)
tfidf_transformer = TfidfTransformer()
data_features_tfidf = tfidf_transformer.fit_transform(date_features)
data_matrix = data_features_tfidf.toarray()
data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 3. Creating Training and Test Sets

In [16]:
np.random.seed(0)
test_index = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False),
                       np.random.choice((np.where(y==1))[0], 250, replace=False))
train_index = list(set(range(len(labels))) - set(test_index))

train_data = data_matrix[train_index,]
train_labels = y[train_index]

test_data = data_matrix[test_index,]
test_labels = y[test_index]

## 4. Finding Polarity and Subjectivity (TextBlob)

In [17]:
from textblob import TextBlob

## Creating polarity and subjectivity functions

polarity_function = lambda x: TextBlob(x).sentiment.polarity
subjectivity_function = lambda x: TextBlob(x).sentiment.subjectivity
polarity_list = [polarity_function(x) for x in preprocessed]
subjectivity_list = [subjectivity_function(x) for x in preprocessed]

In [18]:
polarity_list[0:10]

[0.0,
 0.85,
 0.8,
 0.1875,
 0.8,
 0.22619047619047616,
 0.09999999999999999,
 0.0,
 -0.35,
 -0.2]

In [19]:
subjectivity_list[0:10]

[0.0,
 0.8,
 0.75,
 0.3333333333333333,
 0.75,
 0.6011904761904762,
 0.06666666666666667,
 0.0,
 0.5,
 0.0]

## 5. Logistic Regression

In [20]:
from sklearn.linear_model import SGDClassifier

## Fitting classifier on training data
classifier = SGDClassifier(loss="log", penalty="none")
classifier.fit(train_data, train_labels)

## Pull out the parameters (w,b) of the logistic regression model
w = classifier.coef_[0, :]
b = classifier.intercept_

## Get predictions on training and test data
predictions_train = classifier.predict(train_data)
predictions_test = classifier.predict(test_data)

## Computing errors
error_training = np.sum((predictions_train > 0.0 ) != (train_labels > 0.0))
error_testing = np.sum((predictions_test > 0.0) != (test_labels > 0.0))

training_error = float(error_training) / len(train_labels)
testing_error = float(error_testing) / len(test_labels)

In [21]:
training_error

0.0136

In [22]:
testing_error

0.186

# 6. Finding words with strong influence

In [23]:
## Converting vocabulary into a list
vocab = np.array([z[0] for z in sorted(vectorizer.vocabulary_.items(), key=lambda x:x[1])])

## Getting indices by sorting w
indices = np.argsort(w)

## Words with large negative value
negative_indices = indices[0:50]
negative_words = [str(x) for x in list(vocab[negative_indices])]

positive_indices = indices[-49:-1]
positive_words = [str(x) for x in list(vocab[positive_indices])]

In [24]:
print(negative_words)

['worst', 'sucks', 'poor', 'bad', 'bland', 'disappointing', 'disappointment', 'failed', 'horrible', 'avoid', 'unfortunately', 'slow', 'return', 'rude', 'wasted', 'wasn', 'mediocre', 'flat', 'fly', 'junk', 'appealing', 'stupid', 'tasteless', 'doesn', 'average', 'ok', 'awful', 'terrible', 'dropped', 'ripped', 'garbage', 'disgusting', 'crap', 'mistake', 'sucked', 'waste', 'fails', 'torture', 'dirty', 'par', 'blah', 'pg', 'waste time', 'probably', 'hell', 'improvement', 'selection food', 'cheap', 'att', 'hour']


In [25]:
print(positive_words)

['really good', 'brilliant', 'treat', 'highly recommend', 'entertaining', 'performance', 'decor', 'reasonable', 'friendly', 'favorite', 'fall', 'plus', 'fast', 'joy', 'cast', 'predictable bad', 'haven', 'audio', 'definately', 'fun', 'rocks', 'incredible', 'fabulous', 'perfectly', 'fantastic', 'wonderful', 'best', 'hand', 'bacon', 'comfortable', 'amazing', 'assure', 'awesome', 'cool', 'cooked', 'won disappointed', 'pleased', 'liked', 'beautiful', 'works', 'interesting', 'excellent', 'delicious', 'enjoyed', 'perfect', 'loved', 'nice', 'love']


**Making a function to test on custom reviews**

In [26]:
list_reviews = [
    ["It's a sad movie but very good"],
    ["Waste of my time"],
    ["It is not what like"],
    ["It is not what I m looking for"]
]

polarity_index = {-1 : "negative",  0 : "neutral", 1 : "positive"}

def test_on_samples(my_classifier, list_of_reviews = list_reviews):
    sentiment_test_of_reviews = []
    for x in list_of_reviews:
        sentiment = my_classifier.predict(vectorizer.transform(x))[0]
        sentiment_test_of_reviews.append(polarity_index[sentiment])
    return sentiment_test_of_reviews

## 7. Naive Bayes

In [27]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB().fit(train_data, train_labels)

nb_predictions_test = nb_classifier.predict(test_data)
nb_error_testing = np.sum((nb_predictions_test > 0.0) != (test_labels > 0.0))
nb_error_testing = float(nb_error_testing) / len(test_labels)
nb_error_testing

0.178

In [28]:
## Testing the model trained of Naive Bayes

test_sentiments = test_on_samples(nb_classifier)
test_sentiments

['positive', 'negative', 'negative', 'positive']

## 8. SVM

In [29]:
svm_classifier = SGDClassifier(loss="hinge", penalty="l2")
svm_classifier.fit(train_data, train_labels)

svm_predict_test = svm_classifier.predict(test_data)
svm_error_test = np.sum((svm_predict_test > 0.0) != (test_labels > 0.0))
svm_error_test = float(svm_error_test) / len(test_labels)

svm_error_test

0.2

In [30]:
## Testing the model trained of SVM

test_sentiments = test_on_samples(svm_classifier, [["I would not recommend this movie"]])
test_sentiments

['positive']

## 9. LSTM Networks

In [31]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

max_review_length = 200

tokenizer = Tokenizer(num_words=10000,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)
tokenizer.fit_on_texts(preprocessed)

### Truncate and pad input sequences

X = tokenizer.texts_to_sequences(preprocessed)
X = sequence.pad_sequences(X, maxlen=max_review_length)
X.shape # Shape of data tensor

(3000, 200)

In [32]:
import pandas as pd

Y = pd.get_dummies(y).values
Y

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [33]:
## Generating testing and training data

np.random.seed(0)
test_indices = np.append(np.random.choice((np.where(y == -1))[0], 250, replace=False),
                         np.random.choice((np.where(y == 1))[0], 250, replace=False))
test_data = X[test_indices,]
test_labels = Y[test_indices]

training_indices = list(set(range(len(labels))) - set(test_indices))
training_data = X[training_indices,]
training_labels = Y[training_indices]

In [34]:
## Creating networks

EMBEDDING_DIM = 200
model = Sequential()
model.add(Embedding(10000, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(250, dropout=0.2, return_sequences=True))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          2000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 200)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 200, 250)          451000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               140400    
_________________________________________________________________
dense (Dense)                (None, 2)                 202       
Total params: 2,591,602
Trainable params: 2,591,602
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
model.fit(training_data, training_labels,
          epochs=2,
          batch_size=40,
          validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x18ac1b9b670>

In [36]:
loss, acc = model.evaluate(test_data, test_labels, verbose=2, batch_size=40)
print(f"loss: {loss}")
print(f"Validation accuracy: {acc}")

13/13 - 3s - loss: 0.4166 - accuracy: 0.8360
loss: 0.41656970977783203
Validation accuracy: 0.8360000252723694


In [37]:
outcome_label = ['Negative', 'Positive']
def decide_review(review):
    seq = tokenizer.texts_to_sequences([review])
    padded = sequence.pad_sequences(seq, maxlen=max_review_length)
    predict = model.predict(padded)
    print("Probability Distribution : ", predict)
    print(outcome_label[np.argmax(predict)])

In [38]:
decide_review("It is not what I am looking for")

Probability Distribution :  [[0.9425057  0.05749434]]
Negative


In [39]:
decide_review("This isn't what I am looking for")

Probability Distribution :  [[0.5133868  0.48661324]]
Negative


In [40]:
decide_review("This is perfect")

Probability Distribution :  [[0.03747116 0.9625288 ]]
Positive


In [41]:
decide_review("I want this now")

Probability Distribution :  [[0.1929583 0.8070417]]
Positive
