In [43]:
# Sentiment Analysis model - Daniel shalam

import tflearn
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [44]:
#import sys
#!{sys.executable} -m pip install keras

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [74]:
# Load the data.
# Amazon Kindle reviews downloaded from Kaggle - not the ideal corpus, but
# chosen because of time limitations. Only the first 200 rows are used.
dataset = pd.read_csv("kindle_reviews.csv")
df = dataset[:200]

# Reviews rated exactly 3 are removed; the remaining rows are labelled
# 1 when 'overall' > 3 and 0 otherwise.
# The explicit .copy() makes df an independent frame, so the column
# assignment below does not write into a slice of `dataset`
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['overall'] != 3].copy()
df['Positively Rated'] = np.where(df['overall'] > 3, 1, 0)

In [73]:
# Clean the review texts.
# Every review is reduced to letters only, lower-cased, stripped of the NLTK
# English stopwords and Porter-stemmed; the cleaned strings are collected in
# `corpus` in the same order as the rows of df.
nltk.download('stopwords')

# Loop-invariant helpers: build the stopword set and the stemmer once instead
# of once per word / per review.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column values instead of df['reviewText'][i]: once rows
# have been filtered out of df its index labels are no longer 0..len(df)-1,
# so a label lookup with a positional counter raises KeyError.
# Missing reviews are treated as empty text so re.sub always receives a str.
for raw_review in df['reviewText'].fillna('').astype(str):
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
# Data preprocessing.
# Neutral reviews (score 3) were already removed and the remaining scores
# mapped to the binary 'Positively Rated' label (4,5 -> positive,
# 1,2 -> negative), so the raw 'overall' column is no longer needed.
# TODO: with more time, train a model that classifies the individual score
# levels instead of a binary label.

# Drop incomplete rows *before* fitting the vectorizer, so it is fitted on
# exactly the rows used later and never receives a missing review.
# Reassigning (instead of inplace=True) avoids mutating a frame that may be a
# view of another one, and errors='ignore' keeps the cell re-runnable after
# 'overall' has already been removed.
df = df.dropna()
cols = ['overall']
df = df.drop(columns=cols, errors='ignore')

# TF-IDF turns each review into a vector of word weights: a word's weight
# grows with how often it appears in the review and shrinks (logarithmically)
# with the number of reviews that contain it.
tfidf = TfidfVectorizer()
tfidf.fit(df['reviewText'])


In [76]:
# Split the data into train and test sets.
# X holds the review texts, y the binary label derived from the score.
X = df['reviewText']
y = df['Positively Rated']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Report the size and class balance of each split.
split_reports = (
    ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive", X_train, y_train),
    ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive", X_test, y_test),
)
for template, texts, labels in split_reports:
    n_entries = len(texts)
    n_negative = len(texts[labels == 0])
    n_positive = len(texts[labels == 1])
    print(template.format(n_entries,
                          (n_negative / (n_entries * 1.)) * 100,
                          (n_positive / (n_entries * 1.)) * 100))


Train set has total 127 entries with 15.75% negative, 84.25% positive
Test set has total 43 entries with 23.26% negative, 76.74% positive


In [79]:
# Convert every review into a sequence of integer word indices and pad the
# sequences to one common length, so they can be fed to the Embedding layer.
# NOTE(review): the vocabulary cap is set to the number of reviews, which
# looks arbitrary - confirm this is intended.
max_fatures = len(df)

# `num_words` is the Keras 2 name of this argument (it was `nb_words` in
# Keras 1 and is rejected by current releases).
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['reviewText'].values)
X1 = tokenizer.texts_to_sequences(df['reviewText'].values)
X1 = pad_sequences(X1)

# One-hot encode the binary label: one column per class value.
Y1 = pd.get_dummies(df['Positively Rated']).values
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, random_state=42)
print(X1_train.shape, Y1_train.shape)
print(X1_test.shape, Y1_test.shape)

(127, 356) (127, 2)
(43, 356) (43, 2)


In [80]:
# Model architecture - an RNN with one Embedding, one LSTM and one Dense layer.
# The Embedding layer maps every word index of a padded review to an
# embed_dim-dimensional vector, so each review becomes a 2D
# (sequence length, embed_dim) matrix that the LSTM consumes.

embed_dim = 150
lstm_out = 200
model = Sequential()
# The Keras 1 `dropout` argument of Embedding is not supported by Keras 2, so
# it is omitted here. NOTE(review): if dropout on the embeddings is wanted,
# add a SpatialDropout1D layer right after this one.
model.add(Embedding(max_fatures, embed_dim, input_length=X1.shape[1]))
# Keras 2 argument names: `dropout` (inputs, formerly dropout_W) and
# `recurrent_dropout` (recurrent state, formerly dropout_U).
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two softmax outputs, matching the two one-hot label columns.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
model.summary()

  import sys
  


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 356, 150)          25500     
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 306,702
Trainable params: 306,702
Non-trainable params: 0
_________________________________________________________________
None


In [81]:
# Train the network on the padded sequences for 3 passes over the training set.
batch_size = 32
# `epochs` is the Keras 2 name of this argument (it was `nb_epoch` in Keras 1
# and is rejected by current releases).
model.fit(X1_train, Y1_train, epochs=3, batch_size=batch_size, verbose=2)

  


Epoch 1/3
 - 14s - loss: 0.6558 - accuracy: 0.8189
Epoch 2/3
 - 7s - loss: 0.4126 - accuracy: 0.8740
Epoch 3/3
 - 7s - loss: 0.4215 - accuracy: 0.8740


<keras.callbacks.callbacks.History at 0x1531ec9cda0>

In [82]:
# Evaluate on the held-out split; the model was compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(X1_test, Y1_test, batch_size=batch_size, verbose=2)
score, acc = evaluation
for template, value in (("score: %.2f", score), ("acc: %.2f", acc)):
    print(template % value)


score: 0.66
acc: 0.67


In [85]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath="model.h5")

In [None]:
def accuracy_summary(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training split and report its test accuracy.

    The pipeline is fitted with ``X_train``/``y_train``, the object returned
    by ``fit`` is used to predict ``X_test``, and the predictions are compared
    with ``y_test`` via ``accuracy_score``. The accuracy is printed as a
    percentage with two decimals.

    Returns:
        The value returned by ``accuracy_score`` (not multiplied by 100).
    """
    predictions = pipeline.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print("accuracy score: {0:.2f}%".format(test_accuracy * 100))
    return test_accuracy