In [4]:
import nltk
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from transformers import BertTokenizer
from multiprocessing import Pool, TimeoutError
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy
from keras.utils.np_utils import to_categorical  

In [5]:
# Read in the data
reviews = pd.read_csv('Reviews.csv')
reviews = reviews.sample(frac=0.2)

In [6]:
# Clean the data
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
reviews_without_stopwords = [word for word in reviews['Text'].str.split() if word not in stop_words]

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
punctuation = list(string.punctuation)

In [8]:
# Set everything to lower case and remove punctuation
reviews_without_stopwords_or_punctuation = []
# reviews_without_stopwords_or_punctuation_full_list = []
for st in reviews_without_stopwords:
	tmp = []
	for word in st:
		if word not in punctuation:
			tmp.append(word.lower())
			# reviews_without_stopwords_or_punctuation_full_list.append(word.lower())
	reviews_without_stopwords_or_punctuation.append(tmp)

In [9]:
# print(len(reviews_without_stopwords_or_punctuation))
print(reviews_without_stopwords_or_punctuation[0])
print(len(reviews_without_stopwords_or_punctuation))
# print(reviews['Score'])

['oh', 'man-', 'this', 'stuff', 'is', 'soooooo', 'good!', 'we', 'cook', 'it', 'up', 'on', 'the', 'stove,', 'it', 'is', 'exactly', 'like', 'the', 'movies.', 'in', 'fact,', 'i', 'never', 'drink', 'soda', 'pop', 'ever,', 'and', "it's", 'so', 'delicious', 'and', 'movie', 'theater', 'like', 'that', 'i', 'have', 'to', 'make', 'my', 'husband', 'go', 'grab', 'me', 'a', 'big', 'pepsi', 'from', 'the', 'gas', 'station', 'just', 'so', 'i', 'can', 'eat', 'it.', 'maybe', "that's", 'weird', 'or', 'maybe', 'that', 'tells', 'you', 'how', 'much', "it's", 'like', 'the', 'movies', 'lol.']
113691


In [10]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmatize_string(word_list):
	return ' '.join([lemmatizer.lemmatize(words) for words in word_list])

with Pool(processes=8) as pool:
	reviews_without_stopwords_or_punctuation_lemmatize = pool.map(lemmatize_string, reviews_without_stopwords_or_punctuation)

[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
def lemmatize_string_list(word_list):
	return [lemmatizer.lemmatize(words) for words in word_list]

with Pool(processes=8) as pool:
	reviews_without_stopwords_or_punctuation_lemmatize_list = pool.map(lemmatize_string_list, reviews_without_stopwords_or_punctuation)

In [11]:
reviews_without_stopwords_or_punctuation_lemmatize[1]

'the banana bread wa quite delicious. it even tasted great when you warmed it for 30 second in the microwave. i thought it wa going to be dry and bland. but it wa the quite opposite.'

In [12]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True, stop_words=None,
                     ngram_range=(1, 1), tokenizer=token.tokenize)
reviews_tokenized = cv.fit_transform(reviews_without_stopwords_or_punctuation_lemmatize)

In [13]:
print(reviews_tokenized.shape)

(113691, 59407)


In [14]:
review_score = [int(i) for i in reviews['Score']]

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(reviews_tokenized, review_score, test_size=0.2, random_state=0)

# Part 1

In [36]:


# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(reviews['Text'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 65693 unique tokens.


In [38]:
X = tokenizer.texts_to_sequences(reviews['Text'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (113691, 250)


In [66]:
Y = np.asarray(reviews['Score'])
y_numpy = numpy.array(reviews['Score'])
y_one_hot  = np.zeros((y_numpy.size, y_numpy.max()+1))
y_one_hot[np.arange(y_numpy.size),y_numpy] = 1

In [67]:
x_train_lstm, x_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X, y_one_hot, test_size=0.2, random_state=0)
print(x_train_lstm.shape,y_train_lstm.shape)
print(x_test_lstm.shape,y_test_lstm.shape)

(90952, 250) (90952, 6)
(22739, 250) (22739, 6)


In [68]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

In [71]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_9 (Dense)              (None, 6)                 606       
Total params: 5,081,006
Trainable params: 5,081,006
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
epochs = 5
batch_size = 64

history = model.fit(x_train_lstm, y_train_lstm, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5


In [13]:
# from sklearn.tree import DecisionTreeClassifier 
# from sklearn import metrics
# clf = DecisionTreeClassifier().fit(X_train, y_train)
# predicted= clf.predict(X_test)
# print("Decision Tree Accuracy:",metrics.accuracy_score(y_test, predicted))
# print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
# print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [14]:
# from sklearn.svm import SVC
# clf = SVC().fit(X_train, y_train)
# predicted= clf.predict(X_test)
# print("SVM Accuracy:",metrics.accuracy_score(y_test, predicted))
# print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
# print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

# Part 2

In [21]:
import gensim
from gensim.models import Word2Vec
import gensim.downloader as gensim_api

In [23]:
embeddings = gensim_api.load("glove-twitter-25")
evbeddings2 = gensim_api.load("word2vec-google-news-300")



In [26]:
reviews_without_stopwords_or_punctuation_lemmatize[0]

"oh man- this stuff is soooooo good! we cook it up on the stove, it is exactly like the movies. in fact, i never drink soda pop ever, and it's so delicious and movie theater like that i have to make my husband go grab me a big pepsi from the gas station just so i can eat it. maybe that's weird or maybe that tell you how much it's like the movie lol."

In [25]:
model_vector = (np.mean([evbeddings2[token] for token in reviews_without_stopwords_or_punctuation_lemmatize], axis=0)).tolist()

KeyError: "Key 'oh man- this stuff is soooooo good! we cook it up on the stove, it is exactly like the movies. in fact, i never drink soda pop ever, and it's so delicious and movie theater like that i have to make my husband go grab me a big pepsi from the gas station just so i can eat it. maybe that's weird or maybe that tell you how much it's like the movie lol.' not present"

# Part 3

In [3]:
# Using bert
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [16]:
reviews_tokenized_bert = tokenizer.batch_encode_plus(reviews_without_stopwords_or_punctuation_lemmatize, max_length=512, pad_to_max_length=True, return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
print(reviews_tokenized_bert['input_ids'])

tensor([[ 101, 1045, 2031,  ...,    0,    0,    0],
        [ 101, 1045, 2109,  ...,    0,    0,    0],
        [ 101, 2023, 1037,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2066,  ...,    0,    0,    0],
        [ 101, 1045, 2293,  ...,    0,    0,    0],
        [ 101, 1045, 4156,  ...,    0,    0,    0]])


In [18]:
print(reviews_tokenized_bert.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews_tokenized_bert['input_ids'], review_score, test_size=0.2, random_state=0)

In [20]:
from sklearn import metrics
from sklearn.svm import SVC
clf = SVC().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("SVM Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Linear Support Vector Machine
from sklearn import metrics
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=1000).fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Linear Support Vector Machine Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

Linear Support Vector Machine Accuracy: 0.4991864198073794
Precision: 0.21536380155123752
Recall: 0.21034048696583216


