# LDA 
[Latent Dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) is a technique by which 

In [203]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim import corpora, models
from pprint import pprint
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, Conv1D, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

data = pd.read_csv("data/Reviews.csv", nrows=10000)
data_text = pd.DataFrame(data["Text"])
data_text['index'] = data_text.index
data_text.head()
stemmer = SnowballStemmer('english')

In [2]:
len(data_text)

10000

In [3]:
def lemmatize_and_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos = 'v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_and_stem(token))
    return result

In [4]:
processed_docs = data_text["Text"].map(preprocess)
processed_docs[:5]

0    [buy, vital, can, food, product, good, qualiti...
1    [product, arriv, label, jumbo, salt, peanut, p...
2    [confect, centuri, light, pillowi, citrus, gel...
3    [look, secret, ingredi, robitussin, believ, ad...
4    [great, taffi, great, price, wide, assort, yum...
Name: Text, dtype: object

In [5]:
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

## Bag of Words

In [6]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[:2]

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 3),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(11, 2),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1)]]

In [7]:
bow_doc_0 = bow_corpus[0]

for i in range(len(bow_doc_0)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_0[i][0], 
                                                     dictionary[bow_doc_0[i][0]], 
                                                     bow_doc_0[i][1]))

Word 0 ("appreci") appears 1 time.
Word 1 ("better") appears 2 time.
Word 2 ("buy") appears 1 time.
Word 3 ("can") appears 1 time.
Word 4 ("finicki") appears 1 time.
Word 5 ("food") appears 1 time.
Word 6 ("good") appears 1 time.
Word 7 ("like") appears 1 time.
Word 8 ("look") appears 1 time.
Word 9 ("meat") appears 1 time.
Word 10 ("process") appears 1 time.
Word 11 ("product") appears 3 time.
Word 12 ("qualiti") appears 1 time.
Word 13 ("smell") appears 1 time.
Word 14 ("stew") appears 1 time.


In [126]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}\n".format(idx, topic))

Topic: 0 
Words: 0.043*"coffe" + 0.026*"flavor" + 0.020*"tast" + 0.018*"like" + 0.013*"good" + 0.012*"love" + 0.011*"drink" + 0.011*"product" + 0.009*"great" + 0.009*"tri"

Topic: 1 
Words: 0.020*"like" + 0.016*"tast" + 0.014*"product" + 0.014*"good" + 0.012*"great" + 0.010*"love" + 0.009*"flavor" + 0.008*"food" + 0.007*"time" + 0.007*"order"



## TF-IDF

In [127]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=2, id2word=dictionary, passes=2, workers=4)

In [128]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}\n'.format(idx, topic))

Topic: 0 
Word: 0.012*"coffe" + 0.009*"flavor" + 0.009*"tast" + 0.007*"good" + 0.006*"product" + 0.005*"like" + 0.005*"vanilla" + 0.005*"love" + 0.005*"tri" + 0.005*"drink"

Topic: 1 
Word: 0.017*"coffe" + 0.008*"flavor" + 0.007*"like" + 0.007*"great" + 0.007*"love" + 0.006*"cup" + 0.006*"tast" + 0.006*"order" + 0.005*"good" + 0.005*"product"



# Testing Models

In [129]:
for index, score in sorted(lda_model[bow_corpus[45]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic {}: {}".format(score, index, lda_model.print_topic(index, 10)))


Score: 0.8525947332382202	 
Topic 1: 0.020*"like" + 0.016*"tast" + 0.014*"product" + 0.014*"good" + 0.012*"great" + 0.010*"love" + 0.009*"flavor" + 0.008*"food" + 0.007*"time" + 0.007*"order"

Score: 0.14740528166294098	 
Topic 0: 0.043*"coffe" + 0.026*"flavor" + 0.020*"tast" + 0.018*"like" + 0.013*"good" + 0.012*"love" + 0.011*"drink" + 0.011*"product" + 0.009*"great" + 0.009*"tri"


In [130]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic {}: {}".format(score, index, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9146600961685181	 
Topic 1: 0.017*"coffe" + 0.008*"flavor" + 0.007*"like" + 0.007*"great" + 0.007*"love" + 0.006*"cup" + 0.006*"tast" + 0.006*"order" + 0.005*"good" + 0.005*"product"

Score: 0.08533991128206253	 
Topic 0: 0.012*"coffe" + 0.009*"flavor" + 0.009*"tast" + 0.007*"good" + 0.006*"product" + 0.005*"like" + 0.005*"vanilla" + 0.005*"love" + 0.005*"tri" + 0.005*"drink"


In [152]:
classes = []
for text in bow_corpus:
    index, score = sorted(lda_model[text], key=lambda tup: -1*tup[1])[0]
    classes.append(index)
        

# Sentiment Analysis using Topics

In [167]:
full_text = [" ".join(doc) for doc in processed_docs]

In [168]:
X_train, X_test, y_train, y_test = train_test_split(full_text, classes, test_size=0.20, random_state=42)

In [169]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [170]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [172]:
embeddings_dictionary = dict()
size = 200
glove_file = open('glove.6B/glove.6B.'+str(size)+'d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [173]:
embedding_matrix = np.zeros((vocab_size, size))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Sentiment Analysis

Use neural networks to predict the sentiment of bodies of text. These are across two topics which were generated using LDA.

## Dense NN

In [174]:
model = Sequential()
embedding_layer = Embedding(vocab_size, size, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [175]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 100, 200)          2046400   
_________________________________________________________________
flatten_9 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 20001     
Total params: 2,066,401
Trainable params: 20,001
Non-trainable params: 2,046,400
_________________________________________________________________
None


In [176]:
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Convolutional NN

In [177]:
model = Sequential()

embedding_layer = Embedding(vocab_size, size, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 100, 200)          2046400   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 96, 128)           128128    
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 129       
Total params: 2,174,657
Trainable params: 128,257
Non-trainable params: 2,046,400
_________________________________________________________________


In [178]:
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Recurrant NN

In [200]:
model = Sequential()
embedding_layer = Embedding(vocab_size, size, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 100, 200)          2046400   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               168448    
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 129       
Total params: 2,214,977
Trainable params: 168,577
Non-trainable params: 2,046,400
_________________________________________________________________


In [201]:
es = EarlyStopping(monitor='val_loss', patience=3)

In [202]:
history = model.fit(X_train, y_train, batch_size=64, epochs=50, verbose=1, validation_split=0.2, callbacks=[es])

score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
