# Sentiment Analysis using twitter data

Simple Deep Neural Network, that predicts tweet sentiment. The model can be greatly improved using various <br/>
word vectorization techniques and Recurrent Neural Networks.


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
import tflearn as tflearn
from nltk.corpus import stopwords
import preprocessor as p
import re
import numpy as np

from sklearn import preprocessing
import tensorflow as tf
from tflearn.data_utils import to_categorical, pad_sequences


In [387]:

test_path = "./data/twitter_test.txt"
training_path = "./data/twitter_data.txt"

def all_stop_words():
    stop_words = stopwords.words('english')
    add_stopwords = [",", "*" , ")" , "(" ,".","theres","know","one","though","vinci","ive","da","book","im","went",
                    "potter","brokeback","mountain","harry","code","mission","impossible","movie","movies","i","ya",
                    "yet","yall"]
    for w in add_stopwords:
        stop_words.append(w)
    return stop_words


stop_words = all_stop_words()
def training_data_df(path):
    
    training_data = open(path,mode='r')
    training_array =[]
    for d in training_data:
        training_dict = {}
        sent_tweet_array = d.split('\t')
        training_dict['tweet'] = str(sent_tweet_array[1].lower())
        training_dict['cleaned_tweet'] =remove_stop_words(sent_tweet_array[1].lower())
        training_dict['positive'] =int(sent_tweet_array[0])
        training_array.append(training_dict)
    training_df = pd.DataFrame(training_array)
    return training_df

def sentiment(sentiment_prob):
    if sentiment_prob[0]>sentiment_prob[1]:
        return "positive"
    return "negative"

def remove_stop_words(tweet_text):
    tweet_text = re.sub(r'[?|$|.|!]',r'',tweet_text)
    tweet_text = re.sub(r'[^a-zA-Z0-9 ]',r'',tweet_text)
    result = ""
    for word in tweet_text.split():            
        result = result +" "+word.lower()

    return result.lstrip()

def get_word_frequency(tweet_list):
    word_dict = {} 
    result_array=[]
    for tw in tweet_list:
        for word in tw.split():
            if word in word_dict.keys():
                word_dict[word] = word_dict[word]+1
                continue
            word_dict[word] = 1
    for k in word_dict.keys():
        result_array.append({'word':k, 'count':word_dict[k]})
    return pd.DataFrame(result_array,columns=['count','word'])


def common_pos_words(df_data):
    return common_words_filter(df_data,1)

def common_neg_words(df_data):
    return common_words_filter(df_data,0)

def common_words_filter(df_data,sent):
    pos_tweet =df_data[df_data["positive"] ==sent]
    all_tweets = pos_tweet["tweet"].str.cat(sep='|')
    common_words=[]
    for pos_tw in pos_tweet["tweet"]:
        for tw in all_tweets.split('|'):
            if pos_tw == tw:
                continue
            tw_list = list(tw.split())
            pos_list = list(pos_tw.split())
            inter = list(set(tw_list).intersection(pos_list))
            for word in inter:
                if word not in common_words:
                    common_words.append(word)
    return common_words

In [552]:
training_data = training_data_df(training_path)
# test_data = training_data_df(test_path)
training_data["neg"] = training_data["positive"].apply(lambda x: 0 if x==1 else 1)
vect = CountVectorizer(stop_words=stop_words)
print("All data:",training_data.shape[0])
# print("Test data:",test_data.shape[0])

All data: 7087


In [553]:
training_data["tweet"].apply(lambda x:len(x.split())).max()

40

In [554]:
print("positive:",len(training_data[training_data["positive"]==1]))
print("negative:",len(training_data[training_data["positive"]!=1]))


positive: 3995
negative: 3092


In [555]:
training_data = training_data[['positive','neg','cleaned_tweet', 'tweet']]
training_data.values
len(training_data["cleaned_tweet"].apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0))



2233

In [556]:
training_data.head(20)

Unnamed: 0,positive,neg,cleaned_tweet,tweet
0,0,1,i feeel like aaa reetard,i feeel like aaa reetard\n
1,1,0,the da vinci code book is just awesome,the da vinci code book is just awesome.\n
2,1,0,this was the first clive cussler ive ever read...,this was the first clive cussler i've ever rea...
3,1,0,i liked the da vinci code a lot,i liked the da vinci code a lot.\n
4,1,0,i liked the da vinci code a lot,i liked the da vinci code a lot.\n
5,1,0,i liked the da vinci code but it ultimatly did...,i liked the da vinci code but it ultimatly did...
6,1,0,thats not even an exaggeration and at midnight...,that's not even an exaggeration ) and at midni...
7,1,0,i loved the da vinci code but now i want somet...,"i loved the da vinci code, but now i want some..."
8,1,0,i thought da vinci code was great same with ki...,"i thought da vinci code was great, same with k..."
9,1,0,the da vinci code is actually a good movie,the da vinci code is actually a good movie...\n


### One-hot Encode

In [410]:
vocab_processor = tflearn.data_utils.VocabularyProcessor(45)
# vocab = vocab_processor.fit(training_data["tweet"])
vocab = vocab_processor.fit(training_data["cleaned_tweet"])


In [469]:
with open ('mappings.txt','w') as mappings:
    words = []
    for sent in training_data["cleaned_tweet"]:
        mappings.write("{}\n".format(sent))
#         for word in sent.split():            
#             if word in words:
#                 continue
#             index = vocab.vocabulary_.get(word)
#             mappings.write("{}\t{}\n".format(word, index))
#             words.append(word)


In [439]:
len(words)

2233

In [557]:
vocab_array =vocab.transform(training_data["cleaned_tweet"])
vocab_array =np.asmatrix(np.array(list(vocab_array)))
print(vocab_array)

                         

[[   1    2    3 ...,    0    0    0]
 [   6    7    8 ...,    0    0    0]
 [  14   15    6 ...,    0    0    0]
 ..., 
 [ 163    1 2230 ...,    0    0    0]
 [ 411 1179 1180 ...,    0    0    0]
 [ 188   26 1179 ...,    0    0    0]]


In [558]:
classes = training_data.values[:,:2].astype(float)

In [564]:
trX,testX,trY,testY = train_test_split(vocab_array,classes)   
# trainY = tflearn.data_utils.to_categorical(trY, nb_classes=2)
# testY = tflearn.data_utils.to_categorical(testY, nb_classes=2)

print("Training data:",trX.shape)
print("Test data:",testX.shape)
trY

Training data: (5315, 45)
Test data: (1772, 45)


array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])

# Deep Neural Network


Consists of : 
<ul>
<li>Input layer </li> 
<li>Hidden layer with 300 neurons</li> 
<li>Output softmax layer, which outputs the likelyhood of a class: [1,0]</li>
</ul>

In [2]:
from tensorflow.contrib import rnn


In [574]:
net = tflearn.lstm(embedding, 45)
net = tflearn.fully_connected(net, 2, activation='sigmoid',regularizer='L2')

In [575]:
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

In [579]:
tf.reset_default_graph()
num_lstm_units = 100


sess = tf.Session()
input_data = tflearn.input_data(shape=[None,45],name='input_data')
embedding = tflearn.embedding(input_data, input_dim=2235, output_dim=45,name='word_embeddings')
net = tflearn.lstm(embedding, num_lstm_units,name='lstm')


net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy',name='regression')
model = tflearn.DNN(net,tensorboard_verbose=3,checkpoint_path='/tmp/tflearn_logs/model.ckpt',session=sess)
tflearn.helpers.summarize(input_data,type='histogram',name='input_sum')


init_op = tf.global_variables_initializer()

# with tf.Session() as sess:
sess.run(init_op)
model.fit(trX,trY, show_metric=True,batch_size=32,n_epoch=5)
# sess.run(print(embedding))
config = projector.ProjectorConfig()

# You can add multiple embeddings. Here we add only one.
viz_emd = config.embeddings.add()
viz_emd.tensor_name = embedding.name
# Link this tensor to its metadata file (e.g. labels).
viz_emd.metadata_path = os.path.join("/tmp/tflearn_logs", 'metadata.tsv')

# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter("/tmp/tflearn_logs")

# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
# read this file during startup.
projector.visualize_embeddings(summary_writer, config)



Training Step: 834  | total loss: [1m[32m0.17495[0m[0m | time: 9.518s
| Adam | epoch: 005 | loss: 0.17495 - acc: 0.9549 -- iter: 5312/5315
Training Step: 835  | total loss: [1m[32m0.16255[0m[0m | time: 9.573s
| Adam | epoch: 005 | loss: 0.16255 - acc: 0.9594 -- iter: 5315/5315
--
INFO:tensorflow:/tmp/tflearn_logs/model.ckpt-835 is not in all_model_checkpoint_paths. Manually adding it.


INFO:tensorflow:/tmp/tflearn_logs/model.ckpt-835 is not in all_model_checkpoint_paths. Manually adding it.


In [1]:
sess.close()

NameError: name 'sess' is not defined

In [479]:
embedding.name

'word_embeddings/embedding_lookup:0'

In [482]:
from tensorflow.contrib.tensorboard.plugins import projector
import os
# Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto

with tf.Session() as sess:
    saver = tf.train.Saver([images])

#     sess.run(images.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'images.ckpt'))
    config = projector.ProjectorConfig()

    # You can add multiple embeddings. Here we add only one.
    viz_emd = config.embeddings.add()
    viz_emd.tensor_name = "word_embeddings/W"
    # Link this tensor to its metadata file (e.g. labels).
    viz_emd.metadata_path = os.path.join("/tmp/tflearn_logs", 'metadata.tsv')

    # Use the same LOG_DIR where you stored your checkpoint.
    summary_writer = tf.summary.FileWriter("/tmp/tflearn_logs")

    # The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
    # read this file during startup.
    projector.visualize_embeddings(summary_writer, config)


In [483]:
viz_emd.tensor_name

'word_embeddings/W'

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt
def auc_curve():
    test_y=testY[:,0]
    pred = []
    i=0
    for t in testX:
        p = model.predict([t])
        p_toarray = np.asarray(p)[0]
        if p_toarray[0]>p_toarray[1]:
            pred.append(1.0)
        else:
            pred.append(0.0)
    print(metrics.classification_report(test_y.astype(float), np.asarray(pred)))
    print(metrics.confusion_matrix(test_y.astype(float), pred))
    fpr, tpr, thresholds = metrics.roc_curve(test_y, np.asarray(pred))
    roc_auc = auc(fpr, tpr)
    print('AUC = %0.4f'% roc_auc)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b',
    label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Sentiment Rate')
    plt.xlabel('False Positive Sentiment Rate')
    plt.show()
    

auc_curve()

# Inspecting training data

### Positive sentiment

In [284]:

for t in training_data[training_data["positive"] ==1]["tweet"].head(10):
    print("Tweet:",t)
    enc = vocab_processor.fit_transform([t])
    print(enc)
    p = model.predict(list(enc))
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

Tweet: the da vinci code book is just awesome.

<generator object VocabularyProcessor.transform at 0x1ab954888>
[ 0.9755044  0.1289884]
sentiment: positive
--------
Tweet: this was the first clive cussler i've ever read, but even books like relic, and da vinci code were more plausible than this.

<generator object VocabularyProcessor.transform at 0x18227cdb0>
[ 0.97549349  0.1290116 ]
sentiment: positive
--------
Tweet: i liked the da vinci code a lot.

<generator object VocabularyProcessor.transform at 0x1ab954888>
[ 0.97550464  0.1289878 ]
sentiment: positive
--------
Tweet: i liked the da vinci code a lot.

<generator object VocabularyProcessor.transform at 0x18227cdb0>
[ 0.97550464  0.1289878 ]
sentiment: positive
--------
Tweet: i liked the da vinci code but it ultimatly didn't seem to hold it's own.

<generator object VocabularyProcessor.transform at 0x1ab954888>
[ 0.97550303  0.1289911 ]
sentiment: positive
--------
Tweet: that's not even an exaggeration ) and at midnight we wen

### Negative sentiment

In [285]:

for t in training_data[training_data["positive"] ==0]["tweet"].head(10):
    print("Tweet:",t)
    enc = vocab_processor.fit_transform(t)
    p = model.predict(list(enc))
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

Tweet: i feeel like aaa reetard

[ 0.97550493  0.12898722]
sentiment: positive
--------
Tweet: da vinci code was a terrible movie.

[ 0.00477899  0.91768086]
sentiment: negative
--------
Tweet: then again, the da vinci code is super shitty movie, and it made like 700 million.

[ 0.0047802   0.91766834]
sentiment: negative
--------
Tweet: the da vinci code comes out tomorrow, which sucks.

[ 0.0047802   0.91766834]
sentiment: negative
--------
Tweet: i thought the da vinci code movie was really boring.

[ 0.97550493  0.12898722]
sentiment: positive
--------
Tweet: god, yahoo games has this truly-awful looking da vinci code-themed skin on it's chessboard right now.

[ 0.97550493  0.12898704]
sentiment: positive
--------
Tweet: da vinci code does suck.

[ 0.00477899  0.91768086]
sentiment: negative
--------
Tweet: and better...-we all know da vinci code is bogus and inaccurate.

[ 0.97550052  0.12899639]
sentiment: positive
--------
Tweet: last time, da vinci code is also a bit disappoint

### Custom data

In [286]:
testing_file = open(test_path,mode='r')
testing_tweets=[]
for x in testing_file:
    s= remove_stop_words(x)
    testing_tweets.append(s)

testing_tweets=["Besides the tasty food, the service is incredible!",
                "Horrible service. Absolutely no sense of customer service and how to take an order. Your order will never reach you."]

# testing_tweets[100:200]    
for t in testing_tweets:
    print("Tweet:",t)
    enc = vocab_processor.fit_transform(t)
    p = model.predict(list(enc))
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

Tweet: Besides the tasty food, the service is incredible!
[ 0.97550392  0.12898931]
sentiment: positive
--------
Tweet: Horrible service. Absolutely no sense of customer service and how to take an order. Your order will never reach you.
[ 0.97550392  0.12898931]
sentiment: positive
--------


In [206]:
print(vect.transform(["satisfied"]))




In [290]:
matrix = np.random.random([1024, 64])  # 64-dimensional embeddings
ids = np.array([0, 5, 17, 33])
matrix.shape
#print (matrix[ids] ) # prints a matrix of shape [4, 64] 

(1024, 64)