# Sentiment Analysis using twitter data

Simple Deep Neural Network, that predicts tweet sentiment. The model can be greatly improved using various <br/>
word vectorization techniques and Recurrent Neural Networks.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
import tflearn


In [4]:
import tensorflow as tf
import pandas as pd 
import preprocessor as p
from nltk.corpus import stopwords
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import tflearn as tflearn

def all_stop_words():
    stop_words = stopwords.words('english')
    add_stopwords = [",", "*" , ")" , "(" ,".","theres","know","one","though","vinci","ive","da","book","im","went",
                    "potter","brokeback","mountain","harry","code","mission","impossible","movie","movies","i","ya",
                    "yet","yall"]
    for w in add_stopwords:
        stop_words.append(w)
    return stop_words


stop_words = all_stop_words()
def training_data_df():
    
    training_data = open("./data/twitter_data.txt",mode='r')
    training_array =[]
    for d in training_data:
        training_dict = {}
        sent_tweet_array = d.split('\t')
        training_dict['tweet'] = sent_tweet_array[1].lower()
        training_dict['cleaned_tweet'] =remove_stop_words(sent_tweet_array[1].lower())
        training_dict['sentiment'] =int(sent_tweet_array[0])
        training_array.append(training_dict)
    training_df = pd.DataFrame(training_array)
    return training_df

def remove_stop_words(tweet_text):
    tweet_text = re.sub(r'[?|$|.|!]',r'',tweet_text)
    tweet_text = re.sub(r'[^a-zA-Z0-9 ]',r'',tweet_text)
    result = ""
    for word in tweet_text.split():            
        if word not in stop_words:
            result = result +" "+word.lower()

    return result.lstrip()

def get_word_frequency(tweet_list):
    word_dict = {} 
    result_array=[]
    for tw in tweet_list:
        for word in tw.split():
            if word in word_dict.keys():
                word_dict[word] = word_dict[word]+1
                continue
            word_dict[word] = 1
    for k in word_dict.keys():
        result_array.append({'word':k, 'count':word_dict[k]})
    return pd.DataFrame(result_array,columns=['count','word'])


def common_pos_words(df_data):
    return common_words_filter(df_data,1)

def common_neg_words(df_data):
    return common_words_filter(df_data,0)

def common_words_filter(df_data,sent):
    pos_tweet =df_data[df_data["sentiment"] ==sent]
    all_tweets = pos_tweet["tweet"].str.cat(sep='|')
    common_words=[]
    for pos_tw in pos_tweet["tweet"]:
        for tw in all_tweets.split('|'):
            if pos_tw == tw:
                continue
            tw_list = list(tw.split())
            pos_list = list(pos_tw.split())
            inter = list(set(tw_list).intersection(pos_list))
            for word in inter:
                if word not in common_words:
                    common_words.append(word)
    return common_words

In [5]:
training_data = training_data_df()
training_data["neg"] = training_data["sentiment"].apply(lambda x: 0 if x==1 else 1)
vect = CountVectorizer(stop_words=all_stop_words())
print("All data:",training_data.shape[0])

All data: 7086


In [6]:
training_data.head(10)

Unnamed: 0,cleaned_tweet,sentiment,tweet,neg
0,awesome,1,the da vinci code book is just awesome.\n,0
1,first clive cussler ever read even books like ...,1,this was the first clive cussler i've ever rea...,0
2,liked lot,1,i liked the da vinci code a lot.\n,0
3,liked lot,1,i liked the da vinci code a lot.\n,0
4,liked ultimatly didnt seem hold,1,i liked the da vinci code but it ultimatly did...,0
5,thats even exaggeration midnight walmart buy a...,1,that's not even an exaggeration ) and at midni...,0
6,loved want something better different,1,"i loved the da vinci code, but now i want some...",0
7,thought great kite runner,1,"i thought da vinci code was great, same with k...",0
8,actually good,1,the da vinci code is actually a good movie...\n,0
9,thought pretty good,1,i thought the da vinci code was a pretty good ...,0


In [7]:
training_data = training_data[['sentiment','neg','cleaned_tweet', 'tweet']]
training_data.values

array([[1, 0, 'awesome', 'the da vinci code book is just awesome.\n'],
       [1, 0,
        'first clive cussler ever read even books like relic plausible',
        "this was the first clive cussler i've ever read, but even books like relic, and da vinci code were more plausible than this.\n"],
       [1, 0, 'liked lot', 'i liked the da vinci code a lot.\n'],
       ..., 
       [0, 1, 'sit watching mtv awards reminded much despised',
        'as i sit here, watching the mtv movie awards, i am reminded of how much i despised the movie brokeback mountain.\n'],
       [0, 1, 'ok horrible',
        'ok brokeback mountain is such a horrible movie.\n'],
       [0, 1, 'oh terrible',
        'oh, and brokeback mountain was a terrible movie.\n']], dtype=object)

### One-hot Encode

In [8]:
vect.fit(training_data["cleaned_tweet"])
word_matrix = vect.transform(training_data["cleaned_tweet"]).toarray()
print(type(word_matrix))
print(word_matrix.shape)

<class 'numpy.ndarray'>
(7086, 2069)


In [9]:
num_features = word_matrix.shape[1]
print("Total features: ",num_features)
classes = training_data.values[:,:2]

Total features:  2069


In [10]:
trX,testX,trY,testY = train_test_split(word_matrix,classes)   
print("Training data:",trX.shape[0])

Training data: 5314


# Deep Neural Network


Consists of : 
<ul>
<li>Input layer </li> 
<li>Hidden layer with 300 neurons</li> 
<li>Output softmax layer, which outputs the likelyhood of a class: [1,0]</li>
</ul>

In [11]:
tf.reset_default_graph()
input_data = tflearn.input_data(shape=[None, num_features])
init_weights = tflearn.initializations.truncated_normal(shape=None,dtype=tf.float32, seed=None)
layer1 = tflearn.layers.fully_connected(input_data,300,activation='sigmoid',weights_init=init_weights,regularizer='L2')
net  =tflearn.layers.fully_connected(layer1 , 2,activation='softmax',regularizer='L2')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net,tensorboard_verbose=3)


In [12]:
model.fit(trX,trY,n_epoch=10,show_metric=True)
evl = model.evaluate(testX,testY)
print("Evaluation: ",evl)

Training Step: 839  | total loss: [1m[32m0.08910[0m[0m | time: 5.193s
| Adam | epoch: 010 | loss: 0.08910 - acc: 0.9868 -- iter: 5312/5314
Training Step: 840  | total loss: [1m[32m0.08663[0m[0m | time: 5.255s
| Adam | epoch: 010 | loss: 0.08663 - acc: 0.9877 -- iter: 5314/5314
--
Evaluation:  [0.99153498723329325]


In [None]:
vect.transform(["awesome"]).toarray().shape

# Inspecting training data

### Positive sentiment

In [13]:
for t in training_data[training_data["sentiment"] ==1]["tweet"].head(10):
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print("Probability:",p)
    print('--------')

Tweet: the da vinci code book is just awesome.

Probability: [[0.9709868431091309, 0.029013078659772873]]
--------
Tweet: this was the first clive cussler i've ever read, but even books like relic, and da vinci code were more plausible than this.

Probability: [[0.9721359610557556, 0.027863970026373863]]
--------
Tweet: i liked the da vinci code a lot.

Probability: [[0.8217340707778931, 0.1782659888267517]]
--------
Tweet: i liked the da vinci code a lot.

Probability: [[0.8217340707778931, 0.1782659888267517]]
--------
Tweet: i liked the da vinci code but it ultimatly didn't seem to hold it's own.

Probability: [[0.8213279843330383, 0.17867204546928406]]
--------
Tweet: that's not even an exaggeration ) and at midnight we went to wal-mart to buy the da vinci code, which is amazing of course.

Probability: [[0.6641566753387451, 0.33584335446357727]]
--------
Tweet: i loved the da vinci code, but now i want something better and different!..

Probability: [[0.9394335746765137, 0.0605664

### Negative sentiment

In [14]:

for t in training_data[training_data["sentiment"] ==0]["tweet"].head(10):
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print("Probability:",p)
    print('---------')

Tweet: da vinci code was a terrible movie.

Probability: [[0.19282272458076477, 0.8071773052215576]]
---------
Tweet: then again, the da vinci code is super shitty movie, and it made like 700 million.

Probability: [[0.7964497804641724, 0.20355020463466644]]
---------
Tweet: the da vinci code comes out tomorrow, which sucks.

Probability: [[0.05570179969072342, 0.9442982077598572]]
---------
Tweet: i thought the da vinci code movie was really boring.

Probability: [[0.10949605703353882, 0.890504002571106]]
---------
Tweet: god, yahoo games has this truly-awful looking da vinci code-themed skin on it's chessboard right now.

Probability: [[0.33692947030067444, 0.6630704998970032]]
---------
Tweet: da vinci code does suck.

Probability: [[0.08318359404802322, 0.9168164134025574]]
---------
Tweet: and better...-we all know da vinci code is bogus and inaccurate.

Probability: [[0.41265419125556946, 0.5873458385467529]]
---------
Tweet: last time, da vinci code is also a bit disappointing t

### Custom data

In [16]:
testing_tweets = ["I despise trump",
                  "The product was bad",
                  "I love this product",
                  "It was an ok experience",
                  "Do you hate me ?"]
for t in testing_tweets:
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print("Probability:",p)
    print('------------')

Tweet: I despise trump
Probability: [[0.5055177211761475, 0.4944821894168854]]
------------
Tweet: The product was bad
Probability: [[0.620330810546875, 0.3796692192554474]]
------------
Tweet: I love this product
Probability: [[0.9722011089324951, 0.027798961848020554]]
------------
Tweet: It was an ok experience
Probability: [[0.3718075156211853, 0.6281924247741699]]
------------
Tweet: Do you hate me ?
Probability: [[0.09526906162500381, 0.9047309160232544]]
------------
