# TensorFlow - Sentiment Analysis using twitter data 

Simple Deep Neural Network, that predicts tweet sentiment. The model can be greatly improved using various <br/>
word vectorization techniques and Recurrent Neural Networks.

This simple NN uses a feature vector of n words. This vector is developed using each tweet in the dataset.
Before the word vector is created each tweet is stripped of any stopwords and special characters.

The accuracy of this model is 94%. The model was trained using Michigan University's twitter sentiment data. <br/> 

In [484]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
import tflearn as tflearn
from nltk.corpus import stopwords
import nltk as nltk
import preprocessor as p
import re
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

In [397]:

test_path = "./data/twitter_test.txt"
training_path = "./data/twitter_data.txt"

def all_stop_words():
    stop_words = stopwords.words('english')
    add_stopwords = [",", "*" , ")" , "(" ,".","theres","know","one","though","vinci","ive","da","book","im","went",
                    "potter","brokeback","mountain","harry","code","mission","impossible","movie","movies","i","ya",
                    "yet","yall"]
    for w in add_stopwords:
        stop_words.append(w)
    return stop_words


stop_words = all_stop_words()
english_dict= nltk.corpus.words.words()
def training_data_df(path):
    
    training_data = open(path,mode='r')
    training_array =[]
    for d in training_data:
        training_dict = {}
        sent_tweet_array = d.split('\t')
        training_dict['tweet'] = sent_tweet_array[1].lower()
        training_dict['cleaned_tweet'] =remove_stop_words(sent_tweet_array[1].lower())
        training_dict['positive'] =int(sent_tweet_array[0])
        training_array.append(training_dict)
    training_df = pd.DataFrame(training_array)
    return training_df

def sentiment(sentiment_prob):
    if sentiment_prob[0]>sentiment_prob[1]:
        return "positive"
    return "negative"

def remove_stop_words(tweet_text):
    tweet_text = re.sub(r'[?|$|.|!]',r'',tweet_text)
    tweet_text = re.sub(r'[^a-zA-Z0-9 ]',r'',tweet_text)
    result = ""
    for word in tweet_text.split():            
        if word not in stop_words:
            if word.lower() in english_dict:
                result = result +" "+word.lower()

    return result.lstrip()

def get_word_frequency(tweet_list):
    word_dict = {} 
    result_array=[]
    for tw in tweet_list:
        for word in tw.split():
            if word in word_dict.keys():
                word_dict[word] = word_dict[word]+1
                continue
            word_dict[word] = 1
    for k in word_dict.keys():
        result_array.append({'word':k, 'count':word_dict[k]})
    return pd.DataFrame(result_array,columns=['count','word'])


def common_pos_words(df_data):
    return common_words_filter(df_data,1)

def common_neg_words(df_data):
    return common_words_filter(df_data,0)

def common_words_filter(df_data,sent):
    pos_tweet =df_data[df_data["positive"] ==sent]
    all_tweets = pos_tweet["tweet"].str.cat(sep='|')
    common_words=[]
    for pos_tw in pos_tweet["tweet"]:
        for tw in all_tweets.split('|'):
            if pos_tw == tw:
                continue
            tw_list = list(tw.split())
            pos_list = list(pos_tw.split())
            inter = list(set(tw_list).intersection(pos_list))
            for word in inter:
                if word not in common_words:
                    common_words.append(word)
    return common_words

In [398]:
training_data = training_data_df(training_path)
training_data["neg"] = training_data["positive"].apply(lambda x: 0 if x==1 else 1)
vect = CountVectorizer(stop_words=all_stop_words(),binary=True)
print("All data:",training_data.shape[0])


All data: 7087


In [480]:
training_data = training_data[['positive','neg','cleaned_tweet', 'tweet']]


### VECTORIZING TWEETS

In [473]:
vect.fit(training_data["cleaned_tweet"])
word_matrix = vect.transform(training_data["cleaned_tweet"]).toarray()
print(word_matrix.shape)
pd.DataFrame(word_matrix, columns=vect.get_feature_names()).head(10)

(7087, 1243)


Unnamed: 0,able,abortion,absolute,absolutely,absurd,academy,acceptable,accompaniment,according,account,...,yea,yeah,year,yes,yesterday,yip,young,younger,yuck,yuh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [474]:
num_features = word_matrix.shape[1]
print("Total features: ",num_features)
classes = training_data.values[:,:2]

Total features:  1243


In [475]:
trX,testX,trY,testY = train_test_split(word_matrix,classes,random_state=42)   
trX = trX.astype(float)
print("Training data:",trX.shape[0])
print("Test data:",testX.shape[0])

Training data: 5315
Test data: 1772


# Deep Neural Network


Consists of : 
<ul>
<li>Input layer </li> 
<li>Hidden layer with 500 neurons</li> 
<li>Output softmax layer, which outputs the likelyhood of a class: [1,0]</li>
</ul>



<img src="./data/tensor_flow_sentiment/nn-architecture.png"/>

In [482]:
tf.reset_default_graph()
#--NETWORK STARTS ENDS HERE -----
input_data = tf.placeholder('float32',shape=[None,num_features],name="input_data")
class_output = tf.placeholder('float32',shape=[None,2])

n_neurons = num_features+200
reg_lambda =0.0001
with tf.name_scope("network_info") as scope:
    hidden_layer1 = {
        'weights':tf.cast(
            tf.Variable(
                tf.truncated_normal(
                    name='layer1_weights',
                    shape=[num_features,n_neurons])
            ),tf.float32),

        'bias':tf.cast(
            tf.Variable(tf.zeros(
                    name='layer1_bias',
                    shape=[n_neurons])
                       ),
            tf.float32)
    }
    tf.summary.histogram("weights", hidden_layer1["weights"])
    tf.summary.histogram("bias", hidden_layer1["bias"])
    tf.summary.scalar("avg_weights",tf.reduce_mean(hidden_layer1["weights"]))
    tf.summary.scalar("avg_bias",tf.reduce_mean(hidden_layer1["bias"]))
    

layer1_logit =tf.add(
                tf.matmul(
                    input_data,
                    hidden_layer1['weights']),
                hidden_layer1['bias'],
                name = "layer1_logit")

sigmoid_layer1 = tf.sigmoid(layer1_logit,name='sigmoid_layer1')

with tf.name_scope("output_layer"):
    output_layer = {
        "weights":tf.Variable(tf.truncated_normal([n_neurons, 2])),
        "biases":tf.Variable(tf.zeros([2]))
    }
    tf.summary.histogram("output_weights", output_layer["weights"])
    tf.summary.histogram("output_bias", output_layer["biases"])

output_logit =tf.add(
                    tf.matmul(
                        sigmoid_layer1,
                        output_layer["weights"]),
               output_layer["biases"])

predicted_output = tf.nn.softmax(output_logit,name="softmax")

#--NETWORK SETUP ENDS HERE -----

#--- COST FUNCTION / ACCURACY STARTS HERE ----
with tf.name_scope("cost_function") as scope:
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=predicted_output,
        labels=class_output,
        name = "cross_entropy"

    )
    
    #regularize to reduce overfitting
    regularize =tf.nn.l2_loss(hidden_layer1["weights"],name="hidden_layer_l2")+tf.nn.l2_loss(output_layer["weights"],name="output_l2")
                    
    cross_entropy = tf.reduce_mean(cross_entropy+reg_lambda*regularize) 
    cost_summary_OP = tf.summary.scalar("loss", cross_entropy)
    reg_summary = tf.summary.scalar("regularized",regularize)



with tf.name_scope("accuracy") as scope:
    correct_prediction = tf.equal(
        tf.argmax(predicted_output, 1), 
        tf.argmax(class_output, 1))
    accuracy = tf.reduce_mean(
        tf.cast(correct_prediction, tf.float32),
        name="acc")
    accuracy_summary_OP = tf.summary.scalar("accuracy", accuracy)


with tf.name_scope("training") as scope:
    learning_rate = 0.01
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

#--- COST FUNCTION / ACCURACY ENDS HERE ----
    
all_summary_OPS = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
epochs =1000
batch_size=20
with tf.Session() as sess:       
    sess.run(init_op)    
    writer = tf.summary.FileWriter("./tensorflow/sentiment", sess.graph_def)
    for i in range(epochs):   
        
        #MINI-BATCH
        offset = (i * batch_size) % (trY.shape[0] - batch_size)
        batch_data = trX[offset:(offset + batch_size), :]
        batch_labels = trY[offset:(offset + batch_size), :]
        sess.run([train_step],
                        feed_dict={
                            input_data:trX,
                            class_output:trY
            })
        
        writer.add_summary(summary, i)
        acc,summary,loss= sess.run([
            accuracy,
            all_summary_OPS,
            cross_entropy], 
            feed_dict={
                input_data:trX,
                class_output:trY
            })
        if i % 5 != 0:continue
        print ("Epoch: %d | Accuracy: %.2f | Loss: %.2f" %(i,acc,loss))

    print("final accuracy on test set: %s" %str(sess.run(accuracy, 
                                                     feed_dict={
                                                            input_data:testX,
                                                            class_output: testY
                })))
    
    #checking how the model works for test tweets
    preds = open("./preds.txt","w")
    pred_dict = []
    test_data = open(test_path,"r")
    pred_dict = []
    for t in test_data:
        enc = vect.transform([t]).toarray().astype(float)
        p = sess.run(predicted_output,feed_dict={input_data : enc})
        to_write = t + "--" + str(p[0]) + "\n"
        pred_dict.append({ 'word':t, 'positive_prob':p[0][0],'negative_prob':p[0][1] })

        preds.write(to_write)
    pred_df = pd.DataFrame(pred_dict)
            
    preds.close()    
    saver = tf.train.Saver()
    saver.save(sess, "./tensorflow/sentiment/model.ckpt")
    
    

Epoch: 0 | Accuracy: 0.74 | Loss: 68.90
Epoch: 5 | Accuracy: 0.87 | Loss: 62.75
Epoch: 10 | Accuracy: 0.92 | Loss: 57.09
Epoch: 15 | Accuracy: 0.94 | Loss: 51.89
Epoch: 20 | Accuracy: 0.95 | Loss: 47.13
Epoch: 25 | Accuracy: 0.95 | Loss: 42.78
Epoch: 30 | Accuracy: 0.96 | Loss: 38.81
Epoch: 35 | Accuracy: 0.96 | Loss: 35.19
Epoch: 40 | Accuracy: 0.96 | Loss: 31.90
Epoch: 45 | Accuracy: 0.96 | Loss: 28.92
Epoch: 50 | Accuracy: 0.96 | Loss: 26.21
Epoch: 55 | Accuracy: 0.96 | Loss: 23.75
Epoch: 60 | Accuracy: 0.96 | Loss: 21.52
Epoch: 65 | Accuracy: 0.96 | Loss: 19.50
Epoch: 70 | Accuracy: 0.96 | Loss: 17.66
Epoch: 75 | Accuracy: 0.96 | Loss: 16.00
Epoch: 80 | Accuracy: 0.96 | Loss: 14.49
Epoch: 85 | Accuracy: 0.96 | Loss: 13.13
Epoch: 90 | Accuracy: 0.96 | Loss: 11.89
Epoch: 95 | Accuracy: 0.96 | Loss: 10.77
Epoch: 100 | Accuracy: 0.96 | Loss: 9.75
Epoch: 105 | Accuracy: 0.96 | Loss: 8.83
Epoch: 110 | Accuracy: 0.96 | Loss: 7.99
Epoch: 115 | Accuracy: 0.96 | Loss: 7.24
Epoch: 120 | Accur

In [455]:
print(len(pred_df[pred_df["positive_prob"]>0.1]))
print(len(pred_df[pred_df["negative_prob"]>0.1]))

530
1134


<img src ="./data/tensor_flow_sentiment/sentiment-analysis-tensorflow.png"/>

<img src="./data/tensor_flow_sentiment/accuracy.png"/>

<img src="./data/tensor_flow_sentiment/loss-function.png"/>

## Negative predictions

List of tweets that are negative

In [481]:
pred_df[pred_df["negative_prob"] >= pred_df["positive_prob"]]

Unnamed: 0,negative_prob,positive_prob,word
3,1.000000,1.431383e-14,"Have to say, I hate Paris Hilton's behavior bu..."
6,1.000000,1.641543e-10,considering most Geico commericals are stupid....
7,0.582312,4.176876e-01,"i liked MIT though, esp their little info book(\n"
20,1.000000,1.971608e-13,i hate london bugs.\n
21,1.000000,1.010839e-10,Way to go stupid Lakers..\n
22,0.936398,6.360161e-02,london sucks....\n
24,0.999999,1.147597e-06,buy quite a few food to back to notts to eat l...
26,1.000000,2.838009e-14,I reallllllly hate Tom Cruise...\n
27,0.788669,2.113310e-01,"To my understanding, Harvard is a very difficu..."
28,1.000000,2.109162e-10,Boston can suck my fucking tits...\n


## Positive predictions

List of tweets that are positive

In [467]:
pred_df[pred_df["negative_prob"] <= pred_df["positive_prob"]]


Unnamed: 0,negative_prob,positive_prob,word
0,9.988532e-06,0.999990,""" I don't care what anyone says, I like Hillar..."
1,4.244269e-02,0.957557,have an awesome time at purdue!..\n
2,6.055163e-06,0.999994,"Yep, I'm still in London, which is pretty awes..."
4,5.110334e-13,1.000000,i will love the lakers.\n
5,9.192532e-15,1.000000,"I'm so glad I love Paris Hilton, too, or this ..."
9,1.446969e-10,1.000000,I still like Tom Cruise.\n
11,5.110334e-13,1.000000,i love angelina jolie.\n
12,1.446969e-10,1.000000,I still like Tom Cruise.\n
13,2.046067e-06,0.999998,UCLA is beautiful.\n
15,2.046067e-06,0.999998,Angelina Jolie is beautiful.\n


#### Data Source <br\>  

In [None]:
Michi