# Sentiment Analysis using twitter data

Simple Deep Neural Network, that predicts tweet sentiment. The model can be greatly improved using various <br/>
word vectorization techniques and Recurrent Neural Networks.


In [228]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
import tflearn as tflearn
from nltk.corpus import stopwords
import preprocessor as p
import re
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

In [229]:

test_path = "./data/twitter_test.txt"
training_path = "./data/twitter_data.txt"

def all_stop_words():
    stop_words = stopwords.words('english')
    add_stopwords = [",", "*" , ")" , "(" ,".","theres","know","one","though","vinci","ive","da","book","im","went",
                    "potter","brokeback","mountain","harry","code","mission","impossible","movie","movies","i","ya",
                    "yet","yall"]
    for w in add_stopwords:
        stop_words.append(w)
    return stop_words


stop_words = all_stop_words()
def training_data_df(path):
    
    training_data = open(path,mode='r')
    training_array =[]
    for d in training_data:
        training_dict = {}
        sent_tweet_array = d.split('\t')
        training_dict['tweet'] = sent_tweet_array[1].lower()
        training_dict['cleaned_tweet'] =remove_stop_words(sent_tweet_array[1].lower())
        training_dict['positive'] =int(sent_tweet_array[0])
        training_array.append(training_dict)
    training_df = pd.DataFrame(training_array)
    return training_df

def sentiment(sentiment_prob):
    if sentiment_prob[0]>sentiment_prob[1]:
        return "positive"
    return "negative"

def remove_stop_words(tweet_text):
    tweet_text = re.sub(r'[?|$|.|!]',r'',tweet_text)
    tweet_text = re.sub(r'[^a-zA-Z0-9 ]',r'',tweet_text)
    result = ""
    for word in tweet_text.split():            
        if word not in stop_words:
            result = result +" "+word.lower()

    return result.lstrip()

def get_word_frequency(tweet_list):
    word_dict = {} 
    result_array=[]
    for tw in tweet_list:
        for word in tw.split():
            if word in word_dict.keys():
                word_dict[word] = word_dict[word]+1
                continue
            word_dict[word] = 1
    for k in word_dict.keys():
        result_array.append({'word':k, 'count':word_dict[k]})
    return pd.DataFrame(result_array,columns=['count','word'])


def common_pos_words(df_data):
    return common_words_filter(df_data,1)

def common_neg_words(df_data):
    return common_words_filter(df_data,0)

def common_words_filter(df_data,sent):
    pos_tweet =df_data[df_data["positive"] ==sent]
    all_tweets = pos_tweet["tweet"].str.cat(sep='|')
    common_words=[]
    for pos_tw in pos_tweet["tweet"]:
        for tw in all_tweets.split('|'):
            if pos_tw == tw:
                continue
            tw_list = list(tw.split())
            pos_list = list(pos_tw.split())
            inter = list(set(tw_list).intersection(pos_list))
            for word in inter:
                if word not in common_words:
                    common_words.append(word)
    return common_words

In [230]:
training_data = training_data_df(training_path)
# test_data = training_data_df(test_path)
training_data["neg"] = training_data["positive"].apply(lambda x: 0 if x==1 else 1)
vect = CountVectorizer(stop_words=all_stop_words(),binary=False)
print("All data:",training_data.shape[0])
# print("Test data:",test_data.shape[0])

All data: 7087


In [231]:
training_data = training_data[['positive','neg','cleaned_tweet', 'tweet']]
training_data.values

array([[0, 1, 'feeel like aaa reetard', 'i feeel like aaa reetard\n'],
       [1, 0, 'awesome', 'the da vinci code book is just awesome.\n'],
       [1, 0,
        'first clive cussler ever read even books like relic plausible',
        "this was the first clive cussler i've ever read, but even books like relic, and da vinci code were more plausible than this.\n"],
       ..., 
       [0, 1, 'sit watching mtv awards reminded much despised',
        'as i sit here, watching the mtv movie awards, i am reminded of how much i despised the movie brokeback mountain.\n'],
       [0, 1, 'ok horrible',
        'ok brokeback mountain is such a horrible movie.\n'],
       [0, 1, 'oh terrible',
        'oh, and brokeback mountain was a terrible movie.\n']], dtype=object)

### One-hot Encode

In [232]:
vect.fit(training_data["cleaned_tweet"])
word_matrix = vect.transform(training_data["cleaned_tweet"]).toarray()
print(word_matrix.shape)

pd.DataFrame(word_matrix, columns=vect.get_feature_names())


(7087, 2072)


Unnamed: 0,00,007,10,109,10pm,12,16,1984,1brokeback,1st,...,yesterday,yip,youand,young,younger,youre,yuck,yuh,zach,zen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [233]:
num_features = word_matrix.shape[1]
print("Total features: ",num_features)
classes = training_data.values[:,:2]

Total features:  2072


In [234]:
trX,testX,trY,testY = train_test_split(word_matrix,classes)   
print("Training data:",trX.shape[0])
print("Test data:",testX.shape[0])

Training data: 5315
Test data: 1772


In [235]:
trX = trX.astype(float)

print(vect.inverse_transform(trX))

# Deep Neural Network


Consists of : 
<ul>
<li>Input layer </li> 
<li>Hidden layer with 300 neurons</li> 
<li>Output softmax layer, which outputs the likelyhood of a class: [1,0]</li>
</ul>



<img src="./data/nn-architecture.png"/>

In [292]:
tf.reset_default_graph()
input_data = tf.placeholder('float32',shape=[None,num_features],name="input_data")
class_output = tf.placeholder('float32',shape=[None,2])

n_neurons = num_features

with tf.name_scope("network_info") as scope:
    hidden_layer1 = {
        'weights':tf.cast(
            tf.Variable(
                tf.truncated_normal(
                    name='layer1_weights',
                    shape=[num_features,n_neurons])
            ),tf.float32),

        'bias':tf.cast(
            tf.Variable(tf.zeros(
                    name='layer1_bias',
                    shape=[n_neurons])
                       ),
            tf.float32)
    }
    tf.summary.histogram("weights", hidden_layer1["weights"])
    tf.summary.histogram("bias", hidden_layer1["bias"])

layer1_logit =tf.add(
tf.matmul(
    input_data,
    hidden_layer1['weights']
),
hidden_layer1['bias'],
name = "layer1_logit"
)

sigmoid_layer1 = tf.sigmoid(layer1_logit,name='sigmoid_layer1')

output_weight = tf.Variable(
tf.truncated_normal(
    [n_neurons, 2],                        
    stddev=0.1)
)

output_bias = tf.Variable(tf.zeros([2]))
output_logit =tf.add(
tf.matmul(
    sigmoid_layer1,
    output_weight),
output_bias
)

predicted_output = tf.nn.softmax(output_logit,name="softmax")

with tf.name_scope("cost_function") as scope:
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=output_logit,
        labels=class_output,
        name = "cross_entropy"

    )
    cross_entropy = tf.reduce_mean(cross_entropy)*100
    cost_summary_OP = tf.summary.scalar("loss", cross_entropy)
            
            
            
correct_prediction = tf.equal(
    tf.argmax(predicted_output, 1), 
    tf.argmax(class_output, 1)
)

with tf.name_scope("accuracy") as scope:
    accuracy = tf.reduce_mean(
        tf.cast(correct_prediction, tf.float32),
        name="acc")
    accuracy_summary_OP = tf.summary.scalar("accuracy", accuracy)


with tf.name_scope("training") as scope:
    learning_rate = 0.0003
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

all_summary_OPS = tf.summary.merge_all()


init_op = tf.global_variables_initializer()
epochs =20
with tf.Session() as sess:
    sess.run(init_op)
    prediction = nn_model(input_data)
    # OLD VERSION:
    #cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(prediction,y) )
    # NEW:
#     cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
#     optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter("./tensorflow/sentiment", sess.graph_def)
    for i in range(epochs):        
        sess.run([train_step],
                        feed_dict={
                            input_data:trX,
                            class_output:trY
            })
        if i % 5 != 0:continue
            

        print("epoch:",i)
        acc,summary,loss= sess.run([
            accuracy,
            all_summary_OPS,
            cross_entropy], 
            feed_dict={
                input_data:trX,
                class_output:trY
            })
        print("acc:",acc)
        print("loss:",loss)
        
        writer.add_summary(summary, i)
        

    print("final accuracy on test set: %s" %str(sess.run(accuracy, 
                                                     feed_dict={
                                                            input_data:testX,
                                                            class_output: testY
                })))

    saver.save(sess, "./tensorflow/sentiment/model.ckpt")
    
    

epoch: 0
acc: 0.50254
loss: 100.53


KeyboardInterrupt: 

In [286]:
tf.reset_default_graph()
checkpoint_file=tf.train.latest_checkpoint('./tensorflow/sentiment/')
with tf.Session() as sess:
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess,checkpoint_file)
    model = sess.graph.get_operation_by_name("softmax")
    input_data =sess.graph.get_operation_by_name("input_data")
    print(model.inputs)
    sess.run(tf.global_variables_initializer())

    for t in training_data[training_data["positive"] ==1]["tweet"].head(10):
        print("Tweet:",t)
        enc = vect.transform([t]).toarray().astype(float)
#         print(enc[0].shape)
        p = sess.run(model,feed_dict={input_data : enc})
        print(p)
#         print(np.asarray(p)[0])
#         print("sentiment:",sentiment(np.asarray(p)[0]))
#         print('--------')

<tensorflow.python.framework.ops.Operation._InputList object at 0x1b3d685f8>
Tweet: the da vinci code book is just awesome.



TypeError: Cannot interpret feed_dict key as Tensor: Can not convert a Operation into a Tensor.

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt

def auc_curve():
    test_y=testY[:,0]
    pred = []
    i=0
    for t in testX:
        p = model.predict([t])
        p_toarray = np.asarray(p)[0]
        if p_toarray[0]>p_toarray[1]:
            pred.append(1.0)
        else:
            pred.append(0.0)
    print(metrics.classification_report(test_y.astype(float), np.asarray(pred)))
    print(metrics.confusion_matrix(test_y.astype(float), pred))
    fpr, tpr, thresholds = metrics.roc_curve(test_y, np.asarray(pred))
    roc_auc = auc(fpr, tpr)
    print('AUC = %0.4f'% roc_auc)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b',
    label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Sentiment Rate')
    plt.xlabel('False Positive Sentiment Rate')
    plt.show()
    

auc_curve()

# Inspecting training data

### Positive sentiment

In [None]:
for t in training_data[training_data["positive"] ==1]["tweet"].head(10):
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

### Negative sentiment

In [None]:

for t in training_data[training_data["positive"] ==0]["tweet"].head(10):
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

### Custom data

In [None]:
# testing_file = open(test_path,mode='r')
# testing_tweets=[]
# for x in testing_file:
#     s= remove_stop_words(x)
#     testing_tweets.append(s)
# testing_tweets=["It’s likely something has gone wrong with the product or experience you’re reviewing that isn’t \
# representative of what the company offers 99.9 per cent of the time", 
#                 "If you think they have a point, you’re probably best taking it down immediately",
#                "I was surprised that the material was a rough burlap instead of the linen listed in the ad. I do \
#                 still like \
#                 the way it looks as it's decorative so feel isn't that important to me anyways."]

testing_tweets=["Besides the tasty food, the service is incredible!",
                "Horrible service. Absolutely no sense of customer service and how to take an order. \
                Your order will never reach you.","Good food but very slow service, no matter what time of day you come. Even if you order over phone still prepare to wait at least 30 mnts or more."]

for t in testing_tweets:
    print("Tweet:",t)
    enc = vect.transform([t]).toarray()
    p = model.predict(enc)
    print(np.asarray(p)[0])
    print("sentiment:",sentiment(np.asarray(p)[0]))
    print('--------')

In [None]:
print(vect.transform(["satisfied"]))