# Sentiment Analysis 



 

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import requests
import nltk
import string
import re
import os
from os import path
from time import sleep
from collections import Counter
from nltk.classify import NaiveBayesClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob 
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

In [207]:
data = pd.read_csv('Sentiment.csv', encoding='utf-8')
tweets = data['text']
sentiment = data['sentiment'] 
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


## Data Preprocessing 

In [208]:
#Remove all newlines from inside a string
clean_data = [tweet.replace('\n','').strip() for tweet in tweets]
#To remove all whitespaces in the beginning and end of the string
#remove the unicodes for the single left and right quote characters
clean_data[:] = [tweet.replace(u'\u2018',"'").replace(u'\u2019',"'") for tweet in clean_data] 

#convert n't to  not
clean_data[:] = [tweet.replace('n\'t',' not') for tweet in clean_data]  

#remove any sub-string containing 'http'
clean_data[:] = [re.sub(r"^.*http.*$", '', tweet) for tweet in clean_data] 

#remove non-ASCII characters
clean_data[:] = [re.sub(r'[^\x00-\x7F]+','', tweet) for tweet in clean_data] 

#remove tweeter's RT' tags
clean_data[:] = [tweet.replace('RT','') for tweet in clean_data] 

#make all words lower case
clean_data[:] = [tweet.lower() for tweet in clean_data] 

clean_data

[' @nancyleegrahn: how did everyone feel about the climate change question last night? exactly. #gopdebate',
 '',
 ' @tjmshow: no mention of tamir rice and the #gopdebate was held in cleveland? wow.',
 " @robgeorge: that carly fiorina is trending -- hours after her debate -- above any of the men in just-completed #gopdebate says she's on ",
 '',
 ' @gregabbott_tx: @tedcruz: "on my first day i will rescind every illegal executive action taken by barack obama." #gopdebate @foxnews',
 '',
 'going on #msnbc live with @thomasaroberts around 2 pm et.  #gopdebate',
 'deer in the headlights  @lizzwinstead: ben carson, may be the only brain surgeon who has performed a lobotomy on himself. #gopdebate',
 '',
 '@jgreendc @realdonaldtrump in all fairness #billclinton owns that phrase.#gopdebate',
 '',
 '',
 '',
 " @pattonoswalt: i loved scott walker as mark harmon's romantic rival in summer school. look it up. #gopdebate",
 "hey @chrischristie exploiting the tragedy of 9/11 for your own political g

In [21]:
#remove useless words that are common in this dataset dont affect the "positivity" of the tweet 
useless = nltk.corpus.stopwords.words("english") + list(string.punctuation) + ['``', "''",'gop','debate','gopdeb','gopdebate','gopdebates','fox','news','foxnew','foxnews', 'amp']


In [22]:
#tokenize and clean up the tweets 
tweets = []
for tweet in clean_data:
    wordlist = [word for word in nltk.word_tokenize(tweet) if word not in useless] #a list of words per tweet
    tweets.append(wordlist)
tweets[0] 

['nancyleegrahn',
 'everyone',
 'feel',
 'climate',
 'change',
 'question',
 'last',
 'night',
 'exactly']

### Text Stemming

In [23]:
#stemming
st = nltk.stem.SnowballStemmer('english')
tweets_stemmed = []
for words in tweets:
    stemmed_words = [st.stem(word) for word in words]
    tweets_stemmed.append(stemmed_words)

tweets[:] = tweets_stemmed

### Visualise Positives vs. Negatives Ratio

In [24]:
posneg=pd.Series(sentiment).value_counts()
posneg

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [25]:
def build_bow_features(words):
    return {word:True for word in words}

In [48]:

#remove those neutral tweets as I am only interested in neg / pos ones
text_label_pair_list = list(zip(tweets,sentiment))
text_label_pair_list[0]
text_label_pair_list[:] = [tuple for tuple in text_label_pair_list if tuple[1]!='Neutral']

train, test = train_test_split(list(zip(tweets,sentiment)), test_size = .1, random_state=7)
train

[([], 'Negative'),
 (['jjauthor',
   'polici',
   'barack',
   'obama',
   'manag',
   'provid',
   'neither',
   'peac',
   'prosper',
   'realdonaldtrump',
   'tedcruz'],
  'Negative'),
 (['dougstanhop',
   'hillari',
   'come',
   'everi',
   'angl',
   'strike',
   'ground',
   'pound',
   'arm',
   'bar',
   "'s",
   'unstopp',
   'think',
   'wr'],
  'Neutral'),
 (['lrihendri',
   'tedcruz',
   'presid',
   'alway',
   'tell',
   'truth',
   'said',
   'would'],
  'Neutral'),
 (['lukewearechang', 'evid', 'hack', 'talk', 'come', 'syria'], 'Negative'),
 ([], 'Negative'),
 (['tatianak',
   'fact',
   'wasnt',
   'even',
   'liter',
   'pundit',
   'say',
   'rememb',
   'time',
   'said',
   'x',
   'bish',
   'bout'],
  'Negative'),
 ([], 'Negative'),
 (['rwsurfergirl', 'thank', 're', 'rais', 'realdonaldtrump', "'s", 'rate'],
  'Negative'),
 (['order',
   'cruz',
   'rubio',
   'carson',
   'walker',
   'think',
   'rand',
   'okay',
   'trump',
   'hurt',
   'jeb',
   'much'],
  '

In [27]:
#build a list of tuples (BOW_dict, label) for all tweets
train_bow = [(build_bow_features(tuple[0]), tuple[1]) for tuple in train]
test_bow = [(build_bow_features(tuple[0]), tuple[1]) for tuple in test]

In [28]:
print(len(train_bow),len(test_bow))


12483 1388


In [29]:
train_bow

[({}, 'Negative'),
 ({'jjauthor': True,
   'polici': True,
   'barack': True,
   'obama': True,
   'manag': True,
   'provid': True,
   'neither': True,
   'peac': True,
   'prosper': True,
   'realdonaldtrump': True,
   'tedcruz': True},
  'Negative'),
 ({'dougstanhop': True,
   'hillari': True,
   'come': True,
   'everi': True,
   'angl': True,
   'strike': True,
   'ground': True,
   'pound': True,
   'arm': True,
   'bar': True,
   "'s": True,
   'unstopp': True,
   'think': True,
   'wr': True},
  'Neutral'),
 ({'lrihendri': True,
   'tedcruz': True,
   'presid': True,
   'alway': True,
   'tell': True,
   'truth': True,
   'said': True,
   'would': True},
  'Neutral'),
 ({'lukewearechang': True,
   'evid': True,
   'hack': True,
   'talk': True,
   'come': True,
   'syria': True},
  'Negative'),
 ({}, 'Negative'),
 ({'tatianak': True,
   'fact': True,
   'wasnt': True,
   'even': True,
   'liter': True,
   'pundit': True,
   'say': True,
   'rememb': True,
   'time': True,
   's

## Train the model 

In [30]:
sentiment_classifier = NaiveBayesClassifier.train(train_bow)

In [31]:
nltk.classify.util.accuracy(sentiment_classifier, train_bow)*100

71.41712729311864

In [46]:
X_train = train_bow
y_train = train_bow
X_train

[({}, 'Negative'),
 ({'jjauthor': True,
   'polici': True,
   'barack': True,
   'obama': True,
   'manag': True,
   'provid': True,
   'neither': True,
   'peac': True,
   'prosper': True,
   'realdonaldtrump': True,
   'tedcruz': True},
  'Negative'),
 ({'dougstanhop': True,
   'hillari': True,
   'come': True,
   'everi': True,
   'angl': True,
   'strike': True,
   'ground': True,
   'pound': True,
   'arm': True,
   'bar': True,
   "'s": True,
   'unstopp': True,
   'think': True,
   'wr': True},
  'Neutral'),
 ({'lrihendri': True,
   'tedcruz': True,
   'presid': True,
   'alway': True,
   'tell': True,
   'truth': True,
   'said': True,
   'would': True},
  'Neutral'),
 ({'lukewearechang': True,
   'evid': True,
   'hack': True,
   'talk': True,
   'come': True,
   'syria': True},
  'Negative'),
 ({}, 'Negative'),
 ({'tatianak': True,
   'fact': True,
   'wasnt': True,
   'even': True,
   'liter': True,
   'pundit': True,
   'say': True,
   'rememb': True,
   'time': True,
   's

In [42]:
from sklearn.neural_network import MLPClassifier

SK = MLPClassifier(alpha=0.1,hidden_layer_sizes=(10), random_state=1)
SK.fit(X_train, Y_train)

ValueError: Expected 2D array, got 1D array instead:
array=[{} 'Negative'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [40]:
import keras
from keras.utils.np_utils import to_categorical


train_bow = to_categorical(y_train)
train[:3]

TypeError: int() argument must be a string, a bytes-like object or a number, not 'dict'

In [35]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras import optimizers

N = train_bow[1]
H = 100
K = 10

model = Sequential()
model.add(Dense(H, input_dim=N))
model.add(Activation("tanh"))
model.add(Dense(K))
model.add(Activation("softmax"))

model.compile(optimizer=optimizers.SGD(lr=0.1),
              loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(train_bow[0], train_bow[1], epochs=15, batch_size=32);

TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got value '({'jjauthor': True, 'polici': True, 'barack': True, 'obama': True, 'manag': True, 'provid': True, 'neither': True, 'peac': True, 'prosper': True, 'realdonaldtrump': True, 'tedcruz': True}, 'Negative')' with type '<class 'tuple'>'.