# Sentiment Analysis 



 

In [16]:
import numpy as np
import nltk
import string
import pandas as pd
import re
from nltk.classify import NaiveBayesClassifier
from sklearn.model_selection import train_test_split



In [17]:
data = pd.read_csv('data.csv', encoding='latin')



In [18]:
data.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'tweet']
data.head()

Unnamed: 0,sentiment,id,date,query,user_id,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [19]:
# we drop useless infos
data = data.drop(['id', 'date', 'query', 'user_id'], axis=1)
tweets = data['tweet']
sentiment = data['sentiment'] 
data.head()

Unnamed: 0,sentiment,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [20]:
lab_to_sentiment = {0:"Negative", 2:"Neutral", 4:"Positive"}
def label_decoder(label):
  return lab_to_sentiment[label]
data.sentiment = data.sentiment.apply(lambda x: label_decoder(x))

data.head()

Unnamed: 0,sentiment,tweet
0,Negative,is upset that he can't update his Facebook by ...
1,Negative,@Kenichan I dived many times for the ball. Man...
2,Negative,my whole body feels itchy and like its on fire
3,Negative,"@nationwideclass no, it's not behaving at all...."
4,Negative,@Kwesidei not the whole crew


## Data Preprocessing 

In [24]:
#Remove all newlines from inside a string
clean_data = [tweet.replace('\n','').strip() for tweet in tweets]
#To remove all whitespaces in the beginning and end of the string
#remove the unicodes for the single left and right quote characters
clean_data[:] = [tweet.replace(u'\u2018',"'").replace(u'\u2019',"'") for tweet in clean_data] 

#convert n't to  not
clean_data[:] = [tweet.replace('n\'t',' not') for tweet in clean_data]  

#remove any sub-string containing 'http'
clean_data[:] = [re.sub(r"^.*http.*$", '', tweet) for tweet in clean_data] 

#remove non-ASCII characters
clean_data[:] = [re.sub(r'[^\x00-\x7F]+','', tweet) for tweet in clean_data] 

clean_data[:] = [re.sub("(@[A-Za-z0–9]+)",'',tweet) for tweet in clean_data ]

#remove tweeter's RT' tags
clean_data[:] = [tweet.replace('RT','') for tweet in clean_data] 

#make all words lower case
clean_data[:] = [tweet.lower() for tweet in clean_data] 



clean_data

['is upset that he ca not update his facebook by texting it... and might cry as a result  school today also. blah!',
 ' i dived many times for the ball. managed to save 50%  the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 " no, it's not behaving at all. i'm mad. why am i here? because i ca not see you all over there.",
 ' not the whole crew',
 'need a hug',
 " hey  long time no see! yes.. rains a bit ,only a bit  lol , i'm fine thanks , how's you ?",
 '_k nope they did not have it',
 ' que me muera ?',
 "spring break in plain city... it's snowing",
 'i just re-pierced my ears',
 ' i could not bear to watch it.  and i thought the ua loss was embarrassing . . . . .',
 '16 it it counts, idk why i did either. you never talk to me anymore',
 " i would've been the first, but i did not have a gun.    not really though, zac snyder's just a doucheclown.",
 ' i wish i got to watch it with you!! i miss you and   how was the premiere?!',
 "hollis' death scene will h

In [25]:
#remove useless words that are common in this dataset dont affect the "positivity" of the tweet 
useless = nltk.corpus.stopwords.words("english") + list(string.punctuation) 


In [26]:
#tokenize and clean up the tweets 
tweets = []
for tweet in clean_data:
    wordlist = [word for word in nltk.word_tokenize(tweet) if word not in useless] #a list of words per tweet
    tweets.append(wordlist)
tweets[0] 

['upset',
 'ca',
 'update',
 'facebook',
 'texting',
 '...',
 'might',
 'cry',
 'result',
 'school',
 'today',
 'also',
 'blah']

### Text Stemming

In [27]:
#stemming
st = nltk.stem.SnowballStemmer('english')
tweets_stemmed = []
for words in tweets:
    stemmed_words = [st.stem(word) for word in words]
    tweets_stemmed.append(stemmed_words)

tweets[:] = tweets_stemmed

In [28]:
posneg=pd.Series(sentiment).value_counts()
posneg

4    800000
0    799999
Name: sentiment, dtype: int64

In [43]:
def build_bow_features(words):
    return {word:True for word in words}

In [55]:

#remove those neutral tweets as I am only interested in neg / pos ones
text_label_pair_list = list(zip(tweets,sentiment))
text_label_pair_list[0]
text_label_pair_list[:] = [tuple for tuple in text_label_pair_list if tuple[1]!='Neutral']

train, test = train_test_split(text_label_pair_list, test_size = .20, random_state=7)

In [56]:
#build a list of tuples (BOW_dict, label) for all tweets
train_bow = [(build_bow_features(tuple[0]), tuple[1]) for tuple in train]
test_bow = [(build_bow_features(tuple[0]), tuple[1]) for tuple in test]



In [57]:
def preprocess(sentence):
    def build_bow_features(words):
        return {word:True for word in words}
    
    sentence = sentence.lower()
    sentence = sentence.replace('\n','')
    useless = nltk.corpus.stopwords.words("english") + list(string.punctuation)
    wordlist = [word for word in nltk.word_tokenize(sentence) if word not in useless]
    stemmed_words = [nltk.stem.SnowballStemmer('english').stem(word) for word in wordlist]
    Bow = [(build_bow_features(stemmed_words))]
    return Bow

preprocess("this is very beautiful and happy ")

[{'beauti': True, 'happi': True}]

In [58]:
train_bow


[({'lol': True,
   'get': True,
   "'s": True,
   'come': True,
   'good': True,
   'way': True,
   'mean': True,
   'talent': True,
   'author': True},
  4),
 ({'would': True, 'miss': True, 'dream': True, 'anymor': True}, 0),
 ({}, 4),
 ({'im': True,
   'sick': True,
   'thing': True,
   'would': True,
   'make': True,
   'feel': True,
   'better': True,
   'palm': True,
   'pre': True,
   'came': True,
   'today': True,
   'lotteri': True,
   '...': True},
  0),
 ({'sound': True, 'like': True, 'cup': True, 'tea': True, 'sign': True}, 4),
 ({'think': True,
   "'m": True,
   'go': True,
   'take': True,
   'hot': True,
   'bath': True,
   'listen': True,
   'lvatt': True},
  4),
 ({'feel': True, 'tire': True, 'today': True, 'go': True, 'gym': True}, 0),
 ({'weekend': True, '...': True, 'rest': True, 'peac': True, '..final': True},
  4),
 ({}, 0),
 ({'say': True, 'll': True, 'cri': True, 'hahah': True}, 0),
 ({'miss': True, 'certain': True, 'someon': True}, 0),
 ({'hi': True,
   'lia': 

In [59]:
print(len(train_bow),len(test_bow))


1279999 320000


In [60]:
train_bow

[({'lol': True,
   'get': True,
   "'s": True,
   'come': True,
   'good': True,
   'way': True,
   'mean': True,
   'talent': True,
   'author': True},
  4),
 ({'would': True, 'miss': True, 'dream': True, 'anymor': True}, 0),
 ({}, 4),
 ({'im': True,
   'sick': True,
   'thing': True,
   'would': True,
   'make': True,
   'feel': True,
   'better': True,
   'palm': True,
   'pre': True,
   'came': True,
   'today': True,
   'lotteri': True,
   '...': True},
  0),
 ({'sound': True, 'like': True, 'cup': True, 'tea': True, 'sign': True}, 4),
 ({'think': True,
   "'m": True,
   'go': True,
   'take': True,
   'hot': True,
   'bath': True,
   'listen': True,
   'lvatt': True},
  4),
 ({'feel': True, 'tire': True, 'today': True, 'go': True, 'gym': True}, 0),
 ({'weekend': True, '...': True, 'rest': True, 'peac': True, '..final': True},
  4),
 ({}, 0),
 ({'say': True, 'll': True, 'cri': True, 'hahah': True}, 0),
 ({'miss': True, 'certain': True, 'someon': True}, 0),
 ({'hi': True,
   'lia': 

## Train the model 

In [69]:
sentiment_classifier = NaiveBayesClassifier.train(train_bow)

In [70]:
accuracy=nltk.classify.util.accuracy(sentiment_classifier, train_bow)*100

In [74]:
print(accuracy)

80.223733768542


In [64]:
import pickle

In [65]:
# Save the Modle to file in the current working directory

Pkl_Filename = "SentimentAnalysisModel2.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(sentiment_classifier, file)