## Importing NLTK and Data

In [3]:
from nltk.corpus import twitter_samples
import string
import re
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import nltk
from nltk.tokenize import TweetTokenizer
print (twitter_samples.fileids())
from nltk.corpus import stopwords # if can't import stopwords and twitter_samples use nltk.download('x') once before using them


['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [4]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
print (len(pos_tweets))
 
neg_tweets = twitter_samples.strings('negative_tweets.json')
print (len(neg_tweets))
 
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print (len(all_tweets))
 
for tweet in pos_tweets[:5]:
    print (tweet)

5000
5000
20000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


### Tokenize tweets into words

In [5]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
for tweet in pos_tweets[:5]:
    print (tweet_tokenizer.tokenize(tweet))

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


### Cleaning the Tweets

In [6]:
stopwords_english = stopwords.words('english')
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])
emoticons = emoticons_happy.union(emoticons_sad)
def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
 
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
 
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
 
    return tweets_clean
 
custom_tweet = "RT @Twitter  Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
 
# print cleaned tweet
print (clean_tweets(custom_tweet))
print (pos_tweets[5])
print (clean_tweets(pos_tweets[5]))


['hello', 'great', 'day', 'good', 'morn']
@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
['one', 'irresist', 'flipkartfashionfriday']


### Vectorizing the words

In [7]:
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
 
custom_tweet = "RT @Twitter  Hello There! Have a great day. :) #good #morning http://aditya.in.np"
print (bag_of_words(custom_tweet))
'''
Output:
 
{'great': True, 'good': True, 'morning': True, 'hello': True, 'day': True}
'''
 
# positive tweets feature set
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    
 
# negative tweets feature set
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))
 
print (len(pos_tweets_set), len(neg_tweets_set)) 

{'hello': True, 'great': True, 'day': True, 'good': True, 'morn': True}
5000 5000


### Dividing into train/test set

In [53]:
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]
 
print(len(test_set),  len(train_set))

2000 8000


### Train on set and check accuracy on test set

In [54]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) 
 
print (classifier.show_most_informative_features(10))  

0.722
Most Informative Features
                     via = True              pos : neg    =     39.7 : 1.0
                    glad = True              pos : neg    =     25.7 : 1.0
                     bam = True              pos : neg    =     20.3 : 1.0
                     x15 = True              neg : pos    =     19.7 : 1.0
                     sad = True              neg : pos    =     17.5 : 1.0
                   arriv = True              pos : neg    =     16.1 : 1.0
                      aw = True              neg : pos    =     14.2 : 1.0
               goodnight = True              pos : neg    =     13.0 : 1.0
                  welcom = True              pos : neg    =     12.8 : 1.0
                 appreci = True              pos : neg    =     12.2 : 1.0
None


### Confidence Percentage on a new tweet

In [48]:
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
res = classifier.classify(custom_tweet_set)
# Negative tweet correctly classified as negative
prob_result = classifier.prob_classify(custom_tweet_set)
print ("The text is classified as : ",res ) 
#print (prob_result.max()) 
print ("Confidence that it's negative : ",prob_result.prob("neg")*100, '%') 
print ("Confidence that it's positive : ",prob_result.prob("pos")*100, '%')
 
 
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)
 
#print (classifier.classify(custom_tweet_set)) # Output: pos
# Positive tweet correctly classified as positive
res = classifier.classify(custom_tweet_set)
# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
#
print ("The text is classified as : ",res ) 
#print (prob_result.max())
print ("Confidence that it's negative : ",prob_result.prob("neg")*100, '%') 
print ("Confidence that it's positive : ",prob_result.prob("pos")*100, '%')

The text is classified as :  neg
Confidence that it's negative :  87.41067724918298 %
Confidence that it's positive :  12.589322750816882 %
The text is classified as :  pos
Confidence that it's negative :  0.043688237368539044 %
Confidence that it's positive :  99.95631176263136 %


### Finding F-Score, Precision, Recall

In [49]:
from collections import defaultdict
 
actual_set = defaultdict(set)
predicted_set = defaultdict(set)
 
actual_set_cm = []
predicted_set_cm = []
 
for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)
    predicted_label = classifier.classify(feature) 
    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)
    
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix
 
print ('pos precision:', precision(actual_set['pos'], predicted_set['pos'])) 
print ('pos recall:', recall(actual_set['pos'], predicted_set['pos'])) 
print ('pos F-measure:', f_measure(actual_set['pos'], predicted_set['pos']))
 
print ('neg precision:', precision(actual_set['neg'], predicted_set['neg']))
print ('neg recall:', recall(actual_set['neg'], predicted_set['neg']))
print ('neg F-measure:', f_measure(actual_set['neg'], predicted_set['neg'])) 

pos precision: 0.7380254154447703
pos recall: 0.755
pos F-measure: 0.7464162135442413
neg precision: 0.7492323439099283
neg recall: 0.732
neg F-measure: 0.74051593323217


In [56]:
cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print (cm)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<732>268 |
pos | 245<755>|
----+---------+
(row = reference; col = test)



In [57]:
print (cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <36.6%> 13.4% |
pos |  12.2% <37.8%>|
----+---------------+
(row = reference; col = test)

