## Sentiment Analysis
In this lab we will apply the Naive Bayes classifier for the sentiment analysis purposes.

In [1]:
import re, string
import math
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Defining all the function to read data, extract features and traing the model

In [3]:
#Read the datasets
path = './data/'
filePrefix = 'training_'
categories=['POS','NEG']
dataset={}
dataset_raw = {}
allFeatures=set()
tot_articles = 0
articles_count={}

N={} # Number of posts in each corpus

for category in categories:
    fileName=path+filePrefix+category.lower()
    f=open(fileName,'r')
    text = ''
    text_raw = ''    
    lines=f.readlines()
    tot_articles+=len(lines)
    articles_count[category] = len(lines)
    dataset_raw[category] = list(map(lambda line: line.lower(), lines))
    
    for line in lines:
        text+=line.replace('\n',' ').lower()
        text_raw = line.lower()
    f.close
    N[category]=len(lines)
    
    tokens = nltk.word_tokenize(text)
    dataset[category] = nltk.Text(tokens)

In [5]:
print (N['POS'])
print (N['NEG'])
print (tot_articles)

990
991
1981


### Applying basic pre-processing

In [8]:
dataset_clean={}

def apply_stopwording(corpus, min_len):
    punctuations = ".,\"-\\/#!?$%\^&\*;:{}=\-_'~()"    
    filtered_corpus = [token for token in corpus if (not token in stopwords.words('english') and len(token)>min_len) and (not token in punctuations)]
    return filtered_corpus

#Let's remove punctuation characters and apply stopwording
for category in categories:
    print ('Processing %s' % category)
    dataset_clean[category] = apply_stopwording(dataset[category],3)
    print (dataset_clean[category])

Processing POS


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [10]:
dataset_final={}
all_words=set()

for category in categories:
    print ('Processing %s' % category)
    dataset_final[category]=dataset_clean[category]
    for token in dataset_final[category]:
        all_words.add(token)

Processing POS
Processing NEG


Calculating the probabilities for each category (you can define any number of categories)

In [11]:
feature_count = {}
category_count = {}
probCat = {}

# Calculate the probabilities for each category
for category in categories:
    probCat[category]=articles_count[category]*1.0/tot_articles
    print ("%s - p(%s)=%s" % (category,category,probCat[category]))

POS - p(POS)=0.49974760222110043
NEG - p(NEG)=0.5002523977788995


Calculating term probabilities 𝑝(𝑡|C) and 𝑝(𝑡)

In [12]:
freqWord = {}
wordCounts = {}

def buildFrequencies(data):
    for category in categories:
        freqWord[category] = FreqDist(data[category])
        wordCounts[category] = len(data[category])


#Generate frequencies: the implementation is very different from the first version
buildFrequencies(dataset_final)

In [16]:
word = 'stiller'
print ("Checking Frequencies for word '%s':" % word)
print ("F('%s'|'POS')=%s" % (word,freqWord['POS'][word]))
print ("F('%s'|'NEG')=%s" % (word,freqWord['NEG'][word]))

Checking Frequencies for word 'stiller':
F('stiller'|'POS')=43
F('stiller'|'NEG')=17


From frequencies to probabilities

In [17]:
def getTermProbability(word):
    count = 0
    total = 0
    for category in categories:
        total += wordCounts[category]
        if word in freqWord[category]:
            count+=freqWord[category][word]
    return count*1.0/total

def getTermCondProbability(word,category):
    count = 0
    total = wordCounts[category]

    if word in freqWord[category]:
        count=freqWord[category][word]
    else:
        #Apply Laplace Smoothing
        count=1
    
    return count*1.0/(total+len(all_words))


In [26]:
word = 'good'
print ("probability for word '%s' - p('%s')=%s" % (word,word,getTermProbability('team')))
print ("probability for word '%s' in POS - p('%s'|'POS')=%s" % (word,word,getTermCondProbability(word,'POS')))
print ("probability for word '%s' in NEG - p('%s'|'NEG')=%s" % (word,word,getTermCondProbability(word,'NEG')))

probability for word 'good' - p('good')=0.00046255813658771495
probability for word 'good' in POS - p('good'|'POS')=0.0030925825435042594
probability for word 'good' in NEG - p('good'|'NEG')=0.0032756721280735065


### Build Naive Bayes Classifier

Here we use the logs of probabilities instead of classic definition of Naive Bayes

In [69]:
def NaiveBayesClassifier(article):
    
    tokens = nltk.word_tokenize(article)
    text = nltk.Text(tokens)
    clean_text = apply_stopwording(remove_punctuation(text), 3)
    words = [w for w in clean_text]
    results={}
    for category in categories:
        pCat = probCat[category]
        pNumerator = 1.0
        idx = 1
        for word in words:
            pN = getTermCondProbability(word,category)
            #pNumerator = pNumerator * pN
            pNumerator+= math.log(pN)
            #print '[%s] p(%s)=%s' % (category, word,pN)
            #idx+=1
            #print '[%s][%s - %s] %s' % (idx,category, word, pNumerator)

        pClassification = pNumerator+math.log(pCat)
        results[category] = pClassification
        #print '[%s] p()=%s' % (category, pClassification)
    
    pMax = -10000000
    predictedClass = ''
    for category in categories:
        if results[category]>pMax:
            pMax = results[category]
            predictedClass = category

    #print ('The article has been assigned to class "%s" with a probability of %s' % (predictedClass,pMax))
    return predictedClass,pMax

In [70]:
article = "I think this business proposition is risky to say the least."
article = "I think this business proposition makes perfect sense."
print (NaiveBayesClassifier(article))

('NEG', -47.44494284739373)


Testing the Classif

In [71]:
def get_string_from_corpus(corpus):
    result=''
    for token in corpus:
        result+=token
        result+=' '
    return result[:-1]

f=open('./data/testing_sentiment.txt','r')
lines=f.readlines()
f.close

correct = 0
total = len(lines)
index = 1

TP=0.0
TN=0.0
FP=0.0
FN=0.0

F=0.0
precision = 0.0
recall = 0.0

test_articles = {}

for line in lines:
    elems = line.split('\t')
    article=elems[0]
    category=elems[1][:-1]

    #Clean up the article and apply normalization
    text = nltk.Text(nltk.word_tokenize(article.lower()))
    clean_article = get_string_from_corpus(apply_stopwording(remove_punctuation(text), 3))
    test_articles[clean_article]=category
    
    predictedCategory,pCategory = NaiveBayesClassifier(str(clean_article))
    
    print ('%s. Prediction[%s] Class[%s]' % (index,predictedCategory,category))
    index+=1
    
    #Calculating quality measures
    if (predictedCategory == category):
        correct+=1
        if (category == categories[1]):
            TP+=1
        else:
            TN+=1
    else:
        if (predictedCategory == categories[1]):
            FN+=1
        else:
            FP+=1

precision = TP/(TP+FP)
recall = TP/(TP+FN)
F=2*(precision*recall)/(precision+recall)

print ('\nThe classifer was correct %s out of %s or %s' % (correct,total,correct*1.0/total))
print ('precision=%s' % precision)
print ('recall=%s' % recall)
print ('F=%s' % F)

1. Prediction[POS] Class[POS]
2. Prediction[POS] Class[POS]
3. Prediction[POS] Class[POS]
4. Prediction[POS] Class[POS]
5. Prediction[POS] Class[POS]
6. Prediction[POS] Class[POS]
7. Prediction[POS] Class[POS]
8. Prediction[POS] Class[POS]
9. Prediction[POS] Class[POS]
10. Prediction[POS] Class[POS]
11. Prediction[NEG] Class[NEG]
12. Prediction[NEG] Class[NEG]
13. Prediction[NEG] Class[NEG]
14. Prediction[NEG] Class[NEG]
15. Prediction[NEG] Class[NEG]
16. Prediction[NEG] Class[NEG]
17. Prediction[NEG] Class[NEG]
18. Prediction[NEG] Class[NEG]
19. Prediction[NEG] Class[NEG]
20. Prediction[NEG] Class[NEG]

The classifer was correct 20 out of 20 or 1.0
precision=1.0
recall=1.0
F=1.0


## Can we use this trained Naive Bayes Classifier for our Tweets?
The idea here is to use a model trained on a different dataset and use it for Tweeters

In [74]:
import re
import nltk
import random
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet

def remove_utf(text):
    return re.sub(r'[^\x00-\x7f]',r' ',text)

def remove_tinyURL(text):
    return re.sub(r'http\S+',r'',text)

path = ".\\data\\Tweet-3000.txt"

tweets = ""
file_input = open (path,"r")
lines = file_input.readlines()
for line in lines:
    tweets += (remove_utf(line.lower()))
file_input.close()

print (len(tweets))

225980


In [41]:
tweet_tokenizer = TweetTokenizer()
tokens_raw = tweet_tokenizer.tokenize(tweets.lower())
tokens = []
for token in tokens_raw:
    if (token == " "):
        continue
    elif (token.startswith('http')):
        tokens.append('URL')
    elif (token.startswith('@')):
        tokens.append('USER_NAME')
    else:
        tokens.append(token)

In [42]:
def get_lemma(token):
    #Return the morphological variant of this word
    lemma = wordnet.morphy(token)

    if lemma is None:
        return token
    else:
        return lemma

lemmas = [get_lemma(token) for token in tokens]

In [45]:
stop_words= set(nltk.corpus.stopwords.words('english'))
tokens_clean = [token for token in lemmas if (len(token)>4 and token not in stop_words)]

In [46]:
def clean_tweets(text):
    tokens_raw = tweet_tokenizer.tokenize(text)
    tokens = []
    for token in tokens_raw:
        if (token == " " or token.startswith('http') or token.startswith('@')):
            continue
        else:
            tokens.append(token)
    
    lemmas = [get_lemma(token) for token in tokens]
    tokens_clean = [token for token in lemmas if (len(token)>4 and token not in stop_words)]
    return tokens_clean

In [91]:
dataset = []
file_input = open (path,"r")
lines = file_input.readlines()
for line in lines:
    dataset.append(remove_utf(line.lower()))
file_input.close()

tweets = [clean_tweets(tweet) for tweet in dataset]

In [92]:
totNeg = 0
totPos = 0
for tweet in tweets:
    #print (tweet)
    predictedCategory,pCategory = NaiveBayesClassifier(str(tweet))
    if (predictedCategory == "POS"):
        totPos = totPos + 1
        print ("POS -"+ tweet)
    else:
        totNeg = totNeg + 1
        r = random.randint(1,101)
        if r<=2:
            print ("NEG - %s" % tweet)
        
print ("Negative Tweeter: %s" % totNeg)
print ("Positive Tweeter: %s" % totPos)

Negative Tweeter: 2982
Positive Tweeter: 18
