# Improve Classification of web articles
In this lab we will improve the performance of our Naive Bayes classifier from the theory learned in class.

In [1]:
import nltk
from nltk import FreqDist
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Preprocessing the articles

Loading the corpus or articles

In [2]:
#Read the datasets
path = 'C:\\tmp\\'
filePrefix = 'training_'
categories=['ARTS','SPORTS']
dataset={}
dataset_raw = {}
allFeatures=set()
tot_articles = 0
articles_count={}

N={} # Number of articles in each corpus

for category in categories:
    fileName=path+filePrefix+category.lower()
    f=open(fileName,'r')
    text = ''
    text_raw = ''    
    lines=f.readlines()
    tot_articles+=len(lines)
    articles_count[category] = len(lines)
    dataset_raw[category] = list(map(lambda line: line.lower(), lines))
    
    for line in lines:
        text+=line.replace('\n',' ').lower()
        text_raw = line.lower()
    f.close
    N[category]=len(lines)
    
    tokens = nltk.word_tokenize(text)
    dataset[category] = nltk.Text(tokens)

Removing Punctuations & Stopwording

In [3]:
dataset_clean={}

def remove_punctuation(corpus):
    punctuations = ".,\"-\\/#!?$%\^&\*;:{}=\-_'~()"    
    filtered_corpus = [token for token in corpus if (not token in punctuations)]
    return filtered_corpus

def apply_stopwording(corpus, min_len):
    filtered_corpus = [token for token in corpus if (not token in stopwords.words('english') and len(token)>min_len)]
    return filtered_corpus

#Let's remove punctuation characters and apply stopwording
for category in categories:
    print 'Processing %s' % category
    dataset_clean[category] = apply_stopwording(remove_punctuation(dataset[category]), 3)
    print dataset_clean[category]

Processing ARTS
['excerpts', 'interview', 'rapper', 'kendrick', 'lamar', 'discusses', 'critically', 'acclaimed', 'album', 'pimp', 'butterfly', 'recent', 'grammy', 'nominations', 'york', 'times', 'critics', 'manohla', 'dargis', 'scott', 'stephen', 'holden', 'list', 'picks', 'oscar-worthy', 'films', 'performances', '2015.', 'caramanica', 'profiles', 'blogger', 'stephen', 'carbone', 'posting', 'spoilers', 'reality', 'show', 'bachelor', 'past', 'four', 'years', 'website', 'realitysteve.com', 'reflects', 'show', 'begins', '20th', 'season', 'carbone', 'continues', 'information', 'sources', 'adam', 'mckay', 'director', 'co-writer', 'caper', 'movie', 'short', 'discusses', 'used', 'humor', 'explain', 'complicated', 'aspects', '2008', 'housing', 'banking', 'crisis', 'zachary', 'woolfe', 'offers', 'highlights', 'season', 'amazon', 'original', 'series', 'mozart', 'jungle', 'starring', 'gael', 'garcia', 'bernal', 'show', 'features', 'behind-the-scenes', 'look', 'semi-fictional', 'york', 'symphony',

Apply Stemming or Lemmatization

In [12]:
dataset_final={}
all_words=set()

def apply_stemming(corpus):
    stemmer = nltk.PorterStemmer()
    normalized_corpus = [stemmer.stem(token) for token in corpus]
    return normalized_corpus

def apply_lemmatization(corpus):
    lemmatizer = nltk.WordNetLemmatizer()
    normalized_corpus = [lemmatizer.lemmatize(token) for token in corpus]
    return normalized_corpus

for category in categories:
    print 'Processing %s' % category
    dataset_final[category] = apply_stemming(dataset_clean[category])
    print dataset_final[category]
    for token in dataset_final[category]:
        all_words.add(token)



Processing ARTS
[u'excerpt', u'interview', u'rapper', u'kendrick', u'lamar', u'discuss', u'critic', u'acclaim', u'album', u'pimp', u'butterfli', u'recent', u'grammi', u'nomin', u'york', u'time', u'critic', u'manohla', u'dargi', u'scott', u'stephen', u'holden', u'list', u'pick', u'oscar-worthi', u'film', u'perform', u'2015.', u'caramanica', u'profil', u'blogger', u'stephen', u'carbon', u'post', u'spoiler', u'realiti', u'show', u'bachelor', u'past', u'four', u'year', u'websit', u'realitysteve.com', u'reflect', u'show', u'begin', u'20th', u'season', u'carbon', u'continu', u'inform', u'sourc', u'adam', u'mckay', u'director', u'co-writ', u'caper', u'movi', u'short', u'discuss', u'use', u'humor', u'explain', u'complic', u'aspect', u'2008', u'hous', u'bank', u'crisi', u'zachari', u'woolf', u'offer', u'highlight', u'season', u'amazon', u'origin', u'seri', u'mozart', u'jungl', u'star', u'gael', u'garcia', u'bernal', u'show', u'featur', u'behind-the-scen', u'look', u'semi-fict', u'york', u'symph

Define all functions needed for the implementation of the Naive Bayes classifier (see "Classifying Web Articles Notebook")

In [5]:
feature_count = {}
category_count = {}
probCat = {}

# Calculate the probabilities for each category
for category in categories:
    probCat[category]=articles_count[category]*1.0/tot_articles
    print ("%s - p(%s)=%s" % (category,category,probCat[category]))

ARTS - p(ARTS)=0.5
SPORTS - p(SPORTS)=0.5


In [6]:
freqWord = {}
wordCounts = {}

def buildFrequencies(dataset):
    for category in categories:
        freqWord[category] = FreqDist(dataset[category])
        wordCounts[category] = len(dataset[category])


#Generate frequencies: the implementation is very different from the first version
buildFrequencies(dataset_final)

print ("Checking Frequencies for word 'team':")
print ("F('team'|'ARTS')=%s" % freqWord['ARTS']['team'])
print ("F('team'|'SPORTS')=%s" % freqWord['SPORTS']['team'])

Checking Frequencies for word 'team':
F('team'|'ARTS')=2
F('team'|'SPORTS')=25


In [7]:
def getTermProbability(word):
    count = 0
    total = 0
    for category in categories:
        total += wordCounts[category]
        if word in freqWord[category]:
            count+=freqWord[category][word]
    return count*1.0/total

def getTermCondProbability(word,category):
    count = 0
    total = wordCounts[category]

    if word in freqWord[category]:
        count=freqWord[category][word]
    else:
        #Apply Laplace Smoothing
        count=1.0/(len(freqWord[category])+len(allFeatures))
    
    return count*1.0/total
    
print ("probability for word 'actress' - p('actress')=%s" % getTermProbability('actress'))
print ("probability for word 'actress' in ARTS - p('actress'|'ARTS')=%s" % getTermCondProbability('actress','ARTS'))
print ("probability for word 'actress' in SPORTS - p('actress'|'SPORTS')=%s" % getTermCondProbability('actress','SPORTS'))

probability for word 'actress' - p('actress')=0.00296256396445
probability for word 'actress' in ARTS - p('actress'|'ARTS')=0.00559227249619
probability for word 'actress' in SPORTS - p('actress'|'SPORTS')=6.47893310113e-07


In [24]:
def NaiveBayesClassifier(article):
    words = [w for w in word_tokenize(article)]
    results={}
    for category in categories:
        pCat = probCat[category]
        pNumerator = 1.0
        for word in words:
            pN = getTermCondProbability(word,category)
            pNumerator*= pN
            #print '[%s] p(%s)=%s' % (category, word,pN)

        pClassification = pNumerator*pCat
        results[category] = pClassification
        #print '[%s] p()=%s' % (category, pClassification)
    
    pMax = 0.0
    predictedClass = ''
    for category in categories:
        if results[category]>pMax:
            pMax = results[category]
            predictedClass = category

    #print ('The article has been assigned to class "%s" with a probability of %s' % (predictedClass,pMax))
    return predictedClass

Testing the Classifier

In [29]:
def get_string_from_corpus(corpus):
    result=''
    for token in corpus:
        result+=token
        result+=' '
    return result[:-1]

f=open('C:\\tmp\\testing.txt','r')
lines=f.readlines()
f.close

correct = 0
total = len(lines)
index = 1

TP=0.0
TN=0.0
FP=0.0
FN=0.0

F=0.0
precision = 0.0
recall = 0.0

test_articles = {}

for line in lines:
    elems = line.split('\t')
    article=elems[0]
    category=elems[1][:-1]
    
    #Clean up the article and apply normalization
    text = nltk.Text(nltk.word_tokenize(article.decode('utf-8').lower()))
    clean_article = get_string_from_corpus(apply_stemming(apply_stopwording(remove_punctuation(text), 3)))
    test_articles[clean_article]=category
    
    predictedCategory = NaiveBayesClassifier(str(clean_article))
    
    print '%s. Prediction[%s] Class[%s]' % (index,predictedCategory,category)
    index+=1
    
    #Calculating quality measures
    if (predictedCategory == category):
        correct+=1
        if (category == categories[1]):
            TP+=1
        else:
            TN+=1
    else:
        if (predictedCategory == categories[1]):
            FN+=1
        else:
            FP+=1

precision = TP/(TP+FP)
recall = TP/(TP+FN)
F=2*(precision*recall)/(precision+recall)

print ('\nThe classifer was correct %s out of %s or %s' % (correct,total,correct*1.0/total))
print 'precision=%s' % precision
print 'recall=%s' % recall
print 'F=%s' % F

1. Prediction[SPORTS] Class[SPORTS]
2. Prediction[SPORTS] Class[SPORTS]
3. Prediction[ARTS] Class[SPORTS]
4. Prediction[SPORTS] Class[SPORTS]
5. Prediction[SPORTS] Class[SPORTS]
6. Prediction[SPORTS] Class[ARTS]
7. Prediction[ARTS] Class[ARTS]
8. Prediction[ARTS] Class[ARTS]
9. Prediction[SPORTS] Class[ARTS]
10. Prediction[SPORTS] Class[ARTS]

The classifer was correct 6 out of 10 or 0.6
precision=0.8
recall=0.571428571429
F=0.666666666667


## Comparing our results with the NLTK Naive Bayes Classifier

The input for the NLTK Naive Bayes Classifier is slightly different from what we used before and it is in the form of [(featureset,label)], with the featureset a dictionary and label is our category.

In [14]:
import collections

def get_featureset(corpus):
    dict=collections.defaultdict(list)
    result = []
    for category in categories:
        result.extend([({token:(token in dataset_final[category]) for token in all_words}, category)])
    return result

inputset = get_featureset(dataset_final)
print inputset

[({u'bad-boy': True, u'four': True, u'skeleton': True, u'whose': True, u'accus': False, u'reshuffl': False, u'sorri': False, u'ilya': True, u'edward': False, u'pride': False, u'void': True, u'rise': True, u'voic': True, u'tenni': False, u'govern': True, u'senil': True, u'school': True, u'tangerin': True, u'krakowski': True, u'reinvent': True, u'xylophonist': True, u'triumph': True, u'miller': False, u'direct': True, u'second': False, u'street': True, u'blue': False, u'aim': True, u'asif': True, u'introspect': True, u'orchestra': True, u'filmmak': True, u'net': False, u'maverick': False, u'specialist': True, u'henrik': False, u'met': False, u'studio': True, u'point-of-view': True, u'path': True, u'coen': True, u'tomsula': False, u'112-120': False, u'songwrit': True, u'jame': True, u'flyer': False, u'darlen': True, u'controversi': True, u'golden': True, u'119-113': False, u'campaign': True, u'newspap': False, u'julia': True, u'brought': False, u'madoff': True, u'unit': False, u'highli': 

Train the NLTK Naive Bayes Classifier

for the format look at http://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis

In [16]:
from nltk.classify import NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(inputset)
nb_classifier.labels()
nb_classifier.show_most_informative_features()

Most Informative Features


Classify our testset

In [34]:
index=1
for article in test_articles.keys():
    testset = {token: (token in article) for token in all_words}
    predictedCategory = nb_classifier.classify(testset)
    print '%s. Prediction[%s] Class[%s]' % (index,predictedCategory,test_articles[article])
    index+=1

1. Prediction[SPORTS] Class[ARTS]
2. Prediction[SPORTS] Class[SPORTS]
3. Prediction[SPORTS] Class[SPORTS]
4. Prediction[SPORTS] Class[ARTS]
5. Prediction[SPORTS] Class[SPORTS]
6. Prediction[SPORTS] Class[ARTS]
7. Prediction[SPORTS] Class[ARTS]
8. Prediction[SPORTS] Class[ARTS]
9. Prediction[SPORTS] Class[SPORTS]
10. Prediction[SPORTS] Class[SPORTS]


Our Classifier does a much better job!