Reading Data from file

In [1]:
import csv
import json
from nltk.tokenize import TweetTokenizer

def read_hate_tweets (annofile, jsonfile):
    """Reads in hate speech data."""
    all_data = {}
    annos = {}
    with open(annofile) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        for row in csvreader:
            if row[0] in annos:
                # if duplicate with different rating, remove!
                if row[1] != annos[row[0]]:
                    del(annos[row[0]])
            else:
                annos[row[0]] = row[1]

    tknzr = TweetTokenizer()
                
    with open(jsonfile) as jsonfile:
        for line in jsonfile:
            twtjson = json.loads(line)
            twt_id = twtjson['id_str']
            if twt_id in annos:
                all_data[twt_id] = {}
                all_data[twt_id]['offensive'] = "nonoffensive" if annos[twt_id] == 'none' else "offensive"
                all_data[twt_id]['text_tok'] = tknzr.tokenize(twtjson['text'])

    # split training and test data:
    all_data_sorted = sorted(all_data.items())
    items = [(i[1]['text_tok'],i[1]['offensive']) for i in all_data_sorted]
    splititem = len(all_data)-3250
    train_dt = items[:splititem]
    test_dt = items[splititem:]
    print('Training data:',len(train_dt))
    print('Test data:',len(test_dt))

    return(train_dt,test_dt)

TWEETS_ANNO = '../Data/NAACL_SRW_2016.csv'
TWEETS_TEXT = '../Data/NAACL_SRW_2016_tweets.json'

(train_data,test_data) = read_hate_tweets(TWEETS_ANNO,TWEETS_TEXT)


Training data: 12896
Test data: 3250


Naive Bayes Classifier 

In [10]:
class NaiveBayes(object):
    C=[]
    V=[]
    log_prior = {}
    log_likelihood = {}
    Bi = {}
    
    def __init__(self):
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0
        pass

    def predict(self, x):
        
        """Predicts the class for a document.

        Args:
            x: A document, represented as a list of words.

        Returns:
            The predicted class, represented as a string.
        """
        sum_ = {}
        x = remove_unknown(self.V,x)
        x = [word.lower() for word in x]
        y=x 
        x.insert(0,"<s>")
        x.insert(len(x),"</s>")
        bigrams_in_x = [(x[i],x[i+1]) for i in range(len(x)-1)]

        for cl in self.C:
            tot = self.log_prior[cl]
            for b in bigrams_in_x:
                if b in list(self.Bi.keys()) :
                    tot= tot + self.log_likelihood[cl]['bigram'][b]
            for word in y:   
                if word in neg_vocab:
                    tot= tot + self.log_likelihood[cl]['neg_word'][word]   
                    
            sum_[cl] = tot
            tot = 0
        
        if(len(sum_)>0):
            return max(sum_.items(), key=operator.itemgetter(1))[0]
        else:
            return None
        
        
        
    @classmethod
    def train(cls, data, k=1):
        """Train a new classifier on training data using maximum
        likelihood estimation and additive smoothing.

        Args:
            cls: The Python class representing the classifier.
            data: Training data.
            k: The smoothing constant.

        Returns:
            A trained classifier, an instance of `cls`.
        """
        
        num_of_docs = len(data) # total number of tweets in our case
        cls.C = set([word[1] for word in data]) # set of classes
        cls.V = vocab(data) # set of unique words in training data
        cls.V = remove_stop_words(cls.V)
        cls.V = set([word.lower() for word in cls.V])
        words_in_class = {}
        bigram = bigrams(data,list(cls.C))
        (uniq_bigram,sorted_bigram) = bigrams_info(bigram,cls.C)
        cls.Bi = uniq_bigram
        for cl in cls.C:
            num_of_docs_in_c = count_for_class(data,cl)
            cls.log_prior[cl] = math.log(num_of_docs_in_c/num_of_docs)
            words_in_class[cl] = (vocab_for_class(data,cl))
            words_in_class[cl][0] = remove_stop_words(words_in_class[cl][0])
            words_in_class[cl][1] = remove_stop_words(list(words_in_class[cl][1]))
            words_in_class[cl][0] = [word.lower() for word in words_in_class[cl][0]]
            words_in_class[cl][1] = set([word.lower() for word in words_in_class[cl][1]])
            count = {}
            cls.log_likelihood[cl]={'bigram':{},'neg_word':{}}
            for b in list(uniq_bigram.keys()):
                count[b] = sorted_bigram[cl][b] if b in sorted_bigram[cl] else 0
                prefix = b[0]
                vc_class_doc = words_in_class[cl][0].count(prefix) + k*len(cls.V)
                cls.log_likelihood[cl]['bigram'][b] = math.log((count[b]+k)/(vc_class_doc))
            count = {}
            for word in neg_vocab:
                count[word] = words_in_class[cl][0].count(word)
                vc_class_doc = len(words_in_class[cl][0]) + k*len(cls.V)
                cls.log_likelihood[cl]['neg_word'][word] = math.log((count[word]+k)/(vc_class_doc))
        return cls()
    
    
    
    
def count_for_class(data,class_name):
    '''
    This function returns number of docs(tweets) with respect
    to a given input class.

    Parameters
    ----------
    data : List of docs with class
        DESCRIPTION.
    class_name : string.

    Returns
    -------
    count_of_class : int 

    '''
    count_of_class = 0
    for datum in data:
        if(datum[1] == class_name):
            count_of_class+=1
    return count_of_class



def vocab_for_class(data,class_name):
    '''
    This function creates vocabulary for a particular class and returns
    a list of words in a class and set of unique words in that class

    Parameters
    ----------
    data : List of docs with class
        DESCRIPTION.
    class_name : string.


    Returns
    -------
    list
        

    '''
    vocab = []
    for word in data:
        if(word[1] == class_name):
            vocab+=word[0]
    return [vocab,set(vocab)]


def vocab(data):
    '''
    This function creates vocabulary set for a given Document

    Parameters
    ----------
    data : list of documents with labelled classes

    Returns
    -------
    set of unique words in set of Documents

    '''
    vocab = []
    for word in data:
        vocab+=word[0]
    return set(vocab)

def remove_unknown(train_vocab,test_vocab):
    '''
    It removes the words which are present in testing set but not
    in training set.

    Parameters
    ----------
    train_vocab : vocabulary of training set
    test_vocab : vocabulary of testing set

    Returns
    -------
    test_vocab : modified testing vocabulary

    '''
    x = set(test_vocab)
    x = set(train_vocab).intersection(x)
    for word in test_vocab:
        if(word not in x):
            test_vocab = list(filter(lambda w: w != word, test_vocab))
    return test_vocab



Features

In [8]:
neg_vocab = ['died','aggressor','terrifying','Isis','hunting','Savage','Islamofascim','non-believers','genocidal','antisemitism'
            ,'communists','Insane','pedo','bitchy','harassed','dying','Islamofascist','wars','Destruction','smackem','fucktards',
            'monsters','gross','crucify', 'over-sensitive','Crucifixion','psychopaths','feminist','feminazi','HATE','Crimes','mocks',
            'ignorant','murders','hitting','rapists','#IDontNeedFeminism','frauds','punch','hoes','snobby','steal','stripper','kickass',
            'brutal','harassment','execute','Insult','sewers','behead', 'damn','#FuckOff','raped','#Islamists','asshole',  'rats',
             'violence','phony','chick', 'danger', 'stole', 'clowns', '#feminism', 'fools', 'Gags','Nazism','jihadis','atheism', 'morons',
            'filthy', 'rant','inferior', 'Pigs', 'annoying', 'burnt','monger','faggot','#Notsexist','STFU','#sexism','FUCKING','cocks',
           'grossly','tits', 'gruesome','butchers', 'terrorism', 'Trashy','hateful', 'useless', 'semen', 'dumbass', 'garbage','Hatemongers',
            'punching','kills','cum','Satan','boobs','sexual','Terrorists','femininity','Jihad','#Bitches', 'gays','hater','arrogant',
            'egomaniac','fat','fatty', 'hatered','Nazi','trashy','rapist', 'fuckin','fascist', 'Idiots','jihadists','grabbing', 'stalk', 'HOT',
           'raping', 'faggots','dickless','murderer','sucking','fucker','horrible', 'shameful','fuck','Stalked','sassy', 'mantears', 'Islamofasicsm',
             'maniacs','Sewer','Feminazi','nude','weaker','fucking','swine', 'shits','arse','BITCHES', 'Bitch', 'dickweed', 'pedophelia',
            'cocksucker','fetish','sucks','chicks','DUMB','nigger','#YouSuck','SHUTUP','shiting','horny', 'nigga','dick','slaves','Islamolunatic',
            '#IslamLOVESWomen','killer','butts','Hoes','crap', 'penis','hating','vaginas','stab','hatred','pedophile','bigotry'
            ,'barbarity','scum', '#killerblondes']
neg_vocab = set([word.lower() for word in neg_vocab])


stop_words = ["a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would"]
def remove_stop_words(wordlist):
    '''
    It removes stop words like the,is,could,was,....

    Parameters
    ----------
    wordlist : list of words

    Returns
    -------
    vocab : modified list of words

    '''
    vocab = [word for word in wordlist if word not in stop_words]
    return vocab



def bigrams(data,cl):
    '''
    This function creates bigram for with respect to their class.
    It returns a dictionary which contain bigram list for all the classes in data set
    e.g in our case {"offensive":[],"nonoffensive"[]}
    '''
    
    bigrams_in_corpus={}
    for c in cl:
        bigrams_in_corpus[c] = []
        for sent in data:
            if(sent[1]==c):
                sent[0].insert(0,"<s>")
                sent[0].insert(len(sent[0]),"</s>")
                bigrams_in_corpus[c]+= [(sent[0][i].lower(),sent[0][i+1].lower()) for i in range(len(sent[0])-1)]
        
    return bigrams_in_corpus

def bigrams_info(bigrams,cl):
    '''
    This function calculate some information about bigram. 
    1. It creates unique set of bigrams for each class, here  uniq_bi
    2. It combines all unique bigrams into one variabe, uniq
    3. It computes frequency of each bigram in its respective class, bi_freq
    4. It combines frequency list from each class into one variable, uniq_freq
    and finally returns uniq_freq and sorted_freq
    
    '''
    uniq = []
    uniq_bi = {}
    bi_freq={}
    uniq_freq={}
    sorted_freq={}
    for c in cl:
        uniq_bi[c] = list(set(bigrams[c]))
        uniq+=uniq_bi[c]
        bi_freq[c] = {bi : bigrams[c].count(bi) for bi in uniq_bi[c]}
        sorted_freq[c] = {word : value for word,value in sorted(bi_freq[c].items(),key=lambda item : item[1],reverse=True)}
    
    uniq = list(set(uniq))
    uniq_freq = {bi:0 for bi in uniq}
    for c in cl:
        for bi,value in sorted_freq[c].items():
            uniq_freq[bi]+=value
    return (uniq_freq,sorted_freq)


Evalutation

In [11]:
import csv,math,string
import json,operator,random  
def accuracy(classifier, data):
    """Computes the accuracy of a classifier on reference data.


    Args:
        classifier: A classifier.
        data: Reference data.

    Returns:
        The accuracy of the classifier on the test data, a float.
    """
    
    for d in data:
        tweet = d[0]
        actual = d[1]
        predicted = classifier.predict(tweet)
        if ( predicted is None):
            continue 
           
        # Some document(tweets) might be empty after removing unknown and stop words, in that case predict() returns None
        # and continue to next document(tweet)
        
        if(actual == predicted):
            if(predicted == 'offensive'):
                classifier.tp+=1
            else:
                classifier.tn+=1
        else:
            if(predicted == 'offensive'):
                classifier.fp+=1
            else:
                classifier.fn+=1
    return (classifier.tp+classifier.tn)/(classifier.tp+classifier.tn+classifier.fn+classifier.fp)

def f_1(classifier, data):
    """
     Computes the F_1-score of a classifier on reference data.
    

    Args:
        classifier: A classifier.
        data: Reference data.

    Returns:
        The F_1-score of the classifier on the test data, a float.
    """
    
   
   
    precision = classifier.tp/(classifier.tp+classifier.fp)
    recall  = classifier.tp/(classifier.tp+classifier.fn)
    f1 = (2*precision*recall)/(precision+recall)
    return f1

In [12]:
nb = NaiveBayes.train(train_data,0.1)
print("Accuracy: ",accuracy(nb, test_data))
print("F_1: ", f_1(nb,test_data))

Accuracy:  0.8018461538461539
F_1:  0.30303030303030304
