Reading Data from file

In [1]:
import csv
import json
import math
from collections import Counter

from nltk.tokenize import TweetTokenizer

def read_hate_tweets(annofile, jsonfile, test_size=3250, tokenizer=None):
    """Read annotated tweets, tokenize them and split into train/test lists.

    Args:
        annofile: Path to a comma-separated file whose first column is the
            tweet id and whose second column is the annotation label.
        jsonfile: Path to a file holding one JSON object per line; each
            object must provide the keys 'id_str' and 'text'.
        test_size: Number of items (taken from the end of the id-sorted
            list) that form the test split. If fewer items than this are
            available, all of them go to the test split.
        tokenizer: Object with a ``tokenize(text)`` method. Defaults to a
            fresh ``TweetTokenizer()``.

    Returns:
        A tuple ``(train_dt, test_dt)``. Each element is a list of
        ``(tokens, label)`` pairs, where label is "nonoffensive" when the
        annotation is 'none' and "offensive" otherwise.
    """
    annos = {}
    # Ids seen with contradictory labels. They must stay excluded even if
    # the id shows up again later in the file.
    conflicting = set()
    with open(annofile, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) < 2:
                # Blank or truncated line: nothing to record.
                continue
            twt_id, rating = row[0], row[1]
            if twt_id in conflicting:
                continue
            if twt_id in annos:
                # if duplicate with different rating, remove!
                if rating != annos[twt_id]:
                    del annos[twt_id]
                    conflicting.add(twt_id)
            else:
                annos[twt_id] = rating

    tknzr = TweetTokenizer() if tokenizer is None else tokenizer

    all_data = {}
    # Distinct handle name so the `jsonfile` path argument is not shadowed.
    with open(jsonfile, encoding='utf-8') as tweetfile:
        for line in tweetfile:
            if not line.strip():
                continue
            twtjson = json.loads(line)
            twt_id = twtjson['id_str']
            if twt_id in annos:
                all_data[twt_id] = {
                    'offensive': "nonoffensive" if annos[twt_id] == 'none' else "offensive",
                    'text_tok': tknzr.tokenize(twtjson['text']),
                }

    # split training and test data (ordered by tweet id string):
    items = [(all_data[twt_id]['text_tok'], all_data[twt_id]['offensive'])
             for twt_id in sorted(all_data)]
    # Clamp at 0: a negative index would silently produce a nonsensical split.
    splititem = max(0, len(items) - test_size)
    train_dt = items[:splititem]
    test_dt = items[splititem:]
    print('Training data:', len(train_dt))
    print('Test data:', len(test_dt))

    return (train_dt, test_dt)

# Relative paths to the tweet JSON dump and to the annotation CSV.
TWEETS_TEXT = '../Data/NAACL_SRW_2016_tweets.json'
TWEETS_ANNO = '../Data/NAACL_SRW_2016.csv'

# Load the corpus and split it; the call prints the size of each split.
train_data, test_data = read_hate_tweets(TWEETS_ANNO, TWEETS_TEXT)


Training data: 12896
Test data: 3250


In [None]:
class NaiveBayes(object):
    """Naive Bayes classifier scored on bigrams and on words from `neg_vocab`.

    Build a trained instance with `NaiveBayes.train(data, k)`. All learned
    parameters live on the returned instance, so several independently
    trained classifiers can coexist.

    Attributes:
        C: Collection of class labels.
        V: Lower-cased vocabulary with stop words removed.
        Bi: Mapping whose keys are the bigrams known from training.
        log_prior: class label -> log prior probability.
        log_likelihood: class label -> {'bigram': {...}, 'neg_word': {...}}
            holding smoothed log likelihoods.
    """

    # Class-level defaults, kept so attribute access on the class itself
    # still works. They are never mutated; `train` fills per-instance copies.
    C = []
    V = []
    log_prior = {}
    log_likelihood = {}
    Bi = {}

    def __init__(self):
        # Counters named like confusion-matrix cells. Nothing in this class
        # updates them; presumably evaluation code elsewhere does (verify).
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0
        # Fresh per-instance containers so instances never share state.
        self.C = []
        self.V = []
        self.log_prior = {}
        self.log_likelihood = {}
        self.Bi = {}

    def predict(self, x):
        """Predicts the class for a document.

        Args:
            x: A document, represented as a list of words.

        Returns:
            The predicted class, represented as a string, or None when the
            classifier knows no classes (i.e. it has not been trained).
        """
        # NOTE(review): unknown words are filtered *before* lower-casing,
        # while V holds lower-cased words (see `train`). If `remove_unknown`
        # is a plain membership filter, capitalised tokens are dropped here.
        # Confirm against the helper before changing the order.
        known = remove_unknown(self.V, x)
        words = [word.lower() for word in known]

        # Pad a separate list: the unigram pass below must only see real
        # words, not the sentence-boundary markers.
        padded = ["<s>"] + words + ["</s>"]
        bigrams_in_x = list(zip(padded, padded[1:]))

        scores = {}
        for cl in self.C:
            class_ll = self.log_likelihood[cl]
            total = self.log_prior[cl]
            for b in bigrams_in_x:
                # Only bigrams seen during training contribute.
                if b in self.Bi:
                    total += class_ll['bigram'][b]
            for word in words:
                if word in neg_vocab:
                    total += class_ll['neg_word'][word]
            scores[cl] = total

        if not scores:
            return None
        # First class with the highest score wins (iteration order of C).
        return max(scores, key=scores.get)

    @classmethod
    def train(cls, data, k=1):
        """Train a new classifier on training data using maximum
        likelihood estimation and additive smoothing.

        Args:
            cls: The Python class representing the classifier.
            data: Training data; each item is indexed as ``item[1]`` for its
                class label and is otherwise consumed by the helper
                functions `vocab`, `bigrams`, `count_for_class` and
                `vocab_for_class`.
            k: The smoothing constant.

        Returns:
            A trained classifier, an instance of `cls`.
        """
        num_of_docs = len(data)  # total number of tweets in our case
        classes = set(doc[1] for doc in data)  # set of classes

        # Unique training words, stop words removed, then lower-cased.
        # NOTE(review): stop words are removed before lower-casing; whether
        # capitalised stop words survive depends on `remove_stop_words`.
        vocabulary = set(word.lower() for word in remove_stop_words(vocab(data)))

        bigram = bigrams(data, list(classes))
        (uniq_bigram, sorted_bigram) = bigrams_info(bigram, classes)

        # Loop-invariant part of every smoothing denominator.
        smoothing_mass = k * len(vocabulary)

        log_prior = {}
        log_likelihood = {}
        for cl in classes:
            num_of_docs_in_c = count_for_class(data, cl)
            log_prior[cl] = math.log(num_of_docs_in_c / num_of_docs)

            # Index 0 is used as the class's token list and index 1 as its
            # collection of distinct words.
            class_words = vocab_for_class(data, cl)
            tokens = [word.lower() for word in remove_stop_words(class_words[0])]
            distinct = set(word.lower()
                           for word in remove_stop_words(list(class_words[1])))
            # One counting pass instead of list.count() per feature.
            token_counts = Counter(tokens)

            class_bigram_counts = sorted_bigram[cl]
            bigram_ll = {}
            for b in uniq_bigram:
                seen = class_bigram_counts[b] if b in class_bigram_counts else 0
                # Denominator: occurrences of the bigram's first word in
                # this class plus the smoothing mass.
                denom = token_counts[b[0]] + smoothing_mass
                bigram_ll[b] = math.log((seen + k) / denom)

            # NOTE(review): this denominator uses the number of *distinct*
            # words in the class, not the total token count. Kept as in the
            # original; confirm that this is intended.
            neg_denom = len(distinct) + smoothing_mass
            neg_ll = {}
            for word in neg_vocab:
                neg_ll[word] = math.log((token_counts[word] + k) / neg_denom)

            log_likelihood[cl] = {'bigram': bigram_ll, 'neg_word': neg_ll}

        # Store parameters on the new instance only, so training another
        # classifier later cannot overwrite this one.
        model = cls()
        model.C = classes
        model.V = vocabulary
        model.Bi = uniq_bigram
        model.log_prior = log_prior
        model.log_likelihood = log_likelihood
        return model