Reading data from file

In [36]:
import csv
import json
from nltk.tokenize import TweetTokenizer

def read_hate_tweets (annofile, jsonfile):
    """Reads in hate speech data."""
    all_data = {}
    annos = {}
    with open(annofile) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        for row in csvreader:
            if row[0] in annos:
                # if duplicate with different rating, remove!
                if row[1] != annos[row[0]]:
                    del(annos[row[0]])
            else:
                annos[row[0]] = row[1]

    tknzr = TweetTokenizer()
                
    with open(jsonfile) as jsonfile:
        for line in jsonfile:
            twtjson = json.loads(line)
            twt_id = twtjson['id_str']
            if twt_id in annos:
                all_data[twt_id] = {}
                all_data[twt_id]['offensive'] = "nonoffensive" if annos[twt_id] == 'none' else "offensive"
                all_data[twt_id]['text_tok'] = tknzr.tokenize(twtjson['text'])

    # split training and test data:
    all_data_sorted = sorted(all_data.items())
    items = [(i[1]['text_tok'],i[1]['offensive']) for i in all_data_sorted]
    splititem = len(all_data)-3250
    train_dt = items[:splititem]
    test_dt = items[splititem:]
    print('Training data:',len(train_dt))
    print('Test data:',len(test_dt))

    return(train_dt,test_dt)

TWEETS_ANNO = '../Data/NAACL_SRW_2016.csv'
TWEETS_TEXT = '../Data/NAACL_SRW_2016_tweets.json'

(train_data,test_data) = read_hate_tweets(TWEETS_ANNO,TWEETS_TEXT)


Training data: 12896
Test data: 3250


In [37]:
import numpy as np

def build_w2i(data):
    '''
    This function creates a vector of unique words in dataset excluding some stop words.
    
    '''
    vocab = []
    for word in data:
        vocab+=word[0]
    return set(vocab)
def featurize(data):
    
    '''
    This function creates a matrix (X) with rows representing each data instance and columns representing features.
    Features in this case is Vocabulary set. It assigns value 1 if word is present in data instance (tweet) else
    it assigns 0. 
    
    It creates another matrix (Y) with rows representing number of data instances and column represented by number of classes.
    
    '''
    
    X = np.array([[0 for j in range(len(vocab))],]*len(data))
    i=0
    for obj in data:
        tweet = [word.lower() for word in list(obj[0])]
        j=0
        for word in vocab:
            if(word in tweet):
                X[i][j] = 1
            else:
                X[i][j] = 0
            j+=1
        i+=1  
    Y = np.array([[0,1],]*len(data))
    for i in range(len(data)):
        if(data[i][1]=='offensive'):
            Y[i] = [1,0]
    return (X,Y)


In [38]:
vocab = list(build_w2i(train_data))
(x,y) = featurize(train_data)


In [106]:
import numpy as np

class LogReg:

    num_class = 0
    def __init__(self, eta=0.01, num_iter=30):
        self.eta = eta
        self.num_iter = num_iter
        
    def softmax(self,x):
        '''
        This function computes softmax value for an array or a matrix

        Input : x is a array or a matrix

        Output : result is either a array of softmax value or a matrix
        '''
        x=x.astype(float)
        if x.ndim==1:
            return np.exp(x)/np.sum(np.exp(x))
        elif x.ndim==2:
            result=np.zeros_like(x)
            M,N=x.shape
            z = x - np.max(x, axis=0, keepdims=True)
            for n in range(N):
                S=np.sum(np.exp(z[:,n]))
                result[:,n]=np.exp(z[:,n])/S
            return result
        else:
            print("The input array is not 1- or 2-dimensional.")
    
    
    def gradient(self,y_pred,Y,X):
        
        '''
        This function computes gradient using calculated output (y_), actual output(y) and input matrix (x)
        x is matrix which contains data instance wrt their weights.
        
        "grad" variable is a matrix with rows representing number of classes (2) and column representing 
         features(words in Vocabulary).
        
        It returns improved weights and bias for each features wrt their classes.
        
        '''
        temp = np.subtract(y_pred,Y)
        weight = np.matmul(temp,X)/len(X)
        bias = np.matmul(temp,np.ones((X.shape)))/len(X)
        return (weight,bias)
    
    def cost(self,y_pred,Y):
        error = np.sum(-np.multiply(Y,np.log(y_pred,where= y_pred !=0)))
        return error
        
    
    
    def train(self, X, Y):
        '''
        This function trains given dataset by iteratively updating their weights and bias.
        weights is matrix of size (num of classes X num of words in Vocabulary)
        bias is matrix of size (num of classes X 1)
        
        We divide dataset into mini-batches of 100 data instances and update weights and bias at end of each mini batch.
        We repeat this process num_iter times
        
        '''
        # weights initialization
        self.num_class = Y.shape[1]
        self.weights = np.zeros((Y.shape[1],X.shape[1]))
        self.bias = np.zeros((Y.shape[1],X.shape[1]))
        for i in range(self.num_iter):
            ind = np.arange(len(X))
            np.random.shuffle(ind)
            b=np.arange(0,len(X),100)
            np.append(b,len(X))
            loss = 0
            j=0
            for j in range(len(b)-1):
                x = X[ind[b[j]:b[j+1]]]
                y = Y[ind[b[j]:b[j+1]]]
                prob = np.zeros((2,len(x)))
                prob = self.p(x)
                error = self.cost(prob,y.T)
                loss = loss + error/len(x)
                change_in_w, change_in_b = self.gradient(prob,y.T,x)
                self.weights = self.weights - self.eta*change_in_w
                self.bias = self.bias - self.eta*change_in_b
                
            print("Loss : ",loss)
        return None
    
    
    def p(self, X): 
        '''
        This function cpmputes probability for each data instance wrt each class.
        
        "prob" is a matrix of size ( num of data instances X num of classes )
        For our particular case first case represent class "nonoffensive" and second column represent "offensive"
        i.e prob[i][0] is probability for y=0 and prob[i][1] is probability for y=1 where y is actual output and
        offensive means 1.
        
        '''
        temp = np.add(np.matmul(self.weights,X.T),np.matmul(self.bias,np.ones((X.T.shape)))/X.shape[1])
        prob = np.zeros((self.weights.shape))
        prob = self.softmax(temp) 
        return prob
    
    def predict(self, X):
        '''
        This function predicts the class to which given data instance belongs to.
        We compare the calculated probability of given instance among all classes.
        Class with max probability is the output.
        
        It also calculates F_1 score.
        '''
        (x_test,y_test) = featurize(X)
        prob=self.p(x_test)
        result=np.argmax(prob,axis=0)
        tp=0
        tn=0
        fn=0
        fp=0
        y=y_test.T
        print(y.shape)
        print(y_test.shape)
        for i in range(len(y_test)):
            if(y[0][i]==result[i]):
                if(result[i]==1):
                    tp+=1
                else:
                    tn+=1
            else:
                if(result[i]==1):
                    fp+=1
                else:
                    fn+=1
        if(tp+fp!=0):
            pr = tp/(tp+fp)
        if(tp+fn!=0):
            r = tp/(tp+fn)
        if(pr+r!=0):
            f1 = 2*pr*r/(pr+r)
        else:
            f1=0
        #acc = (tp+tn)/(tp+tn+fn+fp)
        print("F_1 score is: "+str(f1))
        
        return None

In [110]:
lg=LogReg(0.1,70)
lg.train(x,y)
lg.predict(test_data)

Loss :  75.49172826982984
Loss :  66.82921192987259
Loss :  62.85509751221519
Loss :  60.30692195601471
Loss :  58.42204684456027
Loss :  56.894831460004376
Loss :  55.777390703681306
Loss :  54.597680608866014
Loss :  53.73527502741612
Loss :  52.96797730627935
Loss :  52.158206120374516
Loss :  51.55070595302376
Loss :  50.90302001996976
Loss :  50.35064738803884
Loss :  49.87339678106835
Loss :  49.34264864902467
Loss :  48.88089718262399
Loss :  48.564671761921836
Loss :  48.12753711932712
Loss :  47.727955881343036
Loss :  47.37384158981557
Loss :  47.02218262473904
Loss :  46.652321103569705
Loss :  46.463455715470495
Loss :  46.12508138279073
Loss :  45.832387306386416
Loss :  45.45410039450242
Loss :  45.264218784114505
Loss :  45.04996983120211
Loss :  44.740879514888015
Loss :  44.544603674867744
Loss :  44.282984507927395
Loss :  43.99017382831343
Loss :  43.84317535067141
Loss :  43.60197370043039
Loss :  43.43148872374808
Loss :  43.223792087079175
Loss :  42.9854179036825