Reading data from file

In [1]:
import csv
import json
from nltk.tokenize import TweetTokenizer

def read_hate_tweets (annofile, jsonfile):
    """Reads in hate speech data."""
    all_data = {}
    annos = {}
    with open(annofile) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        for row in csvreader:
            if row[0] in annos:
                # if duplicate with different rating, remove!
                if row[1] != annos[row[0]]:
                    del(annos[row[0]])
            else:
                annos[row[0]] = row[1]

    tknzr = TweetTokenizer()
                
    with open(jsonfile) as jsonfile:
        for line in jsonfile:
            twtjson = json.loads(line)
            twt_id = twtjson['id_str']
            if twt_id in annos:
                all_data[twt_id] = {}
                all_data[twt_id]['offensive'] = "nonoffensive" if annos[twt_id] == 'none' else "offensive"
                all_data[twt_id]['text_tok'] = tknzr.tokenize(twtjson['text'])

    # split training and test data:
    all_data_sorted = sorted(all_data.items())
    items = [(i[1]['text_tok'],i[1]['offensive']) for i in all_data_sorted]
    splititem = len(all_data)-3250
    train_dt = items[:splititem]
    test_dt = items[splititem:]
    print('Training data:',len(train_dt))
    print('Test data:',len(test_dt))

    return(train_dt,test_dt)

TWEETS_ANNO = '../Data/NAACL_SRW_2016.csv'
TWEETS_TEXT = '../Data/NAACL_SRW_2016_tweets.json'

(train_data,test_data) = read_hate_tweets(TWEETS_ANNO,TWEETS_TEXT)


Training data: 12896
Test data: 3250


In [2]:
import numpy as np

def build_w2i(data):
    '''
    This function creates a vector of unique words in dataset excluding some stop words.
    
    '''
    vocab = []
    for word in data:
        vocab+=word[0]
    return set(vocab)
def featurize(data):
    
    '''
    This function creates a matrix (X) with rows representing each data instance and columns representing features.
    Features in this case is Vocabulary set. It assigns value 1 if word is present in data instance (tweet) else
    it assigns 0. 
    
    It creates another matrix (Y) with rows representing number of data instances and column represented by number of classes.
    
    '''
    
    X = np.array([[0 for j in range(len(vocab))],]*len(data))
    i=0
    for obj in data:
        tweet = [word.lower() for word in list(obj[0])]
        j=0
        for word in vocab:
            if(word in tweet):
                X[i][j] = 1
            else:
                X[i][j] = 0
            j+=1
        i+=1  
    Y = np.array([[0,1],]*len(data))
    for i in range(len(data)):
        if(data[i][1]=='offensive'):
            Y[i] = [1,0]
    return (X,Y)


In [3]:
vocab = list(build_w2i(train_data))
(x,y) = featurize(train_data)


In [4]:
print(x.shape,y.shape)

(12896, 25892) (12896, 2)


In [29]:
import numpy as np

class LogReg:

    num_class = 0
    def __init__(self, eta=0.01, num_iter=30 , alpha = 0.1):
        self.eta = eta
        self.num_iter = num_iter
        self.alpha = alpha
        
    def softmax(self,x):
        '''
        This function computes softmax value for an array or a matrix

        Input : x is a array or a matrix

        Output : result is either a array of softmax value or a matrix
        '''
        x=x.astype(float)
        if x.ndim==1:
            return np.exp(x)/np.sum(np.exp(x))
        elif x.ndim==2:
            result=np.zeros_like(x)
            M,N=x.shape
            z = x - np.max(x, axis=0, keepdims=True)
            for n in range(N):
                S=np.sum(np.exp(z[:,n]))
                result[:,n]=np.exp(z[:,n])/S
            return result
        else:
            print("The input array is not 1- or 2-dimensional.")
    
    
    def gradient(self,y_pred,Y,X):
        
        '''
        This function computes gradient using calculated output (y_), actual output(y) and input matrix (x)
        x is matrix which contains data instance wrt their weights.
        
        
        It returns improved weights and bias for each features wrt their classes.
        
        '''
        temp = np.subtract(y_pred,Y)
        weight = np.matmul(temp,X)
        bias = np.matmul(temp,np.ones((X.shape)))
        return (weight,bias)
    
    def cost(self,y_pred,Y):
        error = np.sum(-np.multiply(Y,np.log(y_pred,where= y_pred !=0)))
        return error
        
    
    
    def train(self, X, Y):
        '''
        This function trains given dataset by iteratively updating their weights and bias.
        weights is matrix of size (num of classes X num of words in Vocabulary)
        bias is matrix of size (num of classes X 1)
        
        We divide dataset into mini-batches of 100 data instances and update weights and bias at end of each mini batch.
        We repeat this process num_iter times
        
        '''
        # weights initialization
        self.num_class = Y.shape[1]
        self.weights = np.zeros((Y.shape[1],X.shape[1]))
        self.bias = np.zeros((Y.shape[1],X.shape[1]))
        for i in range(self.num_iter):
            ind = np.arange(len(X))
            np.random.shuffle(ind)
            b=np.arange(0,len(X),100)
            np.append(b,len(X))
            loss = 0
            j=0
            for j in range(len(b)-1):
                x = X[ind[b[j]:b[j+1]]]
                y = Y[ind[b[j]:b[j+1]]]
                prob = np.zeros((2,len(x)))
                prob = self.p(x)
                error = self.cost(prob,y.T)
                loss = loss + error/len(x)
                change_in_w, change_in_b = self.gradient(prob,y.T,x)
                #change_in_w = change_in_w/len(b)
                change_in_b = change_in_b/len(b)
                self.weights = self.weights - self.eta*(change_in_w-
                                                        self.alpha*np.square(self.weights - np.max(self.weights, axis=0, keepdims=True)))
                self.bias = self.bias - self.eta*(change_in_b-
                                                  self.alpha*np.square(self.bias - np.max(self.bias, axis=0, keepdims=True)))
                
            print("Loss : ",loss)
        return None
    
    
    def p(self, X): 
        '''
        This function cpmputes probability for each data instance wrt each class.
        
        '''
        temp = np.add(np.matmul(self.weights,X.T),np.matmul(self.bias,np.ones((X.T.shape)))/X.shape[1])
        prob = self.softmax(temp) 
        return prob
    
    def predict(self, X):
        '''
        This function predicts the class to which given data instance belongs to.
        We compare the calculated probability of given instance among all classes.
        Class with max probability is the output.
        
        It also calculates F_1 score.
        '''
        (x_test,y_test) = featurize(X)
        prob=self.p(x_test)
        result=np.argmax(prob,axis=0)
        tp=0
        tn=0
        fn=0
        fp=0
        y=y_test.T
        print(result)
        print(y_test.shape)
        for i in range(len(y_test)):
            if(y[0][i]!=result[i]):
                if(result[i]==0):
                    tp+=1
                else:
                    tn+=1
            else:
                if(result[i]==0):
                    fp+=1
                else:
                    fn+=1
        if(tp+fp!=0):
            pr = tp/(tp+fp)
        if(tp+fn!=0):
            r = tp/(tp+fn)
        if(pr+r!=0):
            f1 = 2*pr*r/(pr+r)
        else:
            f1=0
        acc = (tp+tn)/(tp+tn+fn+fp)
        print("F_1 score is: ",f1)
        print('Accuracy : ',acc)
        
        return None

Without Regularization

In [28]:
lg=LogReg(0.01,num_iter=30)
lg.train(x,y)
lg.predict(test_data)

Loss :  65.53085603966362
Loss :  55.027926569464036
Loss :  50.806736504646906
Loss :  47.88826273968885
Loss :  45.613139790623244
Loss :  43.79848537902119
Loss :  42.24806780312386
Loss :  40.875265538162125
Loss :  39.59293366225303
Loss :  38.59692872093079
Loss :  37.64610807106132
Loss :  36.75380627732457
Loss :  35.891603796436065
Loss :  35.24308269285592
Loss :  34.41081762647342
Loss :  33.711297846487064
Loss :  33.114410161474524
Loss :  32.62912841526994
Loss :  32.14910273364636
Loss :  31.653977789983184
Loss :  31.112044802944244
Loss :  30.55664039602641
Loss :  30.126394200206782
Loss :  29.736913833353178
Loss :  29.31454069867564
Loss :  29.017656499239067
Loss :  28.55701329309474
Loss :  28.17484257554147
Loss :  27.768921871796742
Loss :  27.480284321692736
[1 1 1 ... 1 1 0]
(3250, 2)
F_1 score is:  0.5541195476575121
Accuracy :  0.8301538461538461


With Regularization

In [None]:
lg=LogReg(0.01,num_iter=30,alpha=0.1)
lg.train(x,y)
lg.predict(test_data)

Loss :  65.70102433534275
Loss :  55.461229334512694
Loss :  51.44207179877521
Loss :  48.68328120674818
Loss :  46.69114345609151
Loss :  45.14492824450215
Loss :  44.07408471525496
Loss :  43.08937074834608
Loss :  42.120037123002405
Loss :  41.36572991573463
Loss :  40.73139790778097
Loss :  40.177579141927346
Loss :  39.83388653715795
Loss :  39.3436698737661
Loss :  38.979776546764214
Loss :  38.56366541295866
Loss :  38.26847329120319
Loss :  38.0301649319871
Loss :  37.80359209607566
Loss :  37.487245607885974
Loss :  37.36177972986984
Loss :  37.16145238260284
Loss :  36.991502353890475
Loss :  36.81996334526019
Loss :  36.64317005422535
Loss :  36.55198824161171
Loss :  36.45194666292349
Loss :  36.37616381047975
Loss :  36.23727884664787
Loss :  36.20353963223829
