In [1]:
import nltk 
import pandas as pd 
from nltk import ngrams
import numpy as np

In [2]:
X_train_1 = pd.read_csv('../Data/data_1/Training_data/X.csv').iloc[:,1:]
X_test_1  = pd.read_csv('../Data/data_1/Testing_data/X.csv').iloc[:,1:]
y_train_1 = pd.read_csv('../Data/data_1/Training_data/Y.csv').iloc[:,1:]
y_test_1  = pd.read_csv('../Data/data_1/Testing_data/Y.csv').iloc[:,1:]

In [3]:
X_train_2 = pd.read_csv('../Data/data_2/Training_data/X.csv').iloc[:,1:]
X_test_2  = pd.read_csv('../Data/data_2/Testing_data/X.csv').iloc[:,1:]
y_train_2 = pd.read_csv('../Data/data_2/Training_data/Y.csv').iloc[:,1:]
y_test_2  = pd.read_csv('../Data/data_2/Testing_data/Y.csv').iloc[:,1:]

In [4]:
def count_comments(result, comments, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        comments: a list of comments
        ys: a list corresponding to the class of each comment ( 'p' if offensive 'n' otherwise )
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    
    for y, comment in zip(ys, comments):
        for n in [1,2,3,4]:
            commentNgrams = ngrams(comment.split(), n)
            for grams in commentNgrams:
                # define the key, which is the word and label tuple
                pair = (' '.join(grams) , y)

                # if the key exists in the dictionary, increment the count
                if pair in result:
                    result[pair] += 1

                # else, if the key is new, add it to the dictionary and set the count to 1
                else:
                    result[pair] = 1

    return result

In [5]:
freq_1 = count_comments({},X_train_1["text"],y_train_1['offensive/non offensive'])

freq_2 = count_comments({},X_train_2["text"],y_train_2['offensive/non offensive'])

In [6]:
print(len(freq_1))

86662


In [7]:
print(len(freq_2))

225672


In [8]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (ngram, label) to how often the ngram appears
        train_x: a list of comments
        train_y: a list of labels correponding to the comments ('p','n')
    Output:
        logprior: the log prior.
        loglikelihood: the log likelihood of Naive bayes equation.
    '''
    loglikelihood = {}
    logprior = 0

 
    # calculate V, the number of unique ngrams in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_offensive and N_non_offensive
    N_offensive = N_non_offensive = 0
    for pair in freqs.keys():
        # if the comment is offensive (p)
        if pair[1] == "p":

            # Increment the number of offensive ngrams by the count for this (ngram, label) pair
            N_offensive += freqs.get(pair,0)

        # else, the comment is non offensive
        else:

            # increment the number of non offensive ngrams by the count for this (ngram,label) pair
            N_non_offensive += freqs.get(pair,0)

    # Calculate C, the number of comments
    C = len(train_y)

    # Calculate C_offensive, the number of offensive comments
    C_offensive = 0
    for index in range(len(train_y)) :
        if train_y.iloc[index]['offensive/non offensive'] == "p":
            C_offensive += 1

    # Calculate C_non_offensive, the number of non offensive comments
    C_non_offensive = C - C_offensive

    # Calculate logprior
    logprior = np.log(C_offensive/C_non_offensive)

    # For each word in the vocabulary...
    for word in vocab:
        # get the offensive and non offensive frequency of the word
        freq_offensive = freqs.get((word,"p"),0)
        freq_non_offensive = freqs.get((word,"n"),0)

        # calculate the probability that each word is offensive, and non offensive
        p_w_offensive = (freq_offensive + 1 ) / (C_offensive + V)
        p_w_non_offensive = (freq_non_offensive + 1 ) / (C_non_offensive + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_offensive) - np.log(p_w_non_offensive)


    return logprior, loglikelihood


In [9]:
logprior_1, loglikelihood_1 = train_naive_bayes(freq_1, X_train_1, y_train_1)
print(f'logprior_1 : {logprior_1}')
print(f'loglikelihood_1 : {len(loglikelihood_1)}')

logprior_1 : 0.23677167881556502
loglikelihood_1 : 84159


In [10]:
logprior_2, loglikelihood_2 = train_naive_bayes(freq_2, X_train_2, y_train_2)
print(f'logprior_2 : {logprior_2}')
print(f'loglikelihood_2 : {len(loglikelihood_2)}')

logprior_2 : 0.29254771147111763
loglikelihood_2 : 218517


In [11]:
def naive_bayes_predict(comment, logprior, loglikelihood):
    '''
    Input:
        comment: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the comment (if found in the dictionary) + logprior (a number)

    '''
    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior 

    for n in [1,2,3,4]:
        commentNgrams = ngrams(comment.split(), n)
        for grams in commentNgrams:
            word = ' '.join(grams)
            # check if the word exists in the loglikelihood dictionary
            if word in loglikelihood:
                # add the log likelihood of that word to the probability
                p += loglikelihood.get(word)

    return p


In [12]:
my_comment = "كلب"
p = naive_bayes_predict(my_comment, logprior_1, loglikelihood_1)
print('The expected output is', p)

The expected output is 1.6200513572142514


In [13]:
my_comment = "حنونتي"
p = naive_bayes_predict(my_comment, logprior_2, loglikelihood_2)
print('The expected output is', p)

The expected output is -1.0971343292547198


In [14]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of comments
        test_y: the corresponding labels for the list of comments
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of comments classified correctly)/(total # of comments)
    """
    accuracy = 0  # return this properly

    y_hats = []
    for index in range(len(test_x)):
        comment = test_x.iloc[index]["text"]
        
        for index in range(len(test_y)):
            if test_y.iloc[index]['offensive/non offensive'] == "n":
                test_y.iloc[index]['offensive/non offensive'] = 0
            else :
                test_y.iloc[index]['offensive/non offensive'] = 1
        
        # if the prediction is > 0
        if naive_bayes_predict(comment, logprior, loglikelihood) > 0:
            # the predicted class is 1 "Offensive"
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0 "Non Offensive"
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(y_hats - test_y['offensive/non offensive']))

    # Accuracy is 1 minus the error
    accuracy = 1- error

    return accuracy


In [15]:
print("Data 1 : ")
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(X_test_1, y_test_1, logprior_1, loglikelihood_1)))

Data 1 : 
Naive Bayes accuracy = 0.6503


In [16]:
print("Data 2 : ")
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(X_test_2, y_test_2, logprior_2, loglikelihood_2)))

Data 2 : 
Naive Bayes accuracy = 0.8112
