In [None]:
#imports
import os
import re
from os import environ
from os import getcwd
from random import sample

import Levenshtein as lev
import numpy as np
import pandas as pd
import stanfordnlp
from stanfordnlp.server import CoreNLPClient

#set environment variable
#point CORENLP_HOME at the CoreNLP distribution unpacked under the working directory; 
#os.path.join picks the separator of the host OS instead of hard-coding Windows backslashes
cwd = getcwd()
environ['CORENLP_HOME'] = os.path.join(cwd, 'corenlp', 'stanford-corenlp-full-2018-10-05')

In [None]:
#this import is separate in case you are running the client without using a gpu
import torch

In [None]:
#define custom functions for convenience and saving lines of code

#add final results of algorithm to results dataframe
def addResults(results, rlabel, rmessageOG, rcleanMessage, rWarning, rRatio, rClass): 
    """Append one classified message to the results dataframe, in place.

    The row is written under the temporary index label -1 and every index label
    is then shifted up by one, so the newest row always ends up with label 0
    while the physical row order stays oldest-first.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with six columns, filled positionally in the order of the
        remaining arguments. Mutated in place.
    rlabel, rmessageOG, rcleanMessage, rWarning, rRatio, rClass
        Values for the new row.

    Returns
    -------
    None
    """
    results.loc[-1] = [rlabel, rmessageOG, rcleanMessage, rWarning, rRatio, rClass]
    results.index = results.index + 1
    #no reindex call here: DataFrame.reindex returns a new frame, so calling it without 
    #keeping the result only built and discarded a full copy for every added row

#add detected tagged words
def addTags(tags, tsentence, ttoken, tvalue, toriginal): 
    """Append one tagged word to the tags dataframe, in place.

    The row is written under the temporary index label -1 and every index label
    is then shifted up by one, so the newest row carries label 0 while the
    physical row order stays oldest-first.

    Parameters
    ----------
    tags : pandas.DataFrame
        Frame with four columns, filled positionally in the order of the
        remaining arguments. Mutated in place.
    tsentence, ttoken, tvalue, toriginal
        Values for the new row.

    Returns
    -------
    None
    """
    tags.loc[-1] = [tsentence, ttoken, tvalue, toriginal]
    tags.index = tags.index + 1
    #no reindex call here: DataFrame.reindex returns a new frame, so calling it without 
    #keeping the result only built and discarded a full copy for every added row

#add detected dependencies from detected words 
def addFlags(flags, fsentence, fedge, fsources, fsource_words, ftargets, ftarget_words, fdependencies, forigin):
    """Insert one dependency flag at the top of the flags dataframe, in place.

    The row is written under the temporary index label -1, every index label is
    shifted up by one, and the frame is sorted by index, so the newest flag ends
    up as the first row (label 0).

    Parameters
    ----------
    flags : pandas.DataFrame
        Frame with eight columns, filled positionally in the order of the
        remaining arguments. Mutated in place.
    fsentence, fedge, fsources, fsource_words, ftargets, ftarget_words, fdependencies
        Values describing the dependency edge.
    forigin : str
        Name of the column holding the word on the *other* side of the edge from
        the one that triggered the detection (for example, if the target is a
        tagged term, forigin is 'source words'), so that word can later be
        checked against the amplifier list.

    Returns
    -------
    None
    """
    flags.loc[-1] = [fsentence, fedge, fsources, fsource_words, ftargets, ftarget_words, fdependencies, forigin]
    flags.index = flags.index + 1
    #sort_index(inplace=True) returns None, so its result must not be assigned back to flags
    flags.sort_index(inplace=True)

#add info for detected words from Hatebase data
def addInfo(termInfo, ambiguousHateBaseDB, unambiguousHateBaseDB, amOffAverage, unOffAverage, element):    
    """Return termInfo extended with every Hatebase row whose term equals element.

    Matching is case-insensitive. Rows from the ambiguous table are added first
    and missing 'average_offensiveness' values are filled with amOffAverage;
    rows from the unambiguous table follow and remaining gaps are filled with
    unOffAverage. Finally the whole 'hateful_meaning' column is lower-cased.

    Parameters
    ----------
    termInfo : pandas.DataFrame
        Rows collected so far; not modified, a new frame is returned.
    ambiguousHateBaseDB, unambiguousHateBaseDB : pandas.DataFrame
        Hatebase tables; each needs a 'term' column, and the combined result
        needs 'average_offensiveness' and 'hateful_meaning' columns.
    amOffAverage, unOffAverage : float
        Fallback offensiveness for ambiguous / unambiguous rows without a score.
    element : str
        Term to look up.

    Returns
    -------
    pandas.DataFrame
        Extended copy of termInfo with a fresh 0..n-1 index.
    """
    wanted = element.lower()
    for hateBaseDB, fallbackAverage in ((ambiguousHateBaseDB, amOffAverage), \
                                        (unambiguousHateBaseDB, unOffAverage)): 
        matches = hateBaseDB.loc[hateBaseDB['term'].str.lower() == wanted]
        #pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        termInfo = pd.concat([termInfo, matches], ignore_index=True, sort=False)
        #assign the filled column back instead of calling fillna(inplace=True) on a column 
        #selection, which is not guaranteed to write through to the frame
        termInfo['average_offensiveness'] = termInfo['average_offensiveness'].fillna(fallbackAverage)
    termInfo['hateful_meaning'] = termInfo['hateful_meaning'].str.lower() 
    return termInfo

In [None]:
#Hatebase terms flagged as ambiguous: table, term column, and mean offensiveness score
ambiguousHateBaseDB = pd.read_csv("total_ambiguous_results.csv", index_col=False)
ambiguousHateTermsList = ambiguousHateBaseDB['term']
amOffAverage = ambiguousHateBaseDB['average_offensiveness'].mean()

#Hatebase terms flagged as unambiguous: table, term column, and mean offensiveness score
unambiguousHateBaseDB = pd.read_csv('total_unambiguous_results.csv', index_col=False)
unambiguousHateTermsList = unambiguousHateBaseDB['term']
unOffAverage = unambiguousHateBaseDB["average_offensiveness"].mean()

#amplifier words: table and term column
amplifiersDB = pd.read_csv('noswearing_trim_data.csv', index_col=False) 
amplifiersList = amplifiersDB['term']

In [None]:
#load the three tweet tables: the hate-only and clean-only subsets are read with 
#index_col=False, the full labelled set with pandas' default index handling
hateMessagesDB, cleanMessagesDB = (
    pd.read_csv(csvName, index_col=False)
    for csvName in ("DavidsonHateOnly.csv", "DavidsonCleanOnly.csv")
)
fullMessagesDB = pd.read_csv('davidson_labeled_data_replaced.csv')

In [None]:
#empty table that receives one row per processed tweet
results = pd.DataFrame(
    columns=[
        'label', 
        'message', 
        'clean', 
        'warning', 
        'hate_ratio', 
        'classOG', 
    ]
)

## Run Client

In [None]:
#tweet table the client iterates over; change the right-hand side to run on a different dataset
currentDB = fullMessagesDB

#dependency relations that are skipped when flagging because they are unhelpful
exDeps = ['punct', 'compound']

#global thresholds
cutoff = 0.95 #minimum jaro similarity for a word to count as a match
offCutoff = 90 #first offensiveness cutoff (original note also lists 75 - TODO confirm which is current)
offCutoff2 = 270 #second offensiveness cutoff (original note also lists 225 - TODO confirm which is current)

In [None]:
#Main classification loop. For every tweet in currentDB: annotate it with CoreNLP, fuzzy-match its words 
#against the Hatebase and amplifier lists, collect dependency edges that touch a tagged word, sum an 
#offensiveness score, and store a label in results 
#(0 = clean, 1 = warning, 2 = censor the offending words, 3 or more = block the whole message). 
with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'], timeout=50000, memory='24G') as client: 
    for m in range (0, currentDB.shape[0]): #for each tweet in currentDB: 
        messageOG =  currentDB.loc[m, 'tweet'] #select current tweet 
        messageVoteRatio = currentDB.loc[m,'hate_speech'] / currentDB.loc[m,'count'] #retrieve original vote ratio of hate vs total
        messageClass = currentDB.loc[m, 'class'] #retrieve original class given
        
        print('\nOriginal Message: ', messageOG, '\n')
        document = client.annotate(messageOG) #run nlp on current tweet
        sentences = document.sentence #shortened here for convenience
        
        
        #create dataframes to be used for analysis of current tweet
        tags = pd.DataFrame(columns = ['sentence', 'token', 'term', 'original'])
        termInfo = pd.DataFrame(columns = ambiguousHateBaseDB.columns.values.tolist()) 
        flags = pd.DataFrame(columns = ['sentence', 'edge', 'source', 'source words', 'target', 'target words',\
                                        'dependencies', 'modifier'])
        
        #create variables to be used for analysis of current tweet
        tempTwoList = []
        splitTermList = []
        messageArray = messageOG.split()
        cleanMessage = ''
        label = 0 #0 = clean, 1 = warning, 2 = block certain words, 3 = block entire message
        loopBool = False 
        offScore = 0
        
        
        #add tags for two word terms
        for a in range(0, len(messageArray)-1): 
            tString = messageArray[a] + ' ' + messageArray[a+1]
            for element in unambiguousHateTermsList: 
                if lev.jaro(tString.lower(), element.lower()) >= cutoff: 
                    label = label + 2
                    tempTwoList.append(tString.lower().split())
                    termInfo = addInfo(termInfo, ambiguousHateBaseDB, unambiguousHateBaseDB, amOffAverage, unOffAverage, element.lower()) 
            for element in ambiguousHateTermsList: 
                if lev.jaro(tString.lower(), element.lower()) >= cutoff: 
                    tempTwoList.append(tString.lower().split())
                    termInfo = addInfo(termInfo, ambiguousHateBaseDB, unambiguousHateBaseDB, amOffAverage, unOffAverage, element.lower()) 
        
        for element in termInfo['term']: 
            splitTermList.append(element.split())
        
        
        #add tags for single word terms
        for s in range (0, len(sentences)): 
            for t in range(0, len(sentences[s].token)): 
                tokenValue = sentences[s].token[t].value
                tokenLower = tokenValue.lower()
                if any(tokenLower in sublist for sublist in tempTwoList): 
                    #word belongs to an already detected two word term
                    addTags(tags, s, t, tokenLower, tokenValue)
                else: 
                    #only the first match counts, checked in the order unambiguous -> ambiguous -> amplifier; 
                    #stop scanning as soon as one is found instead of walking the remaining lists
                    loopBool = False
                    for element in unambiguousHateTermsList: 
                        if lev.jaro(tokenLower, element.lower()) >= cutoff: 
                            label = label + 2
                            addTags(tags, s, t, element.lower(), tokenValue)
                            loopBool = True
                            break
                    if not loopBool: 
                        for element in ambiguousHateTermsList: 
                            if lev.jaro(tokenLower, element.lower()) >= cutoff: 
                                addTags(tags, s, t, element.lower(), tokenValue)
                                loopBool = True
                                break
                    if not loopBool: 
                        for element in amplifiersList: 
                            if lev.jaro(tokenLower, element.lower()) >= cutoff: 
                                addTags(tags, s, t, element.lower(), tokenValue)
                                loopBool = True
                                break
                            
                            
        #detect dependencies for tagged words
        if np.isnan(tags['sentence'].max()): #if no tags were detected
            tsLength = 0
        else: 
            tsLength = int(tags['sentence'].max()+1)
        for s in range(0, tsLength): #for each sentence up to the last one a tag was detected
            tTags = tags.loc[(tags['sentence'] == s)] #select all tags for current sentence
            #tTags does not change while the edges of this sentence are scanned, so build the lookups once
            taggedTokens = set(tTags['token'])
            taggedTerms = set(tTags['term'])
            for e in range(0, len(sentences[s].basicDependencies.edge)): #for each edge generated for current sentence 
                current = sentences[s].basicDependencies.edge[e] #created for abbreviation 
                if current.dep not in exDeps: #if the dependency detected is not one of the ones we choose to ignore (punct, etc)
                    #edge endpoints are compared as index-1 against the 0-based token positions stored in tags
                    if current.source-1 in taggedTokens: #if the dependencies source token has been tagged
                        tTarget = sentences[s].token[current.target-1].value #created for abbreviation
                        if tTarget in taggedTerms: #if dependency target is a tagged word
                            tTarget = tTags.loc[(tTags['token'] == current.target-1)]['term'].iloc[0] #created for abbreviation
                        addFlags(flags, s, e, current.source, tTags.loc[(tTags['token'] == current.source-1)]['term'].iloc[0], current.target, \
                                 tTarget, current.dep, 'target words') #add data from dependency to flags
                    elif current.target-1 in taggedTokens: #if the dependencies target token has been tagged 
                        tSource = sentences[s].token[current.source-1].value #created for abbreviation
                        if tSource in taggedTerms: #if dependency source is a tagged word
                            tSource = tTags.loc[(tTags['token'] == current.source-1)]['term'].iloc[0] #created for abbreviation
                        addFlags(flags, s, e, current.source, tSource, current.target, \
                                 tTags.loc[(tTags['token'] == current.target-1)]['term'].iloc[0], current.dep, \
                                 'source words') #add data from dependency to flags
                        
                        
        #retrieve full data on file for problem words 
        for index, row in tags.iterrows(): 
            #check for no duplicates of two word terms
            if not any(row['term'] in sublist for sublist in splitTermList): 
                #add info from HateBase DB to termInfo
                termInfo = addInfo(termInfo, ambiguousHateBaseDB, unambiguousHateBaseDB, amOffAverage, unOffAverage, row['term']) 
                
                
        #check and sum offensiveness for terms (for those that have one)
        offScore = np.nansum(termInfo.loc[termInfo['is_unambiguous'] == False]['average_offensiveness'])
        #the False unambiguous is to make sure we're not double counting unambiguous terms when they already auto get their label boosted. 
        
        
        #boost the score when the word on the other side of a flagged dependency is an amplifier
        #`x in series` tests the index labels, not the values, so membership is checked against a set of the terms
        knownTerms = set(termInfo['term'])
        for index, row in flags.iterrows(): #for each flag
            for element in amplifiersList: #for each element in amplifiers
                if lev.jaro(row[row['modifier']], element.lower()) >= cutoff: 
                    #.iloc[0] takes the first matching row; .loc[0, ...] would need index label 0 to be among the matches
                    if row['modifier'] == 'source words' and row['target words'] in knownTerms: 
                        offScore += termInfo.loc[termInfo['term'] == str(row['target words']), 'average_offensiveness'].iloc[0]
                    elif row['modifier'] == 'target words' and row['source words'] in knownTerms:
                        offScore += termInfo.loc[termInfo['term'] == str(row['source words']), 'average_offensiveness'].iloc[0]
        
        if offScore >= offCutoff2: 
            label += 2
        elif offScore >= offCutoff: 
            label += 1
                    
                    
        #begin creating a list of descriptions for offending words   
        meaningList = termInfo['hateful_meaning'].tolist()
        meaningList = [x.lower() for x in meaningList if str(x) != 'nan'] 
        meaningModify = []

        for meaning in meaningList: #named meaning so the tweet index m of the outer loop is not overwritten
            #this is to trim out descriptions with multiple sections in the form of [1]...[2]...etc and only select the first part of the description 
            if ']' in meaning: 
                temp = meaning.split(']')[1].split('[')[0].lstrip().rstrip()
                meaningModify.append(temp.split('.')[0].replace('.', '').split(',')[0])
            else: 
                meaningModify.append(meaning.lstrip().rstrip().split('.')[0].replace('.', '').split(',')[0])
                
             
        #create a description for offensive words that are not given one in the Hatebase DB
        aboutIndex = []
        aboutArray = ['nationality', 'ethnicity', 'religion', 'gender', 'sexual orientation', 'disability', 'class']

        #select the rows without a description by index label (DataFrame.append was removed in pandas 2.0)
        missingRows = []
        for index, row in termInfo.iterrows(): 
            thateful = str(row['hateful_meaning'])
            if thateful == 'nan' or thateful == '': 
                missingRows.append(index)
        tTermInfo = termInfo.loc[missingRows]
        
        if not tTermInfo.empty: 
            if True in set(tTermInfo['is_about_nationality']): aboutIndex.append(0) 
            if True in set(tTermInfo['is_about_ethnicity']): aboutIndex.append(1) 
            if True in set(tTermInfo['is_about_religion']): aboutIndex.append(2)
            if True in set(tTermInfo['is_about_gender']): aboutIndex.append(3)
            if True in set(tTermInfo['is_about_sexual_orientation']): aboutIndex.append(4)
            if True in set(tTermInfo['is_about_disability']): aboutIndex.append(5) 
            if True in set(tTermInfo['is_about_class']): aboutIndex.append(6) 

        for a in range(0, len(aboutIndex)): 
            meaningModify.append('a person\'s ' + aboutArray[aboutIndex[a]])
            
            
        #change to a set to eliminate any possible duplicate descriptions
        meaningModifyOG = meaningModify #maintain all original meanings for replacement later
        meaningModify = set(meaningModify)
        meaningModify = list(meaningModify)
        
        
        #join the descriptions in a readable manner. 
        meaningString = " and ".join([", ".join(meaningModify[:-1]), meaningModify[-1]] if len(meaningModify) > 2 else meaningModify) 
        
        meaningString = meaningString + '.' #add a period to the end of the description. 
        
        
        if label == 0: 
            print('This message is clean of hate speech.')
            rwarning = 'This message is clean of hate speech.'
            addResults(results, label, messageOG, cleanMessage, rwarning, messageVoteRatio, messageClass)
            print(messageOG)
            
            
        if label == 1: 
            print('WARNING: This message may contain some hate speech about', meaningString, "\n") 
            rwarning = 'WARNING: This message may contain some hate speech about '+meaningString
            addResults(results, label, messageOG, cleanMessage, rwarning, messageVoteRatio, messageClass)
            print(messageOG)
            
            
        if label == 2: 
            print('WARNING: Parts of this message have been censored due to hate speech about', meaningString) 
            cleanArray = []
            count = 0
            cleanBool = False
            
            #run through message checking for offending words and replacing them with their meanings
            #NOTE(review): meaningModifyOG[count] assumes one description per replaced word; terms without a 
            #hateful_meaning share a single "a person's ..." entry, so count can run past the end of the list 
            #and raise IndexError - decide on a fallback text before relying on this for large runs
            for e in range(0, len(messageArray)):                 
                cleanBool = False
                #check for single word tags
                cleanE = messageArray[e].replace('.', '').replace('\"', '')
                for index, row in tags.iterrows(): 
                    if cleanE == row['original'] and row['term'] in set(termInfo['term']) and cleanBool == False:  
                        tmeaning = meaningModifyOG[count]
                        if tmeaning [0:2] == 'a ': 
                            tmeaning = tmeaning[2:]
                        cleanArray.append("\"" + tmeaning + "\"")
                        count = count+1
                        cleanBool = True
                #check for two word terms from termInfo
                #the last pair starts at len(messageArray)-2, so the bound is len-1 (same range as the tagging loop)
                if e < len(messageArray)-1 and cleanBool == False: 
                    tString = messageArray[e] + ' ' + messageArray[e+1]
                    if tString.replace('.', '').replace('\"', '') in set(termInfo['term']): 
                        tmeaning = meaningModifyOG[count]
                        if tmeaning [0:2] == 'a ': 
                            tmeaning = tmeaning[2:]
                        cleanArray.append("\"" + tmeaning + "\"") #use the trimmed text, as the single word branch does
                        count = count+1
                    else: 
                        cleanArray.append(messageArray[e])
                #if this part of messageArray isn't tagged add the original message part
                elif cleanBool == False: 
                    cleanArray.append(messageArray[e])
            
            #add spaces to cleanMessage created above to make it readable
            cleanMessage = ' '.join([str(elem) for elem in cleanArray]) + '.'
            
            rwarning = 'WARNING: Parts of this message have been censored due to hate speech.'
            addResults(results, label, messageOG, cleanMessage, rwarning, messageVoteRatio, messageClass)
            print(cleanMessage) 
            
            
        if label >= 3: 
            print('This message has been blocked due to hate speech about', meaningString) 
            rwarning = 'This message has been blocked due to hate speech about '+ meaningString
            addResults(results, label, messageOG, cleanMessage, rwarning, messageVoteRatio, messageClass)
            
            
#reported once, after every tweet has been processed and the client context has been closed
print('Client Finished')

In [None]:
#(rows, columns) of the results table; one row is added per processed tweet
results.shape

In [None]:
#how many messages received each label value assigned by the client
results.label.value_counts()

In [None]:
#how many messages fall into each original class carried over from the tweet data
results.classOG.value_counts()
#Class 0 = Hate Speech, Class 1 = Offensive Language, Class 2 = Neither

In [None]:
#distribution of the original hate_speech / count vote ratio across the processed messages
results.hate_ratio.value_counts()

In [None]:
#cross-tabulate the client's label (rows) against the original vote ratio (columns)
pd.crosstab(results.label,results.hate_ratio)

In [None]:
#write the results table to full_results.csv in the working directory
results.to_csv('full_results.csv')