In [None]:
#imports
import stanfordnlp
import pandas as pd
import numpy as np

#import re

import Levenshtein as lev
from stanfordnlp.server import CoreNLPClient

from os import getcwd
from os import environ

#point the CoreNLP client at the bundled CoreNLP distribution
#os.path.join is used instead of hard-coded '\\' separators so the path is also
#valid on non-Windows machines (on Windows the resulting path is unchanged)
from os import path

cwd = getcwd()
environ['CORENLP_HOME'] = path.join(cwd, 'corenlp', 'stanford-corenlp-full-2018-10-05')

In [None]:
#this import is separate in case you are running the client without using a gpu
import torch

In [None]:
#define custom functions for convenience and saving lines of code

#add final results of algorithm to results dataframe
def add_results(results, rlabel, rmessage_OG, rclean_message, rWarning, rRatio, rClass):
    """Append one processed message to the ``results`` frame, in place.

    The row is first written under the temporary index label -1 and every label
    is then shifted up by one. As a result the rows stay in insertion order
    positionally while the index labels run in descending order (the newest row
    always has label 0).

    Parameters, in column order of ``results``:
        rlabel         -- label assigned by the algorithm
        rmessage_OG    -- original message text
        rclean_message -- censored message text (may be an empty string)
        rWarning       -- warning text shown for the message
        rRatio         -- original hate vote ratio of the message
        rClass         -- original class of the message

    Returns None; ``results`` is modified in place.
    """
    results.loc[-1] = [rlabel, rmessage_OG, rclean_message, rWarning, rRatio, rClass]
    results.index = results.index + 1
    #the former trailing results.reindex(...) call was removed: its return value
    #was discarded, so it never changed the frame and only cost time on every row

#add detected tagged words
def add_tags(tags, tsentence, ttoken, tvalue, toriginal):
    """Append one tagged word to the ``tags`` frame, in place.

    Parameters, in column order of ``tags``:
        tsentence -- index of the sentence containing the word
        ttoken    -- index of the token inside that sentence
        tvalue    -- term the word was matched to
        toriginal -- the word as it appears in the message

    The row is written under the temporary label -1 and all labels are then
    shifted up by one, so rows keep insertion order while the index labels
    descend (newest row has label 0). Returns None.
    """
    tags.loc[-1] = [tsentence, ttoken, tvalue, toriginal]
    tags.index = tags.index + 1
    #the former trailing tags.reindex(...) call was removed: its result was
    #discarded, so it had no effect on the frame

#add detected dependencies from detected words 
def add_flags(flags, fsentence, fedge, fsources, fsource_words, ftargets, ftarget_words, fdependencies, forigin):
    """Append one dependency edge that touches a tagged word to ``flags``, in place.

    Parameters, in column order of ``flags``:
        fsentence     -- index of the sentence the edge belongs to
        fedge         -- index of the edge inside the sentence
        fsources      -- source token number of the edge
        fsource_words -- word (or matched term) at the source
        ftargets      -- target token number of the edge
        ftarget_words -- word (or matched term) at the target
        fdependencies -- dependency relation of the edge
        forigin       -- name of the column holding the *other* end of the edge,
                         i.e. the opposite of what triggered the detection. If the
                         target is the detected term this is 'source words', so the
                         amplifier check can later read the modifying word directly.

    The row is written under the temporary label -1 and all labels are then
    shifted up by one, so rows keep insertion order while the index labels
    descend (newest row has label 0). Returns None.
    """
    flags.loc[-1] = [fsentence, fedge, fsources, fsource_words, ftargets, ftarget_words, fdependencies, forigin]
    flags.index = flags.index + 1
    #the former trailing flags.reindex(...) call was removed: its result was
    #discarded, so it had no effect on the frame

#add info for detected words from Hatebase data
def add_info(term_info, ambiguous_hate_base_DB, unambiguous_hate_base_DB, am_off_average, un_off_average, element):
    """Return ``term_info`` extended with every Hatebase row whose term equals ``element``.

    The comparison is case-insensitive. Matching rows of the ambiguous table are
    added first, then matching rows of the unambiguous table. After each step any
    missing 'average_offensiveness' is filled with the average of the table that
    was just added (``am_off_average`` / ``un_off_average``). Finally the
    'hateful_meaning' column is lower-cased.

    The frame passed in is not modified; callers must use the returned frame.
    """
    wanted = element.lower()

    #pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    ambiguous_hits = ambiguous_hate_base_DB.loc[ambiguous_hate_base_DB['term'].str.lower() == wanted]
    term_info = pd.concat([term_info, ambiguous_hits], ignore_index=True)
    #assign the filled column back instead of fillna(inplace=True) on a column
    #selection, which is not guaranteed to write through to the frame
    term_info['average_offensiveness'] = term_info['average_offensiveness'].fillna(am_off_average)

    unambiguous_hits = unambiguous_hate_base_DB.loc[unambiguous_hate_base_DB['term'].str.lower() == wanted]
    term_info = pd.concat([term_info, unambiguous_hits], ignore_index=True)
    term_info['average_offensiveness'] = term_info['average_offensiveness'].fillna(un_off_average)

    term_info['hateful_meaning'] = term_info['hateful_meaning'].str.lower()
    return term_info

In [None]:
#Hatebase terms flagged as ambiguous: table, term column and mean offensiveness
ambiguous_hate_base_DB = pd.read_csv('total_ambiguous_results.csv', index_col=False)
ambiguous_hate_terms_list = ambiguous_hate_base_DB['term']
am_off_average = ambiguous_hate_base_DB['average_offensiveness'].mean()

#Hatebase terms flagged as unambiguous: same three pieces
unambiguous_hate_base_DB = pd.read_csv('total_unambiguous_results.csv', index_col=False)
unambiguous_hate_terms_list = unambiguous_hate_base_DB['term']
un_off_average = unambiguous_hate_base_DB['average_offensiveness'].mean()

#amplifier terms
amplifiers_DB = pd.read_csv('noswearing_trim_data.csv', index_col=False)
amplifiers_list = amplifiers_DB['term']

#note: the three *_list names are pandas Series (one column each), not Python lists

In [None]:
#tweet data sets: the full labelled set plus the hate-only and clean-only subsets
full_messages_DB = pd.read_csv("davidson_labeled_data_replaced.csv")
hate_messages_DB = pd.read_csv('davidson_hate_only.csv', index_col=False)
clean_messages_DB = pd.read_csv('davidson_clean_only.csv', index_col=False)

In [None]:
#empty frame that collects one row per processed message (filled by add_results)
result_columns = ['label', 'message', 'clean', 'warning', 'hate_ratio', 'classOG']
results = pd.DataFrame(columns=result_columns)

## Run Client

In [None]:
#tweet set processed by the client; change only this line to switch data sets
current_DB = full_messages_DB

#global thresholds
cutoff = 0.95       #minimum jaro similarity for a word to count as a match
off_cutoff = 90     #summed offensiveness at or above this raises the label by 1
off_cutoff_2 = 270  #summed offensiveness at or above this raises the label by 2

#dependency relations that are skipped because they are unhelpful
ex_deps = ['punct', 'compound']

In [None]:
with CoreNLPClient(annotators=['tokenize','ssplit','lemma', 'pos', 'depparse'], timeout=50000, memory='24G') as client: 
    #annotators used previously: parse, ner, coref
    for m in range(current_DB.shape[0]): #one pass per tweet in current_DB
        #text of the current tweet plus its original vote ratio (hate vs total) and class
        message_OG = current_DB.at[m, 'tweet']
        message_vote_ratio = current_DB.at[m, 'hate_speech'] / current_DB.at[m, 'count']
        message_class = current_DB.at[m, 'class']

        print('\nOriginal Message: ', message_OG, '\n')
        #annotate the tweet; keep a short alias for its sentences
        document = client.annotate(message_OG)
        sentences = document.sentence
        
        
        #working frames for the current tweet (rebuilt empty for every tweet)
        tag_columns = ['sentence', 'token', 'term', 'original']
        hatebase_columns = ambiguous_hate_base_DB.columns.values.tolist()
        flag_columns = ['sentence', 'edge', 'source', 'source words', 'target', 'target words',
                        'dependencies', 'modifier']

        tags = pd.DataFrame(columns = tag_columns)
        t_tags = pd.DataFrame(columns = tag_columns)
        term_info = pd.DataFrame(columns = hatebase_columns)
        t_term_info = pd.DataFrame(columns = hatebase_columns)
        flags = pd.DataFrame(columns = flag_columns)

        #working variables for the current tweet
        temp_two_list = []
        split_term_list = []
        message_array = message_OG.split()
        clean_message = ''
        loop_bool = False
        off_score = 0
        #label meaning: 0 = clean, 1 = warning, 2 = block certain words, 3 = block entire message
        label = 0
        
        
        #look for two word terms: every pair of neighbouring words is compared with both Hatebase lists
        for first_word, second_word in zip(message_array, message_array[1:]):
            t_string = first_word + ' ' + second_word
            pair_lower = t_string.lower()

            #an unambiguous hit raises the label by 2 straight away
            for element in unambiguous_hate_terms_list:
                element_lower = element.lower()
                if lev.jaro(pair_lower, element_lower) >= cutoff:
                    label += 2
                    temp_two_list.append(pair_lower.split())
                    term_info = add_info(term_info, ambiguous_hate_base_DB, unambiguous_hate_base_DB, am_off_average, un_off_average, element_lower)

            #an ambiguous hit is only recorded
            for element in ambiguous_hate_terms_list:
                element_lower = element.lower()
                if lev.jaro(pair_lower, element_lower) >= cutoff:
                    temp_two_list.append(pair_lower.split())
                    term_info = add_info(term_info, ambiguous_hate_base_DB, unambiguous_hate_base_DB, am_off_average, un_off_average, element_lower)

        #word lists of every term found so far; used later to avoid looking the same words up again
        split_term_list.extend(found_term.split() for found_term in term_info['term'])
        
        
        #add tags for single word terms
        #each token gets at most one tag. Words that belong to a detected two word term are
        #tagged as themselves; every other word is compared with the unambiguous list, then
        #the ambiguous list, then the amplifiers, and the first match wins.
        #second element of each pair = how much a match in that list raises the label
        search_order = ((unambiguous_hate_terms_list, 2),
                        (ambiguous_hate_terms_list, 0),
                        (amplifiers_list, 0))

        for s, sentence in enumerate(sentences):
            for t, token in enumerate(sentence.token):
                word = token.value
                word_lower = word.lower()

                if any(word_lower in sublist for sublist in temp_two_list):
                    add_tags(tags, s, t, word_lower, word)
                    continue

                loop_bool = False
                for candidate_terms, label_boost in search_order:
                    for element in candidate_terms:
                        if lev.jaro(word_lower, element.lower()) >= cutoff:
                            label = label + label_boost
                            add_tags(tags, s, t, element.lower(), word)
                            loop_bool = True
                            #stop scanning: previously the flag only suppressed later matches
                            #while every remaining element was still compared
                            break
                    if loop_bool:
                        break
                            
                            
        #detect dependencies for tagged words
        #only sentences up to the last one that contains a tag need to be visited
        ts_length = 0 if tags.empty else int(tags['sentence'].max()) + 1

        for s in range(0, ts_length):
            t_tags = tags.loc[(tags['sentence'] == s)] #all tags of the current sentence

            #token index -> matched term for this sentence (first tag wins if a token has several)
            tagged_terms = {}
            for tag_token, tag_term in zip(t_tags['token'], t_tags['term']):
                tagged_terms.setdefault(tag_token, tag_term)

            sentence_tokens = sentences[s].token
            for e, current in enumerate(sentences[s].basicDependencies.edge):
                if current.dep in ex_deps: #ignored relations (punct, etc)
                    continue

                #edge ends are token numbers starting at 1, tags store indexes starting at 0
                source_index = current.source - 1
                target_index = current.target - 1

                #whether the other end of the edge is tagged is decided by its token index.
                #The earlier text comparison (raw token text against lower-cased terms) missed
                #tagged words that were capitalised or only fuzzily matched, leaving their raw
                #text in the flag so the amplifier check could not recognise them.
                if source_index in tagged_terms:
                    target_word = tagged_terms.get(target_index, sentence_tokens[target_index].value)
                    add_flags(flags, s, e, current.source, tagged_terms[source_index], current.target,
                              target_word, current.dep, 'target words')
                elif target_index in tagged_terms:
                    #the source is not tagged here, so its raw text is stored
                    add_flags(flags, s, e, current.source, sentence_tokens[source_index].value, current.target,
                              tagged_terms[target_index], current.dep, 'source words')
                        
                        
        #look up the Hatebase data for every tagged word
        for tagged_term in tags['term']:
            #words that are part of an already recorded term are skipped to avoid duplicates
            already_recorded = any(tagged_term in sublist for sublist in split_term_list)
            if already_recorded:
                continue
            term_info = add_info(term_info, ambiguous_hate_base_DB, unambiguous_hate_base_DB, am_off_average, un_off_average, tagged_term)
                
                
        #check and sum offensiveness for terms (for those that have one)
        off_score = np.nansum(term_info.loc[term_info['is_unambiguous'] == False]['average_offensiveness'])
        #only terms that are not unambiguous are summed so that unambiguous terms, which
        #already raised the label when they were tagged, are not counted twice here.

        #offensiveness per recorded term, keyed by the lower-cased term (first row wins)
        term_scores = {}
        for known_term, known_score in zip(term_info['term'], term_info['average_offensiveness']):
            term_scores.setdefault(str(known_term).lower(), known_score)

        amplifier_terms = [element.lower() for element in amplifiers_list]

        #amplifier boost: when the word on the other end of a flagged dependency is an
        #amplifier, the offensiveness of the flagged term is added once more.
        #Fixes compared with the previous version of this loop:
        # - "x in term_info['term']" tested the index labels of the Series, not its values,
        #   so the boost could never trigger
        # - ".loc[0, ...]" on the filtered frame only works when the match is row 0
        # - the modifier word was compared without lower-casing it
        # - the boost is applied once per flag, not once per matching amplifier spelling
        for index, row in flags.iterrows():
            modifier_word = str(row[row['modifier']]).lower()
            if not any(lev.jaro(modifier_word, amplifier) >= cutoff for amplifier in amplifier_terms):
                continue
            if row['modifier'] == 'source words':
                flagged_term = str(row['target words']).lower()
            else:
                flagged_term = str(row['source words']).lower()
            if flagged_term in term_scores:
                off_score += term_scores[flagged_term]

        if off_score >= off_cutoff_2: 
            label += 2
        elif off_score >= off_cutoff: 
            label += 1
                    
                    
        #begin creating a list of descriptions for offending words
        #missing meanings (NaN) are dropped, everything else is lower-cased
        meaning_list = [x.lower() for x in term_info['hateful_meaning'].tolist() if str(x) != 'nan']
        meaning_modify = []

        #the loop variable is named "meaning": it used to be "m", which overwrote the
        #tweet index of the surrounding loop
        for meaning in meaning_list:
            #descriptions with several sections in the form [1]...[2]... keep only the first section
            if ']' in meaning:
                meaning = meaning.split(']')[1].split('[')[0]
            #keep the text up to the first full stop, then up to the first comma
            meaning_modify.append(meaning.strip().split('.')[0].split(',')[0])
                
             
        #create a description for offensive words that are not given one in the Hatebase DB
        about_index = []
        about_array = ['nationality', 'ethnicity', 'religion', 'gender', 'sexual orientation', 'disability', 'class']
        #Hatebase flag column for each entry of about_array, in the same order
        about_columns = ['is_about_nationality', 'is_about_ethnicity', 'is_about_religion', 'is_about_gender',
                         'is_about_sexual_orientation', 'is_about_disability', 'is_about_class']

        #rows whose meaning is missing (NaN) or empty, selected in one step with a mask.
        #This replaces growing t_term_info row by row with DataFrame.append, which was
        #removed in pandas 2.0.
        meaning_missing = term_info['hateful_meaning'].astype(str).isin(['nan', ''])
        t_term_info = term_info.loc[meaning_missing]

        if not t_term_info.empty: 
            for position, column in enumerate(about_columns):
                if True in set(t_term_info[column]):
                    about_index.append(position)

        #one generic description per category found
        for position in about_index:
            meaning_modify.append('a person\'s ' + about_array[position])
            
            
        #keep every original description for the word replacement later
        meaning_modify_OG = meaning_modify.copy()
        #remove duplicate descriptions while keeping their first-seen order. Going through
        #a set gave an arbitrary order that could differ from run to run.
        meaning_modify = list(dict.fromkeys(meaning_modify))


        #join the descriptions in a readable manner:
        #more than two -> "a, b and c"; otherwise the entries are joined with " and "
        if len(meaning_modify) > 2:
            meaning_string = ", ".join(meaning_modify[:-1]) + " and " + meaning_modify[-1]
        else:
            meaning_string = " and ".join(meaning_modify)
        meaning_string = meaning_string + '.' #add a period to the end of the description.
        
        
        #label 0: nothing found, the message is stored and shown unchanged
        if label == 0:
            rwarning = 'This message is clean of hate speech.'
            print('This message is clean of hate speech.')
            add_results(results, label, message_OG, clean_message, rwarning, message_vote_ratio, message_class)
            print(message_OG)
        #label 1: the message is shown unchanged together with a warning
        elif label == 1:
            rwarning = 'WARNING: This message may contain some hate speech about '+ meaning_string
            print('WARNING: This message may contain some hate speech about ', meaning_string, "\n")
            add_results(results, label, message_OG, clean_message, rwarning, message_vote_ratio, message_class)
            print(message_OG)
            
            
        #label 2: offending words are replaced by a quoted description
        if label == 2: 
            print('WARNING: Parts of this message have been censored due to hate speech about', meaning_string) 

            clean_array, count, skip_next = [], 0, False
            known_terms = set(term_info['term']) #does not change while walking the message

            #run through message checking for offending words and replacing them with their meanings
            for e in range(0, len(message_array)):
                if skip_next:
                    #second word of a two word term that was just replaced as a whole;
                    #it used to be left behind in the censored message
                    skip_next = False
                    continue

                #single word tags: compare the word without '.' and '"' to the tagged originals
                cleanE = message_array[e].replace('.', '').replace('\"', '')
                single_hit = any(cleanE == original and term in known_terms
                                 for original, term in zip(tags['original'], tags['term']))

                #two word terms from term_info. The bound is len-1 so the last pair of the
                #message is checked as well (len-2 skipped it).
                pair_hit = False
                if not single_hit and e < len(message_array)-1:
                    t_string = message_array[e] + ' ' + message_array[e+1]
                    pair_hit = t_string.replace('.', '').replace('\"', '') in known_terms

                if not (single_hit or pair_hit):
                    #not tagged: keep the original word
                    clean_array.append(message_array[e])
                    continue

                #descriptions are consumed in order. When more words are replaced than
                #descriptions exist, the last description is reused instead of raising
                #IndexError; with no description at all the word becomes "censored".
                if meaning_modify_OG:
                    tmeaning = meaning_modify_OG[min(count, len(meaning_modify_OG)-1)]
                else:
                    tmeaning = 'censored'
                if tmeaning[0:2] == 'a ': #drop a leading article
                    tmeaning = tmeaning[2:]
                #the stripped text is used for both kinds of hit (the two word branch
                #previously appended the unstripped description)
                clean_array.append("\"" + tmeaning + "\"")
                count = count+1
                skip_next = pair_hit

            #add spaces to clean_message created above to make it readable
            clean_message = ' '.join([str(elem) for elem in clean_array]) + '.'

            rwarning = 'WARNING: Parts of this message have been censored due to hate speech.'
            add_results(results, label, message_OG, clean_message, rwarning, message_vote_ratio, message_class)
            print(clean_message) 
            
            
        if label >= 3: 
            print('This message has been blocked due to hate speech about', meaning_string) 
            rwarning = 'This message has been blocked due to hate speech about ' + meaning_string
            add_results(results, label, message_OG, clean_message, rwarning, message_vote_ratio, message_class)
            
            
        print('Client Finished')

In [None]:
#(rows, columns) of the results frame
results.shape

In [None]:
#number of messages per label assigned by the algorithm
results['label'].value_counts()

In [None]:
#number of messages per original class
#Class 0 = Hate Speech, Class 1 = Offensive Language, Class 2 = Neither
results['classOG'].value_counts()

In [None]:
#number of messages per original hate vote ratio
results['hate_ratio'].value_counts()

In [None]:
#assigned label (rows) against original hate vote ratio (columns)
pd.crosstab(results['label'], results['hate_ratio'])

In [None]:
#write the results frame, including its index, to a CSV file in the working directory
results.to_csv('full_results_new.csv')