In [1]:
import pickle
import shap
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from experiments import HateXplainExperiments
from nltk.tokenize import word_tokenize
import math
from collections import Counter


def counter_cosine_similarity(c1, c2):
    """Return the cosine similarity between two token-count mappings.

    Parameters
    ----------
    c1, c2 : mapping (e.g. ``collections.Counter``)
        Map each token to its count. A token missing from a mapping is
        treated as having a count of 0.

    Returns
    -------
    float
        ``dot(c1, c2) / (|c1| * |c2|)``. When either mapping is empty (or
        contains only zero counts) the similarity is undefined; ``0.0`` is
        returned in that case instead of raising ``ZeroDivisionError``.
    """
    # Only tokens present in c1 can contribute to the dot product.
    dotprod = sum(count * c2.get(term, 0) for term, count in c1.items())
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))

    denominator = magA * magB
    if denominator == 0:
        # At least one vector has no magnitude: nothing overlaps.
        return 0.0
    return dotprod / denominator

def normalize(seq):
    """Scale values so that the largest absolute value becomes 1.

    Parameters
    ----------
    seq : iterable of numbers
        Values to scale. Any iterable is accepted, including generators.

    Returns
    -------
    list of float
        Each value divided by ``max(abs(v))``. An empty input yields ``[]``;
        an input consisting only of zeros yields a list of ``0.0`` values
        (there is nothing to scale by).
    """
    # Materialise once: the values are traversed twice, which would exhaust a generator.
    values = list(seq)
    if not values:
        return []

    extremum = max(abs(v) for v in values)
    if extremum == 0:
        return [0.0] * len(values)
    return [v / extremum for v in values]

In [9]:
# Raw inputs for the toxic-spans evaluation: comments, their annotations and the annotated spans.
comments = pd.read_csv("toxic_spans/data/comments.csv")
annotations = pd.read_csv("toxic_spans/data/annotations.csv")
spans = pd.read_csv("toxic_spans/data/spans.csv")

# Later cells call `vectorizer.transform(...)` and `vectorizer.get_feature_names_out()`, so the
# vectorizer has to be defined here for the notebook to run on a fresh kernel.
# NOTE(review): confirm this pickle is the vectorizer the models below were fitted with, and only
# unpickle files from a trusted source (unpickling can execute arbitrary code).
with open('models/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [3]:
# Join comments -> annotations (on 'comment_id') -> spans (on 'annotation') into one frame.
df = (
    comments
    .merge(annotations, on='comment_id')
    .merge(spans, on='annotation')
)

In [4]:
# Independent working frame holding only the comment text; prediction columns are added to it later.
df_test = df[['comment_text']].copy()

In [5]:
# Number of rows in df_test (one per row of the merged frame) before any filtering.
len(df_test)

46784

### Experiment B-based ensemble model

In [6]:
# Train the experiment-B model through the project's HateXplainExperiments helper.
# The calls below are order-dependent: prepare_properties is run before the training matrix is
# built, and the matrix is then handed to get_performance_metrics.
# NOTE(review): what each helper does internally is not visible in this notebook — see experiments.py.
description = "Exponential function applied to tokens in the TF-IDF vectorised training matrix (1) tagged as not indicative in offensive and hate-speech related human rationales and (2) and not unique to sentences labelled as normal [b]"
hatexplain = HateXplainExperiments(description)
dataset = hatexplain.prepare_dataset()
hatexplain.prepare_properties(dataset)
# Training matrix for experiment B; reused later as the SHAP background data source.
X_train_vect_experiment_B = hatexplain.preprocess_training_data_option_two()
# The returned object is used below via .predict / .predict_proba.
best_algorithm_experiment_B = hatexplain.get_performance_metrics(X_train_vect_experiment_B)

{'nb__alpha': 0.001, 'nb__norm': False, 'sdg__alpha': 0.0001, 'sdg__average': True, 'sdg__class_weight': 'balanced', 'sdg__learning_rate': 'optimal', 'sdg__max_iter': 1000}
{'Description': ['Exponential function applied to tokens in the TF-IDF vectorised training matrix (1) tagged as not indicative in offensive and hate-speech related human rationales and (2) and not unique to sentences labelled as normal [b]'], 'CV score': [0.7456256558590589], 'Accuracy': [0.7339950372208437], 'Balanced accuracy': [0.7175991992197526], 'F1 score': [0.732424495099172], 'Precision': [0.7317787934963752], 'Recall': [0.7339950372208437]}


In [10]:
# Vectorise every comment with the shared `vectorizer` and score it with the experiment-B model.
vectorized_sentences_experiment_B = vectorizer.transform(df_test['comment_text'])
df_test['predictions_experimentB'] = best_algorithm_experiment_B.predict(vectorized_sentences_experiment_B)
# The two predict_proba columns are stored as (normal, toxic).
# NOTE(review): assumes exactly two classes in that column order — confirm against the model's classes_.
df_test[['normal_experimentB', 'toxic_experimentB']] = best_algorithm_experiment_B.predict_proba(vectorized_sentences_experiment_B)

### No-injection-based ensemble model

In [14]:
# Train the baseline ("No injection") model with the same helper class.
# NOTE: `hatexplain` and `dataset` are rebound here, so the experiment-B helper instance is no
# longer reachable under these names after this cell runs.
description = "No injection"
hatexplain = HateXplainExperiments(description)
dataset = hatexplain.prepare_dataset()
# NOTE(review): the experiment-B cell discards prepare_properties' return value, and the SHAP cell
# for this model reads hatexplain.X_train_vect instead of this variable — confirm that
# prepare_properties actually returns the training matrix.
X_train_vect_no_injection = hatexplain.prepare_properties(dataset)
# Called without a training matrix here, unlike the experiment-B cell.
best_algorithm_no_injection = hatexplain.get_performance_metrics()

{'nb__alpha': 2, 'nb__norm': False, 'sdg__alpha': 0.01, 'sdg__average': True, 'sdg__class_weight': 'balanced', 'sdg__learning_rate': 'optimal', 'sdg__max_iter': 1000}
{'Description': ['No injection'], 'CV score': [0.738924729899628], 'Accuracy': [0.7441687344913152], 'Balanced accuracy': [0.7284610646270725], 'F1 score': [0.7427425075862137], 'Precision': [0.7421637386940426], 'Recall': [0.7441687344913152]}


In [15]:
# Score every comment with the no-injection model (same `vectorizer` and same input column as the
# experiment-B prediction cell).
vectorized_sentences_no_injection = vectorizer.transform(df_test['comment_text'])
df_test['predictions_no_injection'] = best_algorithm_no_injection.predict(vectorized_sentences_no_injection)
# The two predict_proba columns are stored as (normal, toxic).
# NOTE(review): assumes exactly two classes in that column order — confirm against the model's classes_.
df_test[['normal_no_injection', 'toxic_no_injection']] = best_algorithm_no_injection.predict_proba(vectorized_sentences_no_injection)

In [16]:
# Keep only comments that both models label as something other than 'normal' AND to which both
# assign a toxic probability of at least 0.9.
both_flagged = (df_test['predictions_experimentB'] != 'normal') & (df_test['predictions_no_injection'] != 'normal')
both_confident = (df_test['toxic_no_injection'] >= 0.9) & (df_test['toxic_experimentB'] >= 0.9)
df_test = df_test[both_flagged & both_confident]
len(df_test)

688

In [18]:
# Draw the evaluation sample with a fixed seed so the row order is reproducible: the SHAP values
# computed later are matched to `random_sample_sentences` purely by position.
# The size is capped at the number of available rows so the cell does not raise if the filtering
# above yields fewer than 688 comments.
df_sampled = df_test.sample(n=min(688, len(df_test)), random_state=42)
random_sample_sentences = list(df_sampled['comment_text'])

In [19]:
def evaluate_cosine_similarity(features, shap_values, threshold, sentences=None, spans_df=None):
    """Compare SHAP-selected tokens with the annotated span for each sentence.

    For every sentence, the vocabulary tokens that occur in it and whose SHAP
    value is at least ``threshold`` are collected, and their bag of words is
    compared (cosine similarity) with the lower-cased tokens of the annotated
    span ``sentence[start:end]``.

    Parameters
    ----------
    features : array-like of str
        Vocabulary; position ``j`` names column ``j`` of the SHAP values.
    shap_values : indexable
        ``shap_values[1][i][j]`` is read as the SHAP value of feature ``j`` for
        sentence ``i``.
        NOTE(review): index 1 is presumably the toxic class — confirm against
        the SHAP version in use and the model's class order.
    threshold : float
        Minimum SHAP value for a token to be kept.
    sentences : list of str, optional
        Sentences aligned by position with ``shap_values``. Defaults to the
        notebook-level ``random_sample_sentences``.
    spans_df : pandas.DataFrame, optional
        Frame with ``comment_text``, ``start`` and ``end`` columns. Defaults to
        the notebook-level ``df``.

    Returns
    -------
    dict
        Maps ``str(i)`` to the cosine similarity for sentence ``i``; ``0.0``
        when either token bag is empty.

    Notes
    -----
    Only the first row of ``spans_df`` matching a sentence is used, even if
    the sentence has several annotated spans.
    """
    if sentences is None:
        sentences = random_sample_sentences
    if spans_df is None:
        spans_df = df

    # Build the token -> column lookup once instead of scanning the vocabulary for every token.
    # setdefault keeps the first position of a token, like list.index would.
    feature_index = {}
    for position, feature in enumerate(features):
        feature_index.setdefault(feature, position)

    cosine_similarity_results = {}

    for i, sentence in enumerate(sentences):
        sentence_shap = shap_values[1][i]

        # Columns of the vocabulary tokens that occur in this sentence (each counted once).
        sentence_idxs = {
            feature_index[token.lower()]
            for token in word_tokenize(sentence)
            if token.lower() in feature_index
        }
        hatexplain_tokens = [
            features[idx] for idx in sentence_idxs if sentence_shap[idx] >= threshold
        ]

        # Ground-truth span: first matching row only, clamped to the sentence bounds.
        matching_rows = spans_df[spans_df['comment_text'] == sentence]
        start = max(0, matching_rows['start'].iloc[0])
        end = min(len(sentence), matching_rows['end'].iloc[0])
        ground_truth_tokens = word_tokenize(sentence[start:end])

        hatexplain_tokens_counter = Counter(hatexplain_tokens)
        ground_truth_results_counter = Counter(token.lower() for token in ground_truth_tokens)

        try:
            cosine_similarity = counter_cosine_similarity(hatexplain_tokens_counter, ground_truth_results_counter)
        except ZeroDivisionError:
            # An empty token bag has no direction; score it as no overlap.
            cosine_similarity = 0.0

        cosine_similarity_results[str(i)] = cosine_similarity

    return cosine_similarity_results

In [22]:
# Explain the experiment-B model with SHAP's KernelExplainer, using 10 rows drawn from its
# training matrix as background data.
explainer = shap.KernelExplainer(best_algorithm_experiment_B.predict_proba, shap.sample(X_train_vect_experiment_B, 10), model_output='probability')
# Rows follow df_sampled order, i.e. the same order as random_sample_sentences.
vectorized_sentences = vectorizer.transform(df_sampled['comment_text'])
shap_values_experiment_B = explainer.shap_values(vectorized_sentences)
# Vocabulary used to map SHAP columns back to tokens.
features_experiment_B = vectorizer.get_feature_names_out()

In [23]:
# Per-sentence similarity for experiment B; 0.02 is the minimum SHAP value for a token to be kept.
results_experiment_B = evaluate_cosine_similarity(features_experiment_B, shap_values_experiment_B, 0.02)  

In [39]:
# Mean cosine similarity over all sampled sentences (experiment B).
np.mean(list(results_experiment_B.values()))

0.4945720040082417

In [34]:
# Explain the no-injection model with SHAP's KernelExplainer, using 10 background rows.
# NOTE(review): the background data comes from hatexplain.X_train_vect rather than the
# X_train_vect_no_injection variable assigned when this model was trained — confirm they are the
# same matrix. `explainer` and `vectorized_sentences` are rebound here.
explainer = shap.KernelExplainer(best_algorithm_no_injection.predict_proba, shap.sample(hatexplain.X_train_vect, 10), model_output='probability')
# Rows follow df_sampled order, i.e. the same order as random_sample_sentences.
vectorized_sentences = vectorizer.transform(df_sampled['comment_text'])
shap_values_no_injection = explainer.shap_values(vectorized_sentences)
# Vocabulary used to map SHAP columns back to tokens.
features_no_injection = vectorizer.get_feature_names_out()

In [36]:
# Per-sentence similarity for the no-injection model, with the same 0.02 SHAP threshold as experiment B.
results_experiment_no_injection = evaluate_cosine_similarity(features_no_injection, shap_values_no_injection, 0.02) 

In [37]:
# Mean cosine similarity over all sampled sentences (no injection).
np.mean(list(results_experiment_no_injection.values()))

0.4673399182499127