Importing libraries

In [25]:
import json
from collections import defaultdict, Counter
import ast
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
# import torch_geometric as tg
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder

from sentence_transformers import SentenceTransformer


Pandas setup

In [3]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 10)  # Limit number of rows displayed
pd.set_option('display.width', 1000)  # Set max width for table
pd.set_option('display.colheader_justify', 'center')  # Center-align column headers

Method for cleaning the data

In [4]:
def clean_special_chars(value):
    if isinstance(value, str):  
        return value.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').replace('  ', ' ').strip()
    return value 

Reading gab

In [5]:
content_gab = pd.read_csv('gab_reddit_benchmark/gab.csv')

content_gab["text"] = content_gab["text"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 
content_gab["response"] = content_gab["response"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="")
content_gab["hate_speech_idx"] = content_gab["hate_speech_idx"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 

# content_gab["text"] = content_gab["text"].apply(clean_special_chars)
# content_gab["response"] = content_gab["response"].apply(clean_special_chars)

for index, row in content_gab.iterrows():
    row['text'] = row['text'].replace("'", '"')
    row['response'] = row['response'].replace("'", '"')

# content_gab = content_gab.applymap(clean_special_chars)
print(content_gab.head())
print('\n- - - - - -\n')
print(content_gab.columns)
print('\n- - - - - -\n')
print(content_gab.iloc[1]['text'])

                        id                                              text                        hate_speech_idx                      response                     
0                                  1. 39869714\n  1. i joined gab to remind myself how retarded ...        [1]       ["Using words that insult one group while defe...
1  1. 39845588\n2. \t39848775\n3. \t\t39911017\n  1. This is what the left is really scared of. ...        [3]       ['You can disagree with someones opinion witho...
2                   1. 37485560\n2. \t37528625\n  1. It makes you an asshole.\n2. \tGive it to a...        [2]       ['Your argument is more rational if you leave ...
3                   1. 39787626\n2. \t39794481\n  1. So they manage to provide a whole lot of da...        [2]       ["You shouldn't generalize a specific group or...
4  1. 37957930\n2. \t39953348\n3. \t\t39965219\n  1. Hi there, i,m Keith, i hope you are doing w...        [3]       ['If someone is rude it is better to ignore th..

In [6]:
def mark_text_labels(text_utterances_length, labels):
    if not labels:
        return ['other'] * text_utterances_length
    new_labels = []
    int_list = ast.literal_eval(labels)
    for i in range(text_utterances_length):
        if i+1 in int_list:
            new_labels.append('hate_speech')
        else:
            new_labels.append('other')
    return new_labels

Splitting 'text' and 'response' into individual rows, so that I can construct a graph from it

In [7]:
text_column = []
text_labels_column = []
response_column = []
response_labels_column = []

for index, row in content_gab.iterrows():
    text_utterances = row['text'].split('\n')
    text_utterances = list(filter(None, text_utterances))

    for i, t in enumerate(text_utterances):
        text_utterances[i] = clean_special_chars(t)

    text_labels = mark_text_labels(len(text_utterances), row['hate_speech_idx'])

    response_utterances = ast.literal_eval(row['response']) if row['response'] else []
    for i, r in enumerate(response_utterances):
        response_utterances[i] = clean_special_chars(r)
    response_labels = ['other'] * len(response_utterances)  
    
    text_column.append(text_utterances)
    text_labels_column.append(text_labels)
    response_column.append(response_utterances)
    response_labels_column.append(response_labels)

content_gab['text'] = text_column
content_gab['hate_speech_idx'] = text_labels_column
content_gab['response'] = response_column
content_gab['response_labels'] = response_labels_column

content_gab = content_gab.rename(columns={'hate_speech_idx': 'text_labels'})
print(content_gab.head())
print('- - - - ')
print(content_gab.columns)

for index, row in content_gab.iterrows():
    if index == 0:
        continue
    print(row['id'])
    print(row['text'])
    print(row['text_labels'])
    print(row['response'])
    print(row['response_labels'])
    break

                        id                                              text                                text_labels                               response                         response_labels    
0                                  1. 39869714\n  [1. i joined gab to remind myself how retarded...                [hate_speech]  [Using words that insult one group while defen...  [other, other, other]
1  1. 39845588\n2. \t39848775\n3. \t\t39911017\n  [1. This is what the left is really scared of....  [other, other, hate_speech]  [You can disagree with someones opinion withou...  [other, other, other]
2                   1. 37485560\n2. \t37528625\n  [1. It makes you an asshole., 2. Give it to a ...         [other, hate_speech]  [Your argument is more rational if you leave y...  [other, other, other]
3                   1. 39787626\n2. \t39794481\n  [1. So they manage to provide a whole lot of d...         [other, hate_speech]  [You shouldn't generalize a specific group or ...  [other,

Encoding the labels

In [12]:
label_encoder = LabelEncoder()
content_gab['text_labels_encoded'] = content_gab['text_labels'].apply(label_encoder.fit_transform)
content_gab['response_labels_encoded'] = content_gab['response_labels'].apply(label_encoder.fit_transform)

Creating BERT encoding method

In [8]:
bert = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(sentences):
    if isinstance(sentences, list):
        return bert.encode(sentences, show_progress_bar=True).tolist()
    elif isinstance(sentences, str):
        return bert.encode([sentences], show_progress_bar=True).tolist()
    return []

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating BERT embeddings

In [27]:
content_gab = content_gab[:100]
before = time.time()
content_gab['text_embeddings'] = content_gab['text'].apply(generate_embeddings)
after_text = time.time()
print(content_gab.iloc[1]['text_embeddings'])
print('\nTIME FOR TEXT EMBEDDINGS: ', after_text - before)
print('\n- - - - - -\n')
content_gab['response_embeddings'] = content_gab['response'].apply(generate_embeddings)
after_response = time.time()
print(content_gab.iloc[2]['response_embeddings'])
print('\nTIME FOR RESPONSE EMBEDDINGS: ', after_response - after_text)
print('\n- - - - - -\n')

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.70it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 70.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 142.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 129.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 124.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 133.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 168.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 116.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 261.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 71.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 166.67it/s]


[[-0.02357207052409649, 0.01794441044330597, 0.040565621107816696, 0.06729651242494583, 0.09804011136293411, 0.035114090889692307, 0.06723959743976593, -0.07981333136558533, 0.012592200189828873, -0.06395196169614792, 0.014613348990678787, -0.02868610993027687, 0.0655745416879654, -0.051386862993240356, -0.1029239073395729, 0.015551786869764328, -0.06676267087459564, -0.002904549241065979, -0.027871331200003624, 0.060362834483385086, -0.027235589921474457, 0.026327308267354965, 0.03128805011510849, 0.01742401532828808, 0.01389362197369337, -0.06276202201843262, -0.01378923561424017, -0.015726247802376747, -0.035311464220285416, -0.05476393923163414, 0.013463089242577553, -0.028276393190026283, -0.031202292069792747, -0.054336193948984146, -0.011610069312155247, -0.0412982776761055, 0.106980100274086, -0.0502496212720871, -0.02981392852962017, 0.06209121644496918, -0.017547212541103363, -0.015013362281024456, 0.08790481835603714, 0.07822368294000626, -0.0915985107421875, 0.0481597743928

Batches: 100%|██████████| 1/1 [00:00<00:00, 117.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 142.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 107.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 158.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 125.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 126.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 153.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 131.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 96.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 109.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 124.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 153.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 111.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 158.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 109.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 153.5

[[0.09269341081380844, 0.005344010889530182, -0.021850131452083588, -0.0007700659334659576, 0.048732686787843704, 0.02857472375035286, 0.06419847905635834, -0.009560782462358475, 0.0533612035214901, -0.08790566772222519, -0.014494829811155796, -0.0013366986531764269, 0.09828135371208191, -0.013169709593057632, 0.03653039038181305, 0.08743909001350403, 0.05540042743086815, 0.0015358084347099066, -0.03637179359793663, 0.008946459740400314, -0.08317055553197861, 0.0931687131524086, -0.046798188239336014, -0.02023383416235447, -0.04995937645435333, -0.0417790524661541, -0.01558983139693737, 0.03397413715720177, -0.014390988275408745, 0.10862996429204941, 0.020122459158301353, -0.013854064978659153, 0.04655632749199867, 0.014802719466388226, 0.0009073952678591013, -0.0037358901463449, 0.007216255180537701, 0.047422830015420914, 0.015439006499946117, 0.013636789284646511, 0.012506755068898201, 0.00898042693734169, -0.002120432211086154, -0.07505079358816147, -0.0013401499018073082, -0.000708




Method for constructing graphs

In [43]:
def construct_graph(row):
    text_utterances = row['text_embeddings']
    response_utterances = row['response_embeddings']

    root = text_utterances[0]
    children = text_utterances[1:] + response_utterances
    num_nodes = len(children) #+1

    # edge_index = torch.tensor(
    #     [[0]*num_nodes, list(range(1, num_nodes)
    # )], dtype=torch.long)
    # edge_index = torch.tensor(
    #     [[0, i] for i in range(1, num_nodes)], dtype=torch.long
    # ).t().contiguous()
    edge_index = torch.tensor(
        [[0] * len(children), list(range(1, num_nodes))], dtype=torch.long
    )
    
    labels = torch.tensor([row['text_labels_encoded'] + row['response_labels_encoded']], dtype=torch.long)

    node_features = torch.tensor([root] + children, dtype=torch.float)
    data = Data(x=node_features, edge_index=edge_index, y=labels)
    return data

Constructing graphs for all rows

In [44]:
graphs = []
for index, row in content_gab.iterrows():
    graphs.append(construct_graph(row))

print(graphs[0])
print('\n- - - - - -\n')
print(f"Number of nodes: {graphs[1].num_nodes}")
print(f"Number of edges: {graphs[1].num_edges}")

ValueError: expected sequence of length 3 at dim 1 (got 2)

Cross-validation 

In [None]:
labels = content_gab['text_labels_encoded']
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=36851234)
folds = list(rskf.split(graphs, labels))

GraphNN model class

In [None]:
class GraphNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GraphNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.conv2(x, edge_index)
        return x

Method for training the NN

In [16]:
def train(model, optimizer, criterion, data):
    for train_idx, test_idx in folds:
        train_graphs = [graphs[i] for i in train_idx]
        test_graphs = [graphs[i] for i in test_idx]

        model.train()
        for data in train_graphs:
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()

        model.eval()
        correct = 0
        total = 0
        for data in test_graphs:
            out = model(data)
            pred = out.argmax(dim=1)
            correct += int((pred == data.y).sum())
            total += len(pred)
    print(f"Accuracy: {correct/total}")

Train the NN

In [None]:
input_dim = graphs[0].x.shape[1]    # embedding dimensionality
hidden_dim = 64
output_dim = len(label_encoder.classes_)

model = GraphNN(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

train(model, optimizer, criterion, graphs)