Importing libraries

In [48]:
import json
from collections import defaultdict, Counter
import ast
import time
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

import torch
# import torch_geometric as tg
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder

from sentence_transformers import SentenceTransformer

import re
import ast


Pandas setup

In [49]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 10)  # Limit number of rows displayed
pd.set_option('display.width', 1000)  # Set max width for table
pd.set_option('display.colheader_justify', 'center')  # Center-align column headers

Method for cleaning the data

In [50]:
def clean_special_chars(value):
    if isinstance(value, str):  
        return value.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').replace('  ', ' ').strip()
    return value 

Reading gab

In [51]:
content_gab = pd.read_csv('gab_reddit_benchmark/gab.csv')

content_gab["text"] = content_gab["text"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 
content_gab["response"] = content_gab["response"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="")
content_gab["hate_speech_idx"] = content_gab["hate_speech_idx"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 

# content_gab["text"] = content_gab["text"].apply(clean_special_chars)
# content_gab["response"] = content_gab["response"].apply(clean_special_chars)

for index, row in content_gab.iterrows():
    row['text'] = row['text'].replace("'", '"')
    row['response'] = row['response'].replace("'", '"')

# content_gab = content_gab.applymap(clean_special_chars)
print(content_gab.head(n=10))
print('\n- - - - - -\n')
print(content_gab.columns)
print('\n- - - - - -\n')
print(content_gab.iloc[1]['id'])

                          id                                                text                        hate_speech_idx                      response                     
0                                    1. 39869714\r\n  1. i joined gab to remind myself how retarded ...         [1]      ["Using words that insult one group while defe...
1  1. 39845588\r\n2. \t39848775\r\n3. \t\t3991101...  1. This is what the left is really scared of. ...         [3]      ['You can disagree with someones opinion witho...
2                   1. 37485560\r\n2. \t37528625\r\n  1. It makes you an asshole.\r\n2. \tGive it to...         [2]      ['Your argument is more rational if you leave ...
3                   1. 39787626\r\n2. \t39794481\r\n  1. So they manage to provide a whole lot of da...         [2]      ["You shouldn't generalize a specific group or...
4  1. 37957930\r\n2. \t39953348\r\n3. \t\t3996521...  1. Hi there, i,m Keith, i hope you are doing w...         [3]      ['If someone is rude it 

Merge posts

In [52]:

def get_first_number(input_string):
    match = re.search(r'\d{2,}', input_string)
    if match:
        return int(match.group())
    return None

content_gab['extracted_id'] = content_gab['id'].apply(get_first_number)

# Find duplicate rows based on 'extracted_id'
duplicates = content_gab[content_gab.duplicated(subset=['extracted_id'], keep=False)]
filtered_groups = []
grouped = content_gab.groupby('extracted_id')
for key, group in grouped:
    if len(group) > 1:
        filtered_groups.append(group)

merged_df = pd.concat(filtered_groups, ignore_index=True)

merged_df.to_csv('gab_reddit_benchmark/gab_groups.csv', index=False)

merged_df = grouped.agg({
    'id': ' '.join,
    'text': ' '.join,
    'hate_speech_idx': ' '.join,
    'response': ' '.join
}).reset_index()
merged_df.to_csv('gab_reddit_benchmark/gab_merged.csv')

df = pd.read_csv('gab_reddit_benchmark/gab_merged.csv')
df = df.applymap(lambda x: x.replace('] [', ', ') if isinstance(x, str) else x)
df = df.applymap(lambda x: x.replace(']  [', ', ') if isinstance(x, str) else x)
df = df.applymap(lambda x: 'n/a' if isinstance(x, str) and x.strip() == '' else x)
df.to_csv('gab_reddit_benchmark/gab_merged.csv', index=False)

In [53]:
content_gab_m = pd.read_csv('gab_reddit_benchmark/gab_merged.csv')
content_gab_m = content_gab_m.drop('Unnamed: 0', axis=1)
content_gab_m = content_gab_m.drop('extracted_id', axis=1)


content_gab_m["text"] = content_gab_m["text"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 
content_gab_m["response"] = content_gab_m["response"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="")
content_gab_m["hate_speech_idx"] = content_gab_m["hate_speech_idx"].replace(to_replace=[None, np.nan, "", "nan", "n/a"], value="") 

for index, row in content_gab_m.iterrows():
    row['text'] = row['text'].replace("'", '"')
    row['response'] = row['response'].replace("'", '"')

print(content_gab_m.head(n=10))
print('\n- - - - - -\n')
print(content_gab_m.columns)
print('\n- - - - - -\n')
print(content_gab_m.iloc[0]['id'])

                          id                                                text                        hate_speech_idx                      response                     
0  1. 37011807\r\n2. \t37012801\r\n3. \t\t3701333...  1. 64% of Hispanics have IQ scores too low to ...     [2, 2, 3]    ['Racist and hate speech will not be tolerated...
1                    1. 37012081\r\n 1. 37012081\r\n  1. LYING CUNT BITCH !!\r\n 1. LYING CUNT BITCH...        [1, 1]    ['Please refrain from using derogatory terms f...
2                                    1. 37012105\r\n  1. Nigger Kills 2 cops https://www.google.com/...           [1]    ['The N word is highly offensive.   All ethnic...
3                   1. 37012146\r\n2. \t37013020\r\n  1. STABBED IN NECK   arrested but no descr   #...           [2]    ['Use of the n-word is unacceptable in our dis...
4  1. 37012285\r\n2. \t37013809\r\n3. \t37013868\r\n  1. Retard 1st Class  w/ Cluster & Ribbon\r\n2....           [1]    ['We should try and not 

In [54]:
def mark_text_labels(text_utterances_length, labels):
    if not labels:
        # return ['other'] * text_utterances_length
        return [0] * text_utterances_length
    new_labels = []
    int_list = ast.literal_eval(labels)
    for i in range(text_utterances_length):
        if i+1 in int_list:
            # new_labels.append('hate_speech')
            new_labels.append(1)
        else:
            # new_labels.append('other')
            new_labels.append(0)
    return new_labels

Splitting 'text' and 'response' into individual rows, so that I can construct a graph from it

In [55]:
text_column = []
text_labels_column = []
response_column = []
response_labels_column = []

for index, row in content_gab.iterrows():
    text_utterances = row['text'].split('\n')
    text_utterances = list(filter(None, text_utterances))

    for i, t in enumerate(text_utterances):
        text_utterances[i] = clean_special_chars(t)
    text_labels = mark_text_labels(len(text_utterances), row['hate_speech_idx'])

    response_utterances = ast.literal_eval(row['response']) if row['response'] else []
    for i, r in enumerate(response_utterances):
        response_utterances[i] = clean_special_chars(r)
    # response_labels = ['other'] * len(response_utterances)  
    response_labels = [0] * len(response_utterances)  

    
    text_column.append(text_utterances)
    text_labels_column.append(text_labels)
    response_column.append(response_utterances)
    response_labels_column.append(response_labels)

content_gab['text'] = text_column
content_gab['hate_speech_idx'] = text_labels_column
content_gab['response'] = response_column
content_gab['response_labels'] = response_labels_column

content_gab = content_gab.rename(columns={'hate_speech_idx': 'text_labels'})
print(content_gab.head())
print('- - - - ')
print(content_gab.columns)

for index, row in content_gab.iterrows():
    if index == 1:
        continue
    print(row['id'])
    print(row['text'])
    print(row['text_labels'])
    print(row['response'])
    print(row['response_labels'])
    break

                          id                                                text                        text_labels                      response                       extracted_id response_labels
0                                    1. 39869714\r\n  [1. i joined gab to remind myself how retarded...         [1]  [Using words that insult one group while defen...    39869714       [0, 0, 0]  
1  1. 39845588\r\n2. \t39848775\r\n3. \t\t3991101...  [1. This is what the left is really scared of....   [0, 0, 1]  [You can disagree with someones opinion withou...    39845588       [0, 0, 0]  
2                   1. 37485560\r\n2. \t37528625\r\n  [1. It makes you an asshole., 2. Give it to a ...      [0, 1]  [Your argument is more rational if you leave y...    37485560       [0, 0, 0]  
3                   1. 39787626\r\n2. \t39794481\r\n  [1. So they manage to provide a whole lot of d...      [0, 1]  [You shouldn't generalize a specific group or ...    39787626       [0, 0, 0]  
4  1. 37957930\

Encoding the labels

In [56]:
# label_encoder = LabelEncoder()
# content_gab['all_labels'] = content_gab['text_labels'] + content_gab['response_labels']
# content_gab['all_labels_encoded'] = content_gab['all_labels'].apply(label_encoder.fit_transform)
# print(content_gab.iloc[0])
# content_gab['text_labels_encoded'] = content_gab['text_labels'].apply(label_encoder.fit_transform)
# content_gab['response_labels_encoded'] = content_gab['response_labels'].apply(label_encoder.fit_transform)

Creating BERT encoding method

In [57]:
bert = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(sentences):
    if isinstance(sentences, list):
        return bert.encode(sentences, show_progress_bar=True).tolist()
    elif isinstance(sentences, str):
        return bert.encode([sentences], show_progress_bar=True).tolist()
    return []

Generating BERT embeddings

In [58]:
content_gab = content_gab[:200]
before = time.time()
content_gab['text_embeddings'] = content_gab['text'].apply(generate_embeddings)
after_text = time.time()
print(content_gab.iloc[1]['text_embeddings'])
print('\nTIME FOR TEXT EMBEDDINGS: ', after_text - before)
print('\n- - - - - -\n')
content_gab['response_embeddings'] = content_gab['response'].apply(generate_embeddings)
after_response = time.time()
print(content_gab.iloc[2]['response_embeddings'])
print('\nTIME FOR RESPONSE EMBEDDINGS: ', after_response - after_text)
print('\n- - - - - -\n')

Batches: 100%|██████████| 1/1 [00:00<00:00, 53.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 63.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 81.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 132.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 114.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 931.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 102.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 78.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 78.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 398.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 71.04it/s]
Batch

[[-0.023572057485580444, 0.01794440858066082, 0.0405656173825264, 0.06729649752378464, 0.09804009646177292, 0.035114046186208725, 0.06723955273628235, -0.07981331646442413, 0.012592255137860775, -0.06395190209150314, 0.014613417908549309, -0.028686098754405975, 0.06557455658912659, -0.05138685926795006, -0.1029239371418953, 0.015551798976957798, -0.06676264107227325, -0.0029045091941952705, -0.027871334925293922, 0.0603627972304821, -0.027235597372055054, 0.02632732316851616, 0.03128805756568909, 0.017424048855900764, 0.013893608935177326, -0.06276204437017441, -0.013789285905659199, -0.01572624407708645, -0.03531145676970482, -0.05476396158337593, 0.013463081791996956, -0.028276406228542328, -0.03120226040482521, -0.054336175322532654, -0.011610085144639015, -0.04129831865429878, 0.10698013752698898, -0.050249602645635605, -0.02981388196349144, 0.06209121271967888, -0.017547253519296646, -0.01501332875341177, 0.08790478855371475, 0.07822359353303909, -0.09159855544567108, 0.0481598377

Batches: 100%|██████████| 1/1 [00:00<00:00, 72.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 658.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 69.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 67.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 106.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 634.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 155.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 80.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 84.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 72.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 80.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 72.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 92.90it/s]
Batche

[[0.09269341826438904, 0.0053440057672560215, -0.021850084885954857, -0.0007701154099777341, 0.04873264953494072, 0.028574705123901367, 0.06419847160577774, -0.009560780599713326, 0.053361181169748306, -0.0879056453704834, -0.014494847506284714, -0.0013367488281801343, 0.0982813909649849, -0.013169684447348118, 0.03653031960129738, 0.08743903785943985, 0.05540039762854576, 0.001535855932161212, -0.03637176379561424, 0.00894644483923912, -0.08317050337791443, 0.09316875785589218, -0.046798184514045715, -0.020233850926160812, -0.0499594546854496, -0.0417790450155735, -0.015589868649840355, 0.033974144607782364, -0.014390970580279827, 0.1086299866437912, 0.020122447982430458, -0.013854089193046093, 0.04655636101961136, 0.014802754856646061, 0.0009073888068087399, -0.0037359357811510563, 0.00721629848703742, 0.047422803938388824, 0.015439050272107124, 0.013636752963066101, 0.012506725266575813, 0.008980403654277325, -0.002120441058650613, -0.07505074888467789, -0.0013401504838839173, -0.00




Method for constructing graphs

In [66]:
def construct_graph(row):
    text_utterances = row['text_embeddings']
    response_utterances = row['response_embeddings']
    # text_utterances = row['text']
    # response_utterances = row['response']

    root = text_utterances[0]
    children = text_utterances[1:] + response_utterances
    num_nodes = len(children) +1

    #for t in text_utterances:
     #    print(t)
    #print()
    #for r in response_utterances:
    #     print(r)
    # print()
    # ids = [[0, i] for i in range(1, num_nodes)]
    # print(ids)
    # edge_index = torch.tensor(
    #     [[0]*num_nodes, list(range(1, num_nodes)
    # )], dtype=torch.long)
    edge_index = torch.tensor(
        [[0, i] for i in range(1, num_nodes)], dtype=torch.long
    ).t().contiguous()
    # edge_index = torch.tensor(
    #     [[0] * len(children), list(range(1, num_nodes))], dtype=torch.long
    # )
    

    # print(row['text_labels_encoded'])
    # print()
    # print(row['response_labels_encoded'])
    # print(type(row['text_labels_encoded']), row['text_labels_encoded'].shape, row['text_labels_encoded'])
    # print(type(row['response_labels_encoded']), row['response_labels_encoded'].shape, row['response_labels_encoded'])

    # ls = np.concatenate((row['text_labels_encoded'], row['response_labels_encoded']))
    ls = np.concatenate((row['text_labels'], row['response_labels'])).astype(int)
    
    print(ls)

    print(ls.shape)
    print(type(ls))
    print(type(ls[0]))
    
    labels = torch.tensor(ls, dtype=torch.int32)

    print(labels)

    node_features = torch.tensor([root] + children, dtype=torch.float)
    
    # print(node_features.shape)
    # print(edge_index.shape)
    # print(labels.shape)
    # print('sss')
    data = Data(x=node_features, edge_index=edge_index, y=labels)
    return data

Constructing graphs for all rows

In [None]:
graphs = []
for index, row in content_gab.iterrows():
    graphs.append(construct_graph(row))

print(graphs[0])
print('\n- - - - - -\n')
print(f"Number of nodes: {graphs[0].num_nodes}")
print(f"Number of edges: {graphs[0].num_edges}")

print()
print(len(graphs))
#print('Graphs: ')
#for i in range(0, 100):
#    print(graphs[i])

[1 0 0 0]
(4,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([1, 0, 0, 0], dtype=torch.int32)
[0 0 1 0 0 0]
(6,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 0, 1, 0, 0, 0], dtype=torch.int32)
[0 1 0 0 0]
(5,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 1, 0, 0, 0], dtype=torch.int32)
[0 1 0 0 0]
(5,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 1, 0, 0, 0], dtype=torch.int32)
[0 0 1 0 0 0]
(6,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 0, 1, 0, 0, 0], dtype=torch.int32)
[1 0 0 0]
(4,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([1, 0, 0, 0], dtype=torch.int32)
[0 0 1 0 0 0]
(6,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 0, 1, 0, 0, 0], dtype=torch.int32)
[0 1 0 0]
(4,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([0, 1, 0, 0], dtype=torch.int32)
[1 0 1 0 0 0]
(6,)
<class 'numpy.ndarray'>
<class 'numpy.int32'>
tensor([1, 0, 1, 0, 0, 0], dtype=torch.int32)
[0 0 1 0 0 0]
(6,)
<class 'numpy.ndarray'

Graphs from words

In [64]:
for texts in content_gab['text']:
    for sentence in texts:
        words = sentence.split()
        words.pop(0)
        print(words)


['i', 'joined', 'gab', 'to', 'remind', 'myself', 'how', 'retarded', 'jew', 'haters', 'are.', 'You', "wouldn't", 'be', 'typing', 'on', 'your', 'abacus', 'without', 'them', 'you', 'retard.']
['This', 'is', 'what', 'the', 'left', 'is', 'really', 'scared', 'of.', 'https://redd.it/9rfkts']
['That', 'literally', 'looks', 'like', 'a', 'monkey.', 'Why', 'are', 'we', 'supposed', 'to', 'pretend', 'it’s', 'a', 'person', 'bc', 'it’s', 'wearing', 'a', 'red', 'hat?']
['Dumb', 'Cunt']
['It', 'makes', 'you', 'an', 'asshole.']
['Give', 'it', 'to', 'a', 'soldier', 'who', 'has', 'defend', 'it', ',', 'then', 'try', 'to', 'burn', 'it', '.', 'Show', 'me', 'what', 'a', 'badass', 'you', 'really', 'are.', 'The', 'flag', 'is', 'helpless', 'to', 'stop', 'you', 'or', 'fight', 'back', '.', 'Completely', 'unnecessary', 'act', ',', 'its', 'proves', 'nothing', 'and', 'accomplishes', 'nothing.', 'It', 'only', 'shows', 'your', 'a', 'ignorant', 'weak', 'faggot', '.', 'Like', 'kicking', 'a', 'puppy', ',,,', 'it', 'shows'

In [78]:
import pandas as pd
from torch_geometric.data import Data


# Function to construct a graph for a single sentence
def construct_graph_words(sentence, label):
    # Split sentence into words and remove the first word
    words = sentence.split()[1:]

    # Create nodes (word embeddings, or simple indices for now)
    nodes = list(range(len(words)))

    # Create edges (sequential connections: i -> i+1)
    edges = [(i, i+1) for i in range(len(words)-1)]  # Directed edges
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    # Convert nodes to tensor
    x = torch.tensor(nodes, dtype=torch.float).view(-1, 1)  # Example: 1D feature per node

    # Create the graph
    graph = Data(x=x, edge_index=edge_index, y=torch.tensor([label], dtype=torch.long))
    print(graph)
    return graph

# Process each sentence in the DataFrame
graphs_words = []
for index, row in content_gab.iterrows():
    for sentence, label in zip(row['text'], row['text_labels']):
        graph = construct_graph_words(sentence, label)
        graphs_words.append(graph)



Data(x=[22, 1], edge_index=[2, 21], y=[1])
Data(x=[10, 1], edge_index=[2, 9], y=[1])
Data(x=[21, 1], edge_index=[2, 20], y=[1])
Data(x=[2, 1], edge_index=[2, 1], y=[1])
Data(x=[5, 1], edge_index=[2, 4], y=[1])
Data(x=[67, 1], edge_index=[2, 66], y=[1])
Data(x=[39, 1], edge_index=[2, 38], y=[1])
Data(x=[29, 1], edge_index=[2, 28], y=[1])
Data(x=[26, 1], edge_index=[2, 25], y=[1])
Data(x=[2, 1], edge_index=[2, 1], y=[1])
Data(x=[5, 1], edge_index=[2, 4], y=[1])
Data(x=[5, 1], edge_index=[2, 4], y=[1])
Data(x=[14, 1], edge_index=[2, 13], y=[1])
Data(x=[8, 1], edge_index=[2, 7], y=[1])
Data(x=[7, 1], edge_index=[2, 6], y=[1])
Data(x=[11, 1], edge_index=[2, 10], y=[1])
Data(x=[44, 1], edge_index=[2, 43], y=[1])
Data(x=[5, 1], edge_index=[2, 4], y=[1])
Data(x=[10, 1], edge_index=[2, 9], y=[1])
Data(x=[32, 1], edge_index=[2, 31], y=[1])
Data(x=[37, 1], edge_index=[2, 36], y=[1])
Data(x=[15, 1], edge_index=[2, 14], y=[1])
Data(x=[68, 1], edge_index=[2, 67], y=[1])
Data(x=[9, 1], edge_index=[2,

Merge to one graph

In [14]:

# Initialize empty lists to store the merged node features, edge indices, and labels (y)
merged_x = []
merged_edge_index = []
merged_y = []

# Keep track of the offset for node indices in subsequent graphs
node_offset = 0

# Iterate over each graph in the list
for graph in graphs:
    # Concatenate node features
    merged_x.append(graph.x)
    
    # Adjust edge indices: add the current node_offset to the second row of edge_index
    merged_edge_index.append(graph.edge_index + node_offset)
    
    # Concatenate labels (y), the target labels from each graph
    merged_y.append(graph.y)
    
    # Update node_offset for the next graph
    node_offset += graph.x.size(0)

# Concatenate all node features, edge indices, and labels
merged_x = torch.cat(merged_x, dim=0)
merged_edge_index = torch.cat(merged_edge_index, dim=1)
merged_y = torch.cat(merged_y, dim=0)

# Create a new graph with merged node features, edge indices, and labels (y)
merged_graph = Data(x=merged_x, edge_index=merged_edge_index, y=merged_y)

print(merged_graph)
# Print the merged graph details
print("Merged Node Features:")
print(merged_graph.x)
print("Merged Edge Index:")
print(merged_graph.edge_index)


Data(x=[1198, 384], edge_index=[2, 998], y=[1198])
Merged Node Features:
tensor([[ 0.0813, -0.0214, -0.0582,  ...,  0.0492,  0.0206,  0.0012],
        [-0.0027,  0.0063, -0.0170,  ..., -0.0939, -0.0490,  0.0221],
        [ 0.0185, -0.0804,  0.0782,  ..., -0.0679, -0.0118,  0.0497],
        ...,
        [ 0.0485,  0.0435, -0.0563,  ..., -0.0356,  0.0521, -0.0419],
        [ 0.0144,  0.0081, -0.0431,  ..., -0.0029,  0.0659, -0.0579],
        [ 0.1278,  0.0149, -0.0109,  ...,  0.0264,  0.0013, -0.0679]])
Merged Edge Index:
tensor([[   0,    0,    0,  ..., 1193, 1193, 1193],
        [   1,    2,    3,  ..., 1195, 1196, 1197]])


Node level to graph level labels

In [15]:
# Example: Aggregating node-level labels into graph-level labels
#def convert_to_graph_level(dataset):
 #   new_dataset = []
 #   for data in dataset:
 #       # Example: Majority vote for classification
  #      #graph_label = data.y.mode()[0]  # Use the most frequent label
   #     graph_label = 1 if (data.y == 1).sum().item() > 0 else 0
    #    #data.y = graph_label.unsqueeze(0)  # Ensure shape [1]
     #   data.y = graph_label
      #  new_dataset.append(data)
    #return new_dataset

# Convert dataset
#new_dataset = convert_to_graph_level(graphs)
#for i in new_dataset:
 #   print(i.y)

Given percentage of shuffled dataset is test fold

In [79]:
random.shuffle(graphs_words)

size_train = len(graphs_words) - len(graphs_words) // 10 # 10% test dataset

train_dataset = graphs_words[:size_train]
test_dataset = graphs_words[size_train:]
print(len(train_dataset))
print(len(test_dataset))

517
57


Cross-validation 

In [80]:
#y = []
#for index, row in content_gab.iterrows():
#    y.append(np.concatenate((row['text_labels'], row['response_labels'])).astype(int))

#for i, q in enumerate(y):
#    print(q)
#    if i >= 5:
#       print('\n- - - -')
#       break
#for i, q in enumerate(graphs):
#    print(q)
#    if i >= 5:
#       break

#rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=36851234)
#folds = list(rskf.split(graphs, y))

Mini-batching of graphs

In [81]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(x=[1547, 1], edge_index=[2, 1483], y=[64], batch=[1547], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(x=[1730, 1], edge_index=[2, 1666], y=[64], batch=[1730], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(x=[1509, 1], edge_index=[2, 1449], y=[64], batch=[1509], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(x=[1510, 1], edge_index=[2, 1450], y=[64], batch=[1510], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(x=[1256, 1], edge_index=[2, 1195], y=[64], batch=[1256], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(x=[1662, 1], edge_index=[2, 1599], y=[64], batch=[1662], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(x=[1533, 1], edge_index=[2, 1470], y=[64], batch=[1533], ptr=[65])

Step 8:
Number of graphs in the current batch: 64
DataBatch(x=[1691, 1], edge_index=[2, 1628], y=[64], b

Graphs from words classification

In [84]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

num_classes = 2
input_dim = graphs_words[0].x.shape[1]

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(input_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [88]:
from IPython.display import Javascript
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
            
            # Check the output shape and target shape:
        
         print(f"Model output shape: {out.shape}")
         
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 171):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([5, 2])
Epoch: 001, Train Acc: 0.5803, Test Acc: 0.5263
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([5, 2])
Epoch: 002, Train Acc: 0.6093, Test Acc: 0.5965
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.Size([64, 2])
Model output shape: torch.

ValueError: Expected input batch_size (63) to match target batch_size (64).

Node classification

In [66]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
input_dim = merged_graph.x.shape[1]    # embedding dimensionality
data = merged_graph

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234567)
        self.conv1 = GCNConv(input_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 2)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(384, 16)
  (conv2): GCNConv(16, 2)
)


In [67]:

def visualize(h, color):
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()

model = GCN(hidden_channels=16)
model.eval()

out = model(data.x, data.edge_index)
#visualize(out, color=data.y)

In [68]:
from torch_geometric.data import DataLoader

model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

data = merged_graph

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    loss = criterion(out, data.y.long())  # Compute the loss for the training dataset.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred == data.y.long()  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / len(data.y)  # Derive ratio of correct predictions.
    return test_acc

for epoch in range(1, 101):
    loss = train()  # Pass the training dataset to train function.
    test_acc = test()  # Pass the test dataset to test function.
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {test_acc:.4f}')


Epoch: 001, Loss: 0.6894, Test Accuracy: 0.7922
Epoch: 002, Loss: 0.6319, Test Accuracy: 0.7922
Epoch: 003, Loss: 0.5803, Test Accuracy: 0.7922
Epoch: 004, Loss: 0.5405, Test Accuracy: 0.7922
Epoch: 005, Loss: 0.5318, Test Accuracy: 0.7922
Epoch: 006, Loss: 0.5238, Test Accuracy: 0.7922
Epoch: 007, Loss: 0.5417, Test Accuracy: 0.7922
Epoch: 008, Loss: 0.5459, Test Accuracy: 0.7922
Epoch: 009, Loss: 0.5421, Test Accuracy: 0.7922
Epoch: 010, Loss: 0.5371, Test Accuracy: 0.7922
Epoch: 011, Loss: 0.5251, Test Accuracy: 0.7922
Epoch: 012, Loss: 0.5132, Test Accuracy: 0.7922
Epoch: 013, Loss: 0.5104, Test Accuracy: 0.7922
Epoch: 014, Loss: 0.5037, Test Accuracy: 0.7922
Epoch: 015, Loss: 0.4979, Test Accuracy: 0.7922
Epoch: 016, Loss: 0.5026, Test Accuracy: 0.7922
Epoch: 017, Loss: 0.4963, Test Accuracy: 0.7922
Epoch: 018, Loss: 0.5015, Test Accuracy: 0.7922
Epoch: 019, Loss: 0.4988, Test Accuracy: 0.7922
Epoch: 020, Loss: 0.4973, Test Accuracy: 0.7922
Epoch: 021, Loss: 0.5009, Test Accuracy:

In [69]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7922
