### Extract Kialo Debate Data

Parses raw Kialo text files to extract structured debate data: discussion names, arguments (Pro/Con), sources, and hierarchical node IDs (e.g., `1.2.3`).

In [31]:
from langdetect import detect

def is_english_text(text: str) -> bool:
    try:
        return detect(text) == "en"
    except:
        return False

In [None]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def parse_kialo_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = [l.rstrip("\n") for l in f]

    discussion_name = None
    rows = []
    sources = {}
    in_sources = False

    # First pass: collect sources
    for i, line in enumerate(lines):
        if line.strip().startswith("Sources:"):
            in_sources = True
            continue
        if in_sources:
            match = re.match(r"\[(\d+)\]\s+(\S+)", line)
            if match:
                idx, url = match.groups()
                sources[idx] = url

    # Second pass: parse arguments
    for i, line in enumerate(lines):
        if line.startswith("Discussion Name:"):
            discussion_name = lines[i+1].strip()
        match = re.match(r"\s*(\d+(?:\.\d+)*)\.\s*(Pro|Con):\s*$", line)
        if match:
            node_id, stance = match.groups()
            j = i + 1
            while j < len(lines) and lines[j].strip() == "":
                j += 1
            argument = lines[j].strip() if j < len(lines) else ""
            refs = re.findall(r"\[(\d+)\]", argument)
            urls = [sources[r] for r in refs if r in sources]
            clean_argument = re.sub(r"\[\d+\]", "", argument).strip()
            rows.append({
                "discussion": discussion_name,
                "node_id": node_id,
                "stance": stance,
                "argument": clean_argument,
                "sources": urls
            })
    return pd.DataFrame(rows)

data_dir = "../../data/kialo/debates"
all_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".txt")]

dfs = []
for f in all_files:
    filename = os.path.basename(f)
    if not is_english_text(filename):
        print(f"Skipping non-English file: {filename}")
        continue
    try:
        dfs.append(parse_kialo_txt(f))
    except Exception as e:
        print(f"Error with {f}: {e}")

full_df = pd.concat(dfs, ignore_index=True)
print("Total number of arguments:", len(full_df))
full_df

[nltk_data] Downloading package stopwords to /home/nysek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Skipping non-English file: does-god-exist-2629 (1).txt
Skipping non-English file: is-an-omnimalevolent-god-as-plausible-as-an-omnibenevolent-god-51497.txt
Skipping non-English file: are-ufos-real-64046.txt
Skipping non-English file: warum-gibt-es-leid-auf-der-welt-wenn-es-einen-liebenden-allmächtigen-gott-gibt-43228.txt
Skipping non-English file: are-women-better-than-men-26866.txt
Skipping non-English file: le-macchine-possono-pensare-42870.txt
Skipping non-English file: do-gun-control-laws-reduce-crime-8319.txt
Skipping non-English file: quale-partito-dovrebbero-votare-gli-italiani-alle-prossime-elezioni-politiche-9131.txt
Skipping non-English file: does-religion-misguide-people-31753.txt
Skipping non-English file: la-eutanasia-podria-ser-viable-en-españa-52324 (1).txt
Skipping non-English file: are-video-games-art-27596.txt
Skipping non-English file: religia-nie-powinna-być-przedmiotem-szkolnym-32840 (1).txt
Skipping non-English file: was-sind-die-effektivsten-aktionsformen-gegen-re

Unnamed: 0,discussion,node_id,stance,argument,sources
0,"""Collective Unconscious"" is real",1.1,Con,Archetypes is a human word describing a widely...,[]
1,"""Collective Unconscious"" is real",1.1.1,Pro,"Humans, being alike from any animal, have comp...",[]
2,"""Collective Unconscious"" is real",1.2,Con,There are a wide range that people are exposed...,[]
3,"""Collective Unconscious"" is real",1.3,Pro,Collective Intelligence is real and it is test...,[]
4,"""Collective Unconscious"" is real",1.4,Pro,The similarity between all religions prove tha...,[]
...,...,...,...,...,...
205667,Would A Cashless Society Be Better?,1.11.3,Pro,People lose money. The amount of lost money un...,[]
205668,Would A Cashless Society Be Better?,1.11.4,Pro,Service and item bartering would regain popula...,[]
205669,Would A Cashless Society Be Better?,1.12,Con,Cash is important for the the grey economy or ...,[]
205670,Would A Cashless Society Be Better?,1.12.1,Pro,"Without an anonymous means to transact, no cur...",[]


We can see that some english debates are detected as non english, but we already have a lot of data, so it's not a big problem.

### Create Parent-Child Pairs

Generates pairs of connected arguments (parent -> child) and labels their relationship as **Support** (same stance) or **Attack** (opposing stances).

In [33]:
def create_parent_child_pairs(df):
    """
    Creates parent/child pairs from Kialo data
    """
    pairs = []
    
    # Group by discussion to process each debate separately
    for discussion_name, group in df.groupby('discussion'):
        # Create dictionary for quick access by node_id
        nodes = {row['node_id']: row for _, row in group.iterrows()}
        
        # For each node, find its parent
        for _, child_row in group.iterrows():
            child_id = child_row['node_id']
            parent_id = find_parent_id(child_id)
            
            # If we find a parent in the data
            if parent_id and parent_id in nodes:
                parent_row = nodes[parent_id]
                
                # Determine relation based on stances
                relation = determine_relation(parent_row['stance'], child_row['stance'])
                
                pairs.append({
                    'discussion': discussion_name,
                    'parent_node_id': parent_id,
                    'parent_argument': parent_row['argument'],
                    'parent_stance': parent_row['stance'],
                    'child_node_id': child_id,
                    'child_argument': child_row['argument'],
                    'child_stance': child_row['stance'],
                    'relation': relation
                })
    
    return pd.DataFrame(pairs)

def find_parent_id(node_id):
    """
    Finds the parent ID of a given node
    Ex: 1.2.3 -> 1.2, 1.5 -> 1, 2.1.4.2 -> 2.1.4
    """
    parts = node_id.split('.')
    if len(parts) <= 1:
        return None  # No parent (root node)
    
    # Return parent ID (all parts except the last one)
    return '.'.join(parts[:-1])

def determine_relation(parent_stance, child_stance):
    """
    Determines if the relation is Support or Attack based on stances
    
    In Kialo:
    - If parent and child have same stance -> Support
    - If parent and child have different stances -> Attack
    """
    if parent_stance == 'Pro':
        if child_stance == 'Pro':
            return 'Support'
        else:
            return 'Attack'
    if parent_stance == 'Con':
        if child_stance == 'Pro':
            return 'Support'
        else:
            return 'Attack'

# Usage with your data
pairs_df = create_parent_child_pairs(full_df)

print(f"Total number of pairs created: {len(pairs_df)}")

# Relation verification
print(f"\nRelation distribution:")
print(pairs_df['relation'].value_counts())

# Example of Support relation
print(f"\nExample of Support relation:")
support_example = pairs_df[pairs_df['relation'] == 'Support'].iloc[0] if len(pairs_df[pairs_df['relation'] == 'Support']) > 0 else None
if support_example is not None:
    print(f"Parent ({support_example['parent_stance']}): {support_example['parent_argument'][:100]}...")
    print(f"Child ({support_example['child_stance']}): {support_example['child_argument'][:100]}...")

# Example of Attack relation
print(f"\nExample of Attack relation:")
attack_example = pairs_df[pairs_df['relation'] == 'Attack'].iloc[0] if len(pairs_df[pairs_df['relation'] == 'Attack']) > 0 else None
if attack_example is not None:
    print(f"Parent ({attack_example['parent_stance']}): {attack_example['parent_argument'][:100]}...")
    print(f"Child ({attack_example['child_stance']}): {attack_example['child_argument'][:100]}...")

pairs_df

Total number of pairs created: 197831

Relation distribution:
relation
Attack     100470
Support     97361
Name: count, dtype: int64

Example of Support relation:
Parent (Con): The pharmacy chain is convoluted and deeply resistant to regulation through lobbying efforts...
Child (Pro): Spending on pharmaceutical industry lobbying reached a record amount in 2020, at more than $306 mill...

Example of Attack relation:
Parent (Pro): These manufacturers invest billions into R&D that contributes toward new therapies every year that i...
Child (Con): A substantial portion of Research and Development (R&D) is financed through government grants, yet t...


Unnamed: 0,discussion,parent_node_id,parent_argument,parent_stance,child_node_id,child_argument,child_stance,relation
0,"""Big pharma"" is actually good for society",1.1,The pharmacy chain is convoluted and deeply re...,Con,1.1.1,Spending on pharmaceutical industry lobbying r...,Pro,Support
1,"""Big pharma"" is actually good for society",1.1,The pharmacy chain is convoluted and deeply re...,Con,1.1.2,"In 2020 there were 1,500 pharmaceutical lobbyi...",Pro,Support
2,"""Big pharma"" is actually good for society",1.3,These manufacturers invest billions into R&D t...,Pro,1.3.1,Diseases from hemophilia to cancer to autoimmu...,Pro,Support
3,"""Big pharma"" is actually good for society",1.3,These manufacturers invest billions into R&D t...,Pro,1.3.2,Likely the most efficient way to motivate cont...,Pro,Support
4,"""Big pharma"" is actually good for society",1.3,These manufacturers invest billions into R&D t...,Pro,1.3.3,A substantial portion of Research and Developm...,Con,Attack
...,...,...,...,...,...,...,...,...
197826,Would the world be a better place without humans?,1.4.4,"According to the word of God in Genesis 1 , hu...",Con,1.4.4.2,The Bible is not a credible source of informat...,Con,Attack
197827,Would the world be a better place without humans?,1.4.4.2,The Bible is not a credible source of informat...,Con,1.4.4.2.1,Due to the age of the Bible and the fact that ...,Pro,Support
197828,Would the world be a better place without humans?,1.4.4.2,The Bible is not a credible source of informat...,Con,1.4.4.2.2,"The Bible was written by humans , not a divine...",Pro,Support
197829,Would the world be a better place without humans?,1.4.4.2,The Bible is not a credible source of informat...,Con,1.4.4.2.3,Many of the kingdoms and cities in the Old Te...,Con,Attack


### Simplify DataFrame

Removes unnecessary columns (e.g., discussion names, node IDs) to keep only `parent_argument`, `child_argument`, and `relation`.

In [34]:
pairs_df.drop(columns=['discussion', 'parent_node_id', 'parent_stance', 'child_node_id', 'child_stance'], inplace=True)
pairs_df

Unnamed: 0,parent_argument,child_argument,relation
0,The pharmacy chain is convoluted and deeply re...,Spending on pharmaceutical industry lobbying r...,Support
1,The pharmacy chain is convoluted and deeply re...,"In 2020 there were 1,500 pharmaceutical lobbyi...",Support
2,These manufacturers invest billions into R&D t...,Diseases from hemophilia to cancer to autoimmu...,Support
3,These manufacturers invest billions into R&D t...,Likely the most efficient way to motivate cont...,Support
4,These manufacturers invest billions into R&D t...,A substantial portion of Research and Developm...,Attack
...,...,...,...
197826,"According to the word of God in Genesis 1 , hu...",The Bible is not a credible source of informat...,Attack
197827,The Bible is not a credible source of informat...,Due to the age of the Bible and the fact that ...,Support
197828,The Bible is not a credible source of informat...,"The Bible was written by humans , not a divine...",Support
197829,The Bible is not a credible source of informat...,Many of the kingdoms and cities in the Old Te...,Attack


### Preprocess Argument Texts

Cleans text (lowercase, removes noise), balances the dataset (undersamples majority class), and filters out short/empty arguments.

In [35]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from transformers import BertTokenizer

MAX_TOKENS = 512

class ArgumentDataProcessor:
    """
    Class for preprocessing argumentative data
    """
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def clean_text(self, text):
        """
        - Improve the quality of embeddings
        - Reduce noise and improve generalization
        - Standardize input format
        """
        if pd.isna(text):
            return ""
        
        # Lowercase conversion
        text = text.lower()
        
        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s\.\!\?\,\;\:\']', ' ', text)
        
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()
    
    def truncate_text(self, text, column_name=None):
        """
        Truncate text to MAX_TOKENS tokens for BERT
        Returns the truncated text and logs if truncation happened
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=True)
        if len(tokens) > MAX_TOKENS:
            if column_name:
                print(f"Warning: {column_name} text truncated to {MAX_TOKENS} tokens")
            tokens = tokens[:MAX_TOKENS]
            return self.tokenizer.decode(tokens, skip_special_tokens=True)
        return text
    
    def explore_data(self, df):
        """
        - Understand the class distribution (Support vs Attack balance)
        - Identify outliers (claims that are too short/long)
        - Detect patterns in the dataset
        """
        print("=== DATA EXPLORATION ===")
        print(f"Dataset size: {len(df)}")
        print(f"Columns: {df.columns.tolist()}")
        
        # Relation distribution
        print("\nRelation distribution:")
        relation_counts = df['relation'].value_counts()
        print(relation_counts)
        print(f"Balance ratio: {relation_counts.min() / relation_counts.max():.2f}")
        
        # Argument length statistics
        df['parent_length'] = df['parent_argument'].str.len()
        df['child_length'] = df['child_argument'].str.len()
        
        print(f"\nAverage parent length: {df['parent_length'].mean():.1f}")
        print(f"Average child length: {df['child_length'].mean():.1f}")
        
        # Missing values check
        print("\nMissing values:")
        print(df.isnull().sum())
        
        return df
    
    def process_dataframe(self, df, balance=True, strategy="undersample"):
        print("=== PREPROCESSING ===")
        
        original_size = len(df)

        # Drop duplicates
        df = df.drop_duplicates(subset=['parent_argument','child_argument','relation'])
        dropped = original_size - len(df)
        print(f"Duplicate rows removed: {dropped}")
        original_size = len(df)

        # Drop rows with missing values
        df = df.dropna(subset=['parent_argument','child_argument','relation'])
        dropped = original_size - len(df)
        print(f"Rows with missing values removed: {dropped}")
        original_size = len(df)

        # Clean text
        df['parent_clean'] = df['parent_argument'].apply(self.clean_text)
        df['child_clean'] = df['child_argument'].apply(self.clean_text)

        # Cutting out texts that are too long for BERT
        df['parent_clean'] = df['parent_clean'].apply(lambda x: processor.truncate_text(x, 'parent'))
        df['child_clean'] = df['child_clean'].apply(lambda x: processor.truncate_text(x, 'child'))

        # Drop rows with empty cleaned text
        df = df[(df['parent_clean'].str.len()>0) & (df['child_clean'].str.len()>0)]
        dropped = original_size - len(df)
        print(f"Rows with empty cleaned text removed: {dropped}")
        original_size = len(df)

        # Delete rows with less than 50 characters
        df = df[(df['parent_argument'].str.len() >= 50) & (df['child_argument'].str.len() >= 50)]
        dropped = original_size - len(df)
        print(f"Rows with less than 50 characters removed: {dropped}")
        original_size = len(df)

        # Optional balancing
        if balance:
            counts = df['relation'].value_counts()
            print("Before balancing:", counts.to_dict())
            min_size = counts.min()
            dfs = []
            for label in counts.index:
                dfs.append(resample(
                    df[df['relation']==label],
                    replace=False if strategy=="undersample" else True,
                    n_samples=min_size if strategy=="undersample" else counts.max(),
                    random_state=42
                ))
            df = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)
            print("After balancing:", df['relation'].value_counts().to_dict())

        print(f"Final dataset size: {len(df)}")
        return df

processor = ArgumentDataProcessor()
clean_df = processor.explore_data(pairs_df)
clean_df = processor.process_dataframe(pairs_df)



=== DATA EXPLORATION ===
Dataset size: 197831
Columns: ['parent_argument', 'child_argument', 'relation']

Relation distribution:
relation
Attack     100470
Support     97361
Name: count, dtype: int64
Balance ratio: 0.97

Average parent length: 134.1
Average child length: 145.8

Missing values:
parent_argument    0
child_argument     0
relation           0
parent_length      0
child_length       0
dtype: int64
=== PREPROCESSING ===
Duplicate rows removed: 79612
Rows with missing values removed: 0
Rows with empty cleaned text removed: 0
Rows with less than 50 characters removed: 14594
Before balancing: {'Attack': 52744, 'Support': 50881}
After balancing: {'Attack': 50881, 'Support': 50881}
Final dataset size: 101762


### Inspect Random Examples

Prints 50 random argument pairs with their relationships to verify data quality and labeling.

In [36]:
final_df = clean_df.drop(columns=['parent_argument', 'child_argument', 'parent_length', 'child_length'])

for i in range (0, 50):
    rand_index = final_df.sample(n=1, random_state=i).index[0]
    print(f"\nRandom example {i+1}:")
    print(final_df.iloc[rand_index]['parent_clean'])
    print(final_df.iloc[rand_index]['child_clean'])
    print(final_df.iloc[rand_index]['relation'])

final_df


Random example 1:
the apostles were clearly more significant than the rest of jesus's disciples.
only the writings of the apostles were considered worthy to include in the biblical canon.
Support

Random example 2:
though the bible states that natural disasters are a result of god's wrath against sinning , science has shown that the cause of natural disasters are set forces of nature that are difficult but not impossible to predict.
god, as a primary cause of events does not preclude natural secondary causes. the bible's recording of these events and god as primary cause is more about the author's interpretation of events and does not invalidate the historical veracity of the account.
Attack

Random example 3:
confucianism according to confucianism, the purpose of life is to get disciplined and educated .
instead of focusing on individual freedom, confucianism emphasizes self discipline, ritual propriety, and moral uprightness .
Support

Random example 4:
the lack of women in leadersh

Unnamed: 0,relation,parent_clean,child_clean
0,Attack,in a 2018 straw poll in washington's 10th cong...,if score voting were actually used for real wo...
1,Support,there are sufficient safeguards in place preve...,"the media, which is widely regarded as the fou..."
2,Support,"a recent change in us law means that, as of ju...",some us women are being discriminated against ...
3,Attack,marriage has been shown to reinforce the gende...,the correlation this study found might a littl...
4,Attack,human rights are not laws of the universe but ...,"we only experience human consciousness, but we..."
...,...,...,...
101757,Attack,personal health records are private informatio...,patient information can be used by authorized ...
101758,Support,employers are likely to become better at deali...,if people are able to be open about a physical...
101759,Support,there is no such universal and objective right...,you could claim that human rights are neither ...
101760,Attack,"in some nations, private school fees enjoy cha...",it is unlikely that the taxes gained by removi...


### Save Final Dataset

Exports the cleaned DataFrame to a CSV file for embedding generation and model training.

In [None]:
from sklearn.model_selection import train_test_split

def make_balanced_sample(df, n, seed=42):
    # Stratified sampling sur la colonne 'relation'
    sample, _ = train_test_split(
        df,
        train_size=n,
        stratify=df['relation'],
        random_state=seed
    )
    return sample.reset_index(drop=True)

# 100k version
df_100k = make_balanced_sample(final_df, 50_000)
df_100k.to_csv('../../data/kialo/kialo-pairs-50k.csv', index=False)

# 50k version
df_50k = make_balanced_sample(final_df, 25_000)
df_50k.to_csv('../../data/kialo/kialo-pairs-25k.csv', index=False)

# 10k version
df_10k = make_balanced_sample(final_df, 10_000)
df_10k.to_csv('../../data/kialo/kialo-pairs-10k.csv', index=False)