In [47]:
import sys, time, json, re
from anytree import Node, RenderTree
from anytree.exporter import JsonExporter
from anytree import Node, RenderTree

# This notebook walks debate trees with recursive functions; raise the
# interpreter's recursion limit so that large debates do not fail with
# RecursionError.
# NOTE(review): 100000 is far above the default; confirm it is really needed.
sys.setrecursionlimit(100000)

In [48]:

def group_arguments(tableau):
    """Merge wrapped lines so that each returned string is one full argument.

    A line containing a ``Pro:`` or ``Con:`` marker starts a new argument.
    Any other line is treated as a continuation and is appended, separated by
    a single space, to the argument currently being built. The first line
    always opens the first argument, whether or not it carries a marker.

    The grouping is done with a single loop instead of one recursive call per
    argument, so long debates neither depend on the interpreter's recursion
    limit nor pay for re-slicing the list at every argument.

    Args:
        tableau: list of text lines (str) in file order.

    Returns:
        list of str, one entry per argument; an empty list when ``tableau``
        is empty.
    """
    if not tableau:
        return []

    stance_marker = re.compile(r"(Con|Pro)(?::)")

    groups = []
    current = tableau[0]
    for line in tableau[1:]:
        if stance_marker.search(line) is None:
            # No stance marker: this line continues the current argument.
            current = current + " " + line
        else:
            # A new argument starts here; close the previous one.
            groups.append(current)
            current = line

    groups.append(current)
    return groups


In [49]:
def rawKialo2Json(input_file):
    """Parse a raw Kialo debate text file into a tree of ``anytree`` nodes.

    Despite the name, no JSON is produced: the function returns the node
    lookup table that the rest of the notebook walks.

    Processing steps:
      1. Read the file up to (not including) the first line that starts with
         ``"Sources:"``, strip every line and drop the blank ones.
      2. Treat the first four remaining lines as the header; the second one
         is used as the debate ``subject``.
      3. Merge wrapped lines with ``group_arguments`` so that each entry is a
         single argument.
      4. Extract from every argument its tree id (e.g. ``"1.2.1."``), its
         stance (``"Pro"``/``"Con"``) and its text.
      5. Build one ``Node`` per argument, attached to the node whose id is the
         argument's id with the last component removed.

    Args:
        input_file: path of the debate text file.

    Returns:
        dict mapping tree id (str, with trailing dot) to ``Node``. The root is
        stored under ``"1."`` with ``node_id=-1`` and carries no argument
        attributes. Every other node has the attributes ``tree``, ``level``,
        ``stance``, ``toneInput``, ``subject`` and ``node_id``.

    Raises:
        IndexError: the file has fewer than four non-blank header lines.
        ValueError: an argument line has no tree id or no ``Pro:``/``Con:``
            marker followed by text.
        KeyError: an argument appears before its parent argument.
    """
    header_size = 4

    # Only the reading needs the file handle; parsing happens afterwards.
    lines = []
    with open(input_file, 'r') as fi:
        for raw_line in fi:
            if raw_line.startswith("Sources:"):
                break
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

    # The first four non-blank lines form the header; the debate title is the
    # second one.
    if len(lines) < header_size:
        raise IndexError(
            "expected at least %d header lines in %r, found %d"
            % (header_size, input_file, len(lines))
        )
    header, lines = lines[:header_size], lines[header_size:]
    subject = header[1]

    lines = group_arguments(lines)

    # Tree position at the start of the line, e.g. "1.2.1." (dot is literal).
    tree_re = re.compile(r"^(\d{1,}\.)+")
    # Whether the comment is Pro or Con.
    stance_re = re.compile(r"(Con|Pro)(?::)")
    # Text of the comment, i.e. everything after "Pro: " / "Con: ".
    content_re = re.compile(r"((Con|Pro)(?::\s))(.*)")
    # Individual numeric components of a tree id.
    component_re = re.compile(r"(\d{1,}(?=\.))+")

    # list containing each parsed comment
    result = []
    for counter, line in enumerate(lines, start=1):
        tree = tree_re.search(line)
        stance = stance_re.search(line)
        content = content_re.search(line)
        if tree is None or stance is None or content is None:
            raise ValueError(
                "cannot parse argument %d of %r: %r" % (counter, input_file, line)
            )

        # The hierarchy level is the number of id components below the root.
        level = len(component_re.findall(tree.group())) - 1

        result.append({
            "Tree": tree.group(),
            "Level": level,
            "Stance": stance.group(1),
            "ToneInput": content.group(3),
            "node_id": subject.replace(" ", "_") + "_" + str(counter)
        })

    resultAsDict = {x["Tree"]: x for x in result}

    # The root "1." is synthetic: it is not taken from the parsed arguments.
    id2Node = {}
    for idNode in ['1.'] + [x["Tree"] for x in result]:
        if idNode == '1.':
            id2Node[idNode] = Node(idNode, node_id=-1)
            continue

        # Parent id = own id without its last component ("1.2.1." -> "1.2.").
        # Parents must already exist, i.e. precede their children in the file.
        parentId = idNode[:idNode[:-1].rfind(".") + 1]
        entry = resultAsDict[idNode]
        id2Node[idNode] = Node(
            idNode,
            parent=id2Node[parentId],
            tree=entry["Tree"],
            level=entry["Level"],
            stance=entry["Stance"],
            toneInput=entry["ToneInput"],
            subject=subject,
            node_id=entry["node_id"],
        )

    return id2Node

In [50]:
def argumentTree2argumentPairTree(node, domains):
    """Collect (parent argument, child argument) pairs from a debate subtree.

    The subtree is walked depth-first: for each child of ``node`` the pair
    (``node``, child) is emitted first, followed by all pairs found below
    that child. No pair is emitted when ``node`` is the root named ``"1."``,
    but its descendants are still visited.

    Args:
        node: tree node exposing ``name`` and ``children``; every node other
            than the root must also expose ``toneInput``, ``subject``,
            ``level`` and ``stance``.
        domains: value stored unchanged under the ``"domain"`` key of every
            pair.

    Returns:
        list of dicts with the keys ``topArgument``, ``subArgument``,
        ``subject``, ``subArgumentLevel``, ``domain``, ``isAttack`` and
        ``isSupport``. A child whose stance is ``"Con"`` is an attack, any
        other stance is a support. The list is empty for a leaf.
    """
    pairs = []
    is_root = node.name == "1."

    for child in node.children:
        if not is_root:
            is_attack = child.stance == "Con"
            pairs.append({
                "topArgument"       :   node.toneInput,
                "subArgument"       :   child.toneInput,
                "subject"           :   child.subject,
                "subArgumentLevel"  :   child.level,
                "domain"            :   domains,
                "isAttack"          :   is_attack,
                "isSupport"         :   not is_attack
            })

        pairs += argumentTree2argumentPairTree(child, domains)

    return pairs

In [51]:
import pandas as pd

# Index of all debates: one row per debate with its URL id and its tags.
kialoUrlIds = pd.read_csv("../../rawData/kialo/kialo-url-ids.csv", index_col=0)
pairs = []
# Debates that could not be processed, kept as (row index, error) so that
# skipped files can be inspected instead of disappearing silently.
failedDebates = []

for i, x in kialoUrlIds.iterrows():
  try:
    d = x.tags
    kialoUrlId = x.kialoUrlId

    t = rawKialo2Json("../../rawData/kialo/debates/"+ kialoUrlId +".txt")
    # extend() appends in place; rebuilding the list for every debate would
    # copy all pairs collected so far each time.
    pairs.extend(argumentTree2argumentPairTree(t['1.'], d))
  except Exception as err:
    # Best effort: skip debates that are missing or malformed, but remember
    # why. Unlike a bare except, this lets KeyboardInterrupt through.
    failedDebates.append((i, repr(err)))
    continue

if failedDebates:
  print("Skipped %d of %d debates; see failedDebates" % (len(failedDebates), len(kialoUrlIds)))

In [52]:
# Preview only the first few records; displaying the whole list floods the
# notebook output.
pairs[:3]

[{'topArgument': 'Purity pledges lead to people having a poorer understanding of sexuality.',
  'subArgument': 'Purity pledges lead to people not having a proper understanding of the potential consequences of sex.',
  'subject': 'Are Purity Pledges Harmful?',
  'subArgumentLevel': 2,
  'domain': "['Purity', 'Sex', 'Virginity', 'Feminism', 'Women']",
  'isAttack': False,
  'isSupport': True},
 {'topArgument': 'Purity pledges lead to people not having a proper understanding of the potential consequences of sex.',
  'subArgument': 'Purity pledges are a facet of abstinence-only sex education [1]. These programs disseminate false information regarding sex.',
  'subject': 'Are Purity Pledges Harmful?',
  'subArgumentLevel': 3,
  'domain': "['Purity', 'Sex', 'Virginity', 'Feminism', 'Women']",
  'isAttack': False,
  'isSupport': True},
 {'topArgument': 'Purity pledges are a facet of abstinence-only sex education [1]. These programs disseminate false information regarding sex.',
  'subArgument

In [53]:
# Inspect a single record to check the keys each pair carries.
pairs[0]

{'topArgument': 'Purity pledges lead to people having a poorer understanding of sexuality.',
 'subArgument': 'Purity pledges lead to people not having a proper understanding of the potential consequences of sex.',
 'subject': 'Are Purity Pledges Harmful?',
 'subArgumentLevel': 2,
 'domain': "['Purity', 'Sex', 'Virginity', 'Feminism', 'Women']",
 'isAttack': False,
 'isSupport': True}

In [54]:
# One column per output field, all derived from the pair records.
topic = [pair["domain"] for pair in pairs]
relations = [("attack" if pair["isAttack"] else "support") for pair in pairs]
# The sub-argument is the source of the relation, its parent the target.
argSrc = [pair["subArgument"] for pair in pairs]
argTrg = [pair["topArgument"] for pair in pairs]
# Every row comes from the same dataset.
datasetSource = ["kialo"] * len(pairs)

d = pd.DataFrame({
  "topic": topic,
  "relation" : relations,
  "argSrc" : argSrc,
  "argTrg" : argTrg,
  "datasetSource" : datasetSource
})

d

Unnamed: 0,topic,relation,argSrc,argTrg,datasetSource
0,"['Purity', 'Sex', 'Virginity', 'Feminism', 'Wo...",support,Purity pledges lead to people not having a pro...,Purity pledges lead to people having a poorer ...,kialo
1,"['Purity', 'Sex', 'Virginity', 'Feminism', 'Wo...",support,Purity pledges are a facet of abstinence-only ...,Purity pledges lead to people not having a pro...,kialo
2,"['Purity', 'Sex', 'Virginity', 'Feminism', 'Wo...",support,Abstinence programmes are likely to lead to pe...,Purity pledges are a facet of abstinence-only ...,kialo
3,"['Purity', 'Sex', 'Virginity', 'Feminism', 'Wo...",support,Pledgers often believe contraceptives are unre...,Abstinence programmes are likely to lead to pe...,kialo
4,"['Purity', 'Sex', 'Virginity', 'Feminism', 'Wo...",support,Abstinence programmes push against [3] increas...,Abstinence programmes are likely to lead to pe...,kialo
...,...,...,...,...,...
166380,"['Gender', 'VideoGames', 'Media', 'Sexism']",support,Gamers in the community have made [251] explos...,The eleventh installment of Mortal Kombat came...,kialo
166381,"['Gender', 'VideoGames', 'Media', 'Sexism']",attack,"There are ample ads that depict, for example, ...",In so far as the world is aiming for gender eq...,kialo
166382,"['Gender', 'VideoGames', 'Media', 'Sexism']",support,Media industries (including video games) have ...,In so far as the world is aiming for gender eq...,kialo
166383,"['Gender', 'VideoGames', 'Media', 'Sexism']",support,Societies higher in gender equality are less v...,In so far as the world is aiming for gender eq...,kialo


In [55]:
# Persist the pair table next to the notebook. With pandas' default
# index=True the row index is written as the first, unnamed column.
d.to_csv("kialoPairs.csv")