In [1]:
import re
import json
import xml.etree.ElementTree as ET
from collections import defaultdict

In [10]:
# Root directory that the per-site paths below are joined onto.
base_dir = '../data/original/'

# softwareengineering: meta posts, main-site posts, main-site comments.
semeta = 'softwareengineering.meta.stackexchange.com/Posts.xml'
se_posts = 'softwareengineering.stackexchange.com/Posts.xml'
se_comments = 'softwareengineering.stackexchange.com/Comments.xml'

# softwarerecs: meta posts, main-site posts, main-site comments.
secsmeta = 'softwarerecs.meta.stackexchange.com/Posts.xml'
secs_posts = 'softwarerecs.stackexchange.com/Posts.xml'
secs_comments = 'softwarerecs.stackexchange.com/Comments.xml'

# ai: posts only.
ai = 'ai.stackexchange.com/Posts.xml'

# datascience: posts and comments.
ds_posts = 'datascience.stackexchange.com/Posts.xml'
ds_comments = 'datascience.stackexchange.com/Comments.xml'

In [3]:
def find_comment(directory):
    '''
    Group the rows of a comments xml file by the post they belong to

    Args:
        directory: path of the xml file to parse (a file path, despite the name)

    Return:
        a defaultdict(list) mapping each 'PostId' value to the list of
        attribute dicts of the rows carrying that 'PostId', in file order
    '''
    comments_by_post=defaultdict(list)
    for row in ET.parse(directory).getroot():
        attributes=row.attrib
        comments_by_post[attributes['PostId']].append(attributes)
    return comments_by_post

In [4]:
def find_related_questioins(root):
    '''
    Collect the ids of rows whose tag string mentions aws or amazon

    Rows without a 'Tags' attribute are ignored.

    NOTE(review): this is a substring test on the raw 'Tags' value, so a tag
    such as 'laws' also matches 'aws' - confirm that this is intended.

    Args:
        root: xml root whose children are the rows to inspect

    Return:
        a list of the 'Id' values of the matching rows, in document order
    '''
    keywords=('aws','amazon')
    return [
        row.attrib['Id']
        for row in root
        if 'Tags' in row.attrib
        and any(keyword in row.attrib['Tags'] for keyword in keywords)
    ]

In [5]:
def find_answers(root,question_ids,comment_dict):
    '''
    Find answers related to the target questions

    Args:
        root: xml root whose children are post rows
        question_ids: an iterable of target question ids
        comment_dict: mapping of post id to the list of comment rows of that
            post; ids without an entry are treated as having no comments

    Return:
        question_answer_dict: a defaultdict(dict) keyed by question id; each
            value maps an answer id to that answer's attribute dict, which
            gains a 'comments' entry (the row's attribute dict is modified
            in place)
    '''
    # Build the lookup once: membership on a set is O(1), whereas scanning a
    # list for every row makes the whole pass quadratic.
    target_ids=set(question_ids)
    question_answer_dict=defaultdict(dict)
    for child in root:
        cur_row=child.attrib
        # Only rows with PostTypeId '2' are collected; rows that lack the
        # attribute are skipped instead of raising KeyError.
        if cur_row.get('PostTypeId')!='2':
            continue
        parent_id=cur_row.get('ParentId')
        if parent_id not in target_ids:
            continue
        # .get works with a plain dict and does not insert empty entries into
        # a defaultdict for answers that have no comments.
        cur_row['comments']=comment_dict.get(cur_row['Id'],[])
        question_answer_dict[parent_id][cur_row['Id']]=cur_row
    return question_answer_dict

In [6]:
def parse_xml(post_dir,com_dir,res):
    '''
    Collect the aws/amazon related questions of one site together with their
    answers and comments

    Args:
        post_dir: path of the posts xml file
        com_dir: path of the comments xml file
        res: list that the matching question rows are appended to; it is
            modified in place

    Return:
        res: the same list, extended with one attribute dict per matching
            question. In each dict 'Tags' is replaced by the list of tag
            names, 'answers' holds the answer-id -> answer-row dict and
            'comments' holds the question's comment rows.
    '''
    comment_dict=find_comment(com_dir)

    post_root=ET.parse(post_dir).getroot()
    question_ids=find_related_questioins(post_root)
    question_answer_dict=find_answers(post_root,question_ids,comment_dict)

    # Every post row is tested below, so look ids up in a set rather than
    # scanning the list each time.
    question_id_set=set(question_ids)
    for child in post_root:
        cur_row=child.attrib
        if cur_row['Id'] not in question_id_set:
            continue
        # Turn the raw '<tag1><tag2>' string into ['tag1', 'tag2'].
        cur_row['Tags']=re.findall(r'\<(.*?)\>',cur_row['Tags'])
        cur_row['answers']=question_answer_dict[cur_row['Id']]
        cur_row['comments']=comment_dict[cur_row['Id']]
        res.append(cur_row)
    return res

In [7]:
def to_jsonl(target,dest_file):
    '''
    Write every entry of target to dest_file as one JSON document per line

    Args:
        target: iterable of JSON-serialisable entries
        dest_file: path of the output file; opened in 'w' mode, so existing
            content is replaced
    '''
    with open(dest_file, 'w') as sink:
        for record in target:
            json.dump(record, sink)
            sink.write('\n')

In [12]:
# Gather the matching questions of each site into one list, in this order:
# softwareengineering, softwarerecs, datascience.
res=[]
for posts_path,comments_path in (
    (se_posts,se_comments),
    (secs_posts,secs_comments),
    (ds_posts,ds_comments),
):
    res=parse_xml(base_dir+posts_path,base_dir+comments_path,res)

# One JSON document per question.
to_jsonl(res,'../data/se_cs_ds.jsonl')