In [None]:
# Load the Reddit post to linearize
import textwrap

import praw

# Fetch the submission by id. "bot1" is passed to praw.Reddit as the site
# name, so no credentials appear in the notebook itself.
post_id = 'fj4czk'
reddit = praw.Reddit("bot1")
submission = reddit.submission(id=post_id)

# submission.author is None once the posting account has been deleted; fall
# back to a placeholder instead of raising AttributeError on `.name`.
op_author = submission.author.name if submission.author else "[deleted]"

# Quick sanity check that the expected post was loaded.
print(f"Title: {submission.title}")
print(f"Author: {op_author}")
print(f"Score: {submission.score}")
print(f"ID: {submission.id}")
print(f"Subreddit: r/{submission.subreddit.display_name}")
print(f"URL: {submission.url}")
print(f"Number of Comments: {submission.num_comments}")

In [None]:
# Indented
def linearize_tree_to_xml(submission, base_indentation="  ", include_author=True):
    """Render a submission and its comment tree as indented XML-like text.

    The post becomes ``<C id="C1">`` holding a ``<T>`` title and, when
    non-empty, the selftext. Top-level comments are numbered ``C2``, ``C3``,
    ... and each reply appends its 1-based position to its parent's id
    (``C2.1``, ``C2.1.3``, ...). Every comment names its parent in a
    ``parent`` attribute and is indented one ``base_indentation`` per
    nesting level.

    Titles and bodies are inserted verbatim; nothing is XML-escaped.

    Args:
        submission: PRAW-style submission providing ``author``, ``title``,
            ``selftext`` and ``comments``. ``comments.replace_more(limit=None)``
            is called on it before the comments are walked.
        base_indentation: String repeated once per nesting level.
        include_author: When False, the ``author`` attribute is left out of
            every tag. Defaults to True.

    Returns:
        str: The transcript as a single newline-joined string.
    """
    output_lines = []

    def author_attribute(author):
        # Builds the optional ` author="..."` fragment; a falsy author
        # (deleted account) is rendered as "[deleted]".
        if not include_author:
            return ""
        name = author.name if author else "[deleted]"
        return f' author="{name}"'

    def append_text(text, prefix):
        # textwrap.indent prefixes every non-blank line, so a multi-line body
        # stays aligned under its tag instead of only its first line.
        output_lines.append(textwrap.indent(text, prefix))

    # Post block
    output_lines.append(f'<C id="C1"{author_attribute(submission.author)}>')
    output_lines.append(f'{base_indentation}<T>{submission.title.strip()}</T>')
    body = submission.selftext.strip()
    if body:
        append_text(body, base_indentation)
    output_lines.append("</C>")

    submission.comments.replace_more(limit=None)

    def process_comment(comment, id_prefix, parent_id="C1", depth=1):
        # Emit one comment, then recurse into its replies in order.
        if isinstance(comment, praw.models.MoreComments):
            return

        comment_id = f"C{id_prefix}"
        indentation = base_indentation * depth

        # Comment block
        output_lines.append(
            f'{indentation}<C id="{comment_id}"{author_attribute(comment.author)} parent="{parent_id}">'
        )
        body_text = comment.body.strip()
        if body_text:
            append_text(body_text, indentation + base_indentation)
        output_lines.append(f"{indentation}</C>")

        # Replies
        for i, reply in enumerate(comment.replies, 1):
            process_comment(reply, f"{id_prefix}.{i}", parent_id=comment_id, depth=depth + 1)

    # The post itself is C1, so top-level comments start at C2.
    for i, top_level_comment in enumerate(submission.comments, 2):
        process_comment(top_level_comment, str(i))

    return "\n".join(output_lines)


# Build the indented transcript for the `submission` fetched earlier and print it.
llm_transcript = linearize_tree_to_xml(submission)
# print(len(llm_transcript))  # uncomment to also print the transcript's character count
print(llm_transcript)

In [None]:
# Indented + HTML-escaped text
import praw
import html # Import the html library for escaping

def linearize_tree_to_xml(submission, base_indentation="  "):
    """Render a submission and its comment tree as indented, escaped XML-like text.

    The post becomes ``<P id="P1">`` holding a ``<T>`` title and, when
    non-empty, the selftext. Top-level comments are numbered ``C1``, ``C2``,
    ... and each reply appends its 1-based position to its parent's id
    (``C1.1``, ``C1.1.2``, ...). Every comment names its parent in a
    ``parent`` attribute and is indented one ``base_indentation`` per
    nesting level.

    Titles, bodies and author names are passed through ``html.escape`` so
    that ``&``, ``<``, ``>`` and quotes in user text cannot be mistaken for
    markup.

    Args:
        submission: PRAW-style submission providing ``author``, ``title``,
            ``selftext`` and ``comments``. ``comments.replace_more(limit=None)``
            is called on it before the comments are walked.
        base_indentation: String repeated once per nesting level.

    Returns:
        str: The transcript as a single newline-joined string.
    """
    output_lines = []

    def author_label(author):
        # A falsy author (deleted account) is rendered as "[deleted]"; real
        # names are escaped because they are placed inside a quoted attribute.
        return html.escape(author.name) if author else "[deleted]"

    def append_text(text, prefix):
        # Escape first, then indent: textwrap.indent prefixes every non-blank
        # line, so a multi-line body stays aligned under its tag instead of
        # only its first line.
        output_lines.append(textwrap.indent(html.escape(text), prefix))

    # Post block
    output_lines.append(f'<P id="P1" author="{author_label(submission.author)}">')
    output_lines.append(f'{base_indentation}<T>{html.escape(submission.title.strip())}</T>')
    body = submission.selftext.strip()
    if body:
        append_text(body, base_indentation)
    output_lines.append("</P>")

    submission.comments.replace_more(limit=None)

    def process_comment(comment, id_prefix, parent_id="P1", depth=1):
        # Emit one comment, then recurse into its replies in order.
        if isinstance(comment, praw.models.MoreComments):
            return

        comment_id = f"C{id_prefix}"
        indentation = base_indentation * depth

        # Comment block
        output_lines.append(
            f'{indentation}<C id="{comment_id}" author="{author_label(comment.author)}" parent="{parent_id}">'
        )
        body_text = comment.body.strip()
        if body_text:
            append_text(body_text, indentation + base_indentation)
        output_lines.append(f"{indentation}</C>")

        # Replies
        for i, reply in enumerate(comment.replies, 1):
            process_comment(reply, f"{id_prefix}.{i}", parent_id=comment_id, depth=depth + 1)

    for i, top_level_comment in enumerate(submission.comments, 1):
        process_comment(top_level_comment, str(i), parent_id="P1", depth=1)

    return "\n".join(output_lines)

# Build the escaped, indented transcript; show its character count on the
# first line, then the transcript itself.
llm_transcript = linearize_tree_to_xml(submission)
print(len(llm_transcript), llm_transcript, sep="\n")

In [None]:
# Not indented
def linearize_tree_to_xml(submission):
    """Flatten a submission and its comment tree into unindented XML-like text.

    The post is written as ``<P id="P1">`` with a ``<T>`` title line and,
    when non-empty, the selftext. Comments follow in depth-first order as
    ``<C>`` elements: top-level ids are ``C1``, ``C2``, ... and a reply's id
    is its parent's id plus ``.<position>``. Each comment records its parent
    id in ``parent``. Missing authors and empty bodies are shown as
    ``[deleted]``. No text is escaped and no line is indented.

    ``submission.comments.replace_more(limit=None)`` is called before the
    comments are walked.

    Returns:
        str: All lines joined with newlines.
    """
    poster = "[deleted]" if not submission.author else submission.author.name
    lines = [
        f'<P id="P1" author="{poster}">',
        f'<T>{submission.title.strip()}</T>',
    ]
    selftext = submission.selftext.strip()
    if selftext:
        lines.append(selftext)
    lines.append("</P>")

    submission.comments.replace_more(limit=None)

    def walk(node, path, parent_id="P1"):
        # Yield the lines for `node`, then those of its replies, pre-order.
        if isinstance(node, praw.models.MoreComments):
            return

        who = "[deleted]" if not node.author else node.author.name
        node_id = f"C{path}"
        if node.body:
            text = node.body.strip()
        else:
            text = "[deleted]"

        yield f'<C id="{node_id}" author="{who}" parent="{parent_id}">'
        yield text
        yield "</C>"

        for position, child in enumerate(node.replies, 1):
            yield from walk(child, f"{path}.{position}", parent_id=node_id)

    for position, top_level in enumerate(submission.comments, 1):
        lines.extend(walk(top_level, str(position), parent_id="P1"))

    return "\n".join(lines)

# Build the flat (unindented) transcript; print its character count and then
# the transcript, each on its own line.
llm_transcript = linearize_tree_to_xml(submission)
print(len(llm_transcript), llm_transcript, sep="\n")