In [103]:
import praw
import pprint
import textwrap
import json

from math import sqrt, log
from hashlib import sha1
from graphviz import Digraph
from credentials import ID, SECRET
from praw.models import MoreComments as more

In [8]:
# Reddit API client; ID and SECRET come from the local `credentials` module.
reddit = praw.Reddit(client_id=ID, client_secret=SECRET, user_agent="The Rhetor Project")

# The submission whose comment forest is analysed in the cells below.
post = reddit.submission(id="17llow9")

# Drops the first five characters of the title
# (presumably a fixed tag prefix on the post title -- TODO confirm).
topic = post.title[5:]
desc, comments = post.selftext, post.comments

In [149]:
from nltk import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup as soup
from nltk.sentiment import SentimentIntensityAnalyzer

# (Inadequate) sentiment analysis
def sentiment(reply):
	"""Return the VADER compound polarity of the first sentence of ``reply``.

	Only the first sentence produced by ``sent_tokenize`` is scored.

	Args:
		reply: Plain-text comment body.

	Returns:
		The ``"compound"`` value from ``polarity_scores`` for the first
		sentence, or ``0.0`` when ``reply`` contains no sentences.
	"""
	sentences = sent_tokenize(reply)
	if not sentences:
		# Empty / whitespace-only text: nothing to score, so report neutral
		# instead of failing on `[0]`.
		return 0.0

	# Build the analyzer once and reuse it; constructing it on every call
	# repeats the same setup work for each comment.
	analyzer = getattr(sentiment, "_analyzer", None)
	if analyzer is None:
		analyzer = SentimentIntensityAnalyzer()
		sentiment._analyzer = analyzer

	return analyzer.polarity_scores(sentences[0])["compound"]

def clean(raw):
	"""Convert a comment's HTML body to plain text, dropping quoted passages.

	Args:
		raw: HTML markup of a comment (``None`` is treated as empty).

	Returns:
		The text content of ``raw`` with every ``<blockquote>`` element
		(and everything nested inside it) removed.
	"""
	# Name the parser explicitly: without it BeautifulSoup picks whichever
	# parser happens to be installed and warns, so results could differ
	# between machines. "html.parser" ships with Python.
	parsed = soup(raw or "", "html.parser")

	# Remove all <blockquote>s. find_all returns a materialised result, so
	# destroying nodes while looping over it is safe.
	for quote in parsed.find_all('blockquote'):
		quote.decompose()

	return parsed.get_text()


In [228]:
# Prune counters updated by process():
#   [0] low engagement, [1] too short, [2] leaf in a shallow thread.
pruned = [0] * 3

# Graph that process()/traverse() add nodes and edges to; rendered as PNG.
vis = Digraph(format='png')

def process(parent, comment):
    """Recursively build the pruned, scored thread rooted at ``comment``.

    Side effects: increments the module-level ``pruned`` counters for each
    discarded comment, and adds one node plus one ``parent -> comment`` edge
    to the module-level ``vis`` graph for each kept comment.

    Args:
        parent: Graph node id of the parent ("root" for top-level comments,
            otherwise the parent comment's id).
        comment: Comment object exposing ``body_html``, ``id``, ``author``,
            ``depth``, ``score``, ``score_hidden``, ``author_flair_text``,
            ``controversiality`` and ``replies``.

    Returns:
        A dict describing the comment and its kept replies, or ``None`` when
        the comment is pruned (or has no author).
    """
    # Callers already skip author-less comments; guard here as well so a
    # direct call cannot fail on `comment.author.name`.
    if comment.author is None:
        return None

    cur = {
        "content": clean(comment.body_html),
        "id": comment.id,
        "name": sha1(f"{comment.author.name}".encode()).hexdigest(),  # hashed author name
        "depth": comment.depth+1,
        "votes": 0 if comment.score_hidden else abs(comment.score),
        "ranked": True if comment.author_flair_text else False,
        "spicy": bool(comment.controversiality),
        "score": 0,
        "threadScore": 0,
        "below": False, # max # layers below
        "agree": None,
        "replies": []
    }

    # Prune: low engagement (unless flagged controversial)
    if not cur["spicy"] and cur["votes"] < 3:
        pruned[0] += 1
        return None

    # Prune: too short
    wordCount = len(word_tokenize(cur["content"]))
    if wordCount < 15:
        pruned[1] += 1
        return None

    maxBelow = 0
    for r in comment.replies:
        if isinstance(r, more) or not r.author:
            continue
        replyThread = process(cur["id"], r)  # Recurse
        if replyThread:
            cur["replies"].append(replyThread)
            # NOTE(review): this adds each direct reply's own "score", not its
            # "threadScore", so deeper descendants are not included -- confirm
            # that this is intended.
            cur["threadScore"] += replyThread["score"]
            # A returned thread always has an integer "below".
            maxBelow = max(maxBelow, replyThread["below"])

    # Depth/below calc
    if not cur["replies"]:
        cur["below"] = 0 # False -> 0: hit a leaf
        if cur["depth"] < 3:
            # Prune: leaf that sits too close to the root
            pruned[2] += 1
            return None
    else:
        cur["below"] = maxBelow + 1

    # Score calculation
    S = 1.2 if cur["spicy"] else 1   # boost controversial comment scores
    R = 1.4 if cur["ranked"] else 1  # favor "seasoned" commenters
    L = sqrt((wordCount-10)/5)       # encourage longer comments (wordCount >= 15 here)
    # D = sqrt(cur["depth"])         # boost deep comment scores
    B = (cur["below"]+.5)/2          # devalue no/shallow subthreads

    cur["score"] = S*R*L*B*cur["votes"]
    cur["threadScore"] += cur["score"]

    # (Dis)agreement
    # sen = sentiment(cur["content"])
    # cur["agree"] = False if sen < -0.7 else (True if sen > 0.9 else None)

    # Node label: scores, factor breakdown, then a wrapped preview of the text.
    # The ellipsis is shown only when the preview was actually cut short.
    wrapped = textwrap.fill(cur["content"], 25)
    preview = wrapped[:100] + ("..." if len(wrapped) > 100 else "")
    label = (
        f"{cur['threadScore']:.2f} ({cur['score']:.2f})\n"
        f"{S:.1f}, {R:.1f}, {L:.1f}, {B:.1f}\n"
        f"{preview}"
    )
    vis.node(cur["id"], label=label, shape='box')  # Add node (declared once)

    edge_color = 'black' if cur["agree"] is None else ('green' if cur["agree"] else 'red')
    edge_label = '?' if cur["agree"] is None else ('Agree' if cur["agree"] else 'Disagree')

    # Create edge
    vis.edge(parent, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)

    return cur


def traverse(comments):
    """Process every eligible top-level comment and collect the kept threads.

    Adds the "root" node (labelled with the module-level ``topic``) to
    ``vis``, then runs ``process`` on each top-level comment that is not a
    ``MoreComments`` placeholder, not stickied, and still has an author.

    Returns:
        List of the thread dicts that ``process`` did not prune, in the
        order the comments were iterated.
    """
    vis.node("root", label=topic, shape='box')  # Add root

    def eligible(entry):
        # Short-circuit order matters: placeholders are rejected before any
        # comment-only attribute is touched.
        return not (isinstance(entry, more) or entry.stickied or entry.author is None)

    # Lazily process each top-level comment, keeping only truthy results.
    results = (process("root", entry) for entry in comments if eligible(entry))
    return [thread for thread in results if thread]

# Full result: the topic as root, with every kept top-level thread beneath it.
tree = {"root": dict(content=topic, replies=traverse(comments))}


In [229]:
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)
# pp.pprint(tree)

# Write through a context manager so the file is flushed and closed before
# anything else reads tree.json (a bare open() handle was never closed).
with open("tree.json", "w") as tree_file:
	json.dump(tree, tree_file, indent=4, sort_keys=False)

# Summary of how many comments each pruning rule discarded.
print(
	"Pruned comments:\n"
	f"\t{pruned[0]} with low engagement\n"
	f"\t{pruned[1]} were too short\n"
	f"\t{pruned[2]} from shallow threads\n"
)

# Render the graph to tree.png and open it in the default viewer.
vis.render('tree', view=True)

Pruned comments:
	40 with low engagement
	27 were too short
	49 from shallow threads



'tree.png'