In [1]:
import praw
import pprint
import textwrap
import json

from math import sqrt, log
from hashlib import sha1
from graphviz import Digraph
from credentials import ID, SECRET # you need your own credentials.py to make this work: Bren used his own reddit credentials
from praw.models import MoreComments as more

In [2]:
# Connecting to Reddit with praw and fetching the submission to analyse

# Re-import the credentials under their own names so this cell stays re-runnable:
# `ID` is rebound to the submission id below and must not double as the client id
from credentials import ID as CLIENT_ID, SECRET as CLIENT_SECRET

reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent="The Rhetor Project"
)

ID = "17llow9"  # submission id; later reused as the id of the tree's root node
post = reddit.submission(id=ID)
topic = post.title[5:]  # title minus its first five characters (presumably a "CMV: "-style tag — confirm)
desc = post.selftext
comments = post.comments

In [255]:

# Data preprocessing

from nltk import word_tokenize, sent_tokenize
# nltk.download()
from bs4 import BeautifulSoup as soup
from nltk.sentiment import SentimentIntensityAnalyzer
from colour import Color

# (Inadequate) sentiment analysis (DEPRECATED)
def sentiment(reply):
	"""Return the VADER compound polarity of the first sentence of `reply`.

	Only the first sentence produced by `sent_tokenize` is scored.
	Returns 0.0 when `reply` contains no sentence at all (e.g. an empty
	string), instead of failing on the `[0]` lookup.
	"""
	sents = sent_tokenize(reply)
	if not sents:
		return 0.0
	SIA = SentimentIntensityAnalyzer()
	return SIA.polarity_scores(sents[0])["compound"]

def clean(raw):
	"""Turn a comment's HTML body into stripped plain text.

	- every <blockquote> is dropped
	- an <a href=...> whose visible text contains '://' (a bare URL) is
	  removed; any other link is replaced by its visible text
	- zero-width spaces and curly quotes are replaced by ASCII equivalents

	raw: HTML string. Returns the cleaned text with surrounding whitespace stripped.
	"""
	# Name the parser explicitly so the result does not depend on which
	# optional parsers happen to be installed (and to avoid bs4's warning)
	parsed = soup(raw, "html.parser")

	replace = {
		"\u200b": " ",
		"\u2019": "'",
		"\u201c": "\"",
		"\u201d": "\"",
	}

	# Remove all <blockquote> tags
	for b in parsed.find_all('blockquote'):
		b.extract()

	# Unwrap or drop <a> tags
	for a in parsed.find_all('a', href=True):
		text = a.get_text()
		if '://' in text:
			a.replace_with('')
		else:
			a.replace_with(text)

	plain = parsed.get_text()
	for k, v in replace.items():
		plain = plain.replace(k, v)

	return plain.strip()

# 101-step colour ramp from white to pale yellow, used for node fill colours
lo = Color("#ffffff")
hi = Color("#ffea80")
gradient = [shade for shade in lo.range_to(hi, 101)]

In [256]:
# Pruning reddit comments according to a variety of criteria
# Details in the "Score calculation" section below

# Counters of pruned comments: [low engagement, too short, leaf of a shallow thread]
pruned = [0, 0, 0]
# Largest threadScore seen by process(); used to normalise node colours
largest = 0
# One {"id", "pre_text", "con_text"} entry per kept comment, for classification en masse
pairs = []

def process(parent, comment):
    """Recursively turn a comment and its replies into a scored thread dict.

    parent:  thread dict of the parent comment, or a falsy value (`{}`) for a
             top-level comment; only its "content" is read.
    comment: object exposing body_html, id, author.name, depth, score_hidden,
             score, author_flair_text, controversiality and replies.

    Returns the thread dict, or None when the comment is pruned because it
    has low engagement, is too short, or is a leaf of a shallow thread.

    Side effects: increments the matching `pruned` counter, appends one entry
    to `pairs` for every kept comment, and raises `largest` to the biggest
    threadScore seen.
    """
    global largest

    cur = {
        "content": clean(comment.body_html),
        "id": comment.id,
        "name": sha1(str(comment.author.name).encode()).hexdigest(),  # anonymised author
        "depth": comment.depth+1,
        "votes": 0 if comment.score_hidden else abs(comment.score),
        "ranked": bool(comment.author_flair_text),
        "spicy": bool(comment.controversiality),
        "score": 0,
        "threadScore": 0,
        "below": False, # max # layers below
        "agree": None,
        "replies": []
    }

    # Prune: low engagement (unless controversial)
    if not cur["spicy"] and cur["votes"] < 3:
        pruned[0] += 1
        return
    # Prune: too short
    wordCount = len(word_tokenize(cur["content"]))
    if wordCount < 15:
        pruned[1] += 1
        return

    maxBelow = 0
    for r in comment.replies:
        if isinstance(r, more) or not r.author: continue
        replyThread = process(cur, r)  # Recurse
        if replyThread:
            cur["replies"].append(replyThread)
            # Accumulate the reply's whole subtree, not just its own score:
            # rescore() later subtracts whole-subtree threadScores from ancestors,
            # which is only consistent if threadScore is cumulative.
            cur["threadScore"] += replyThread["threadScore"]
            # A returned thread always has an integer "below"
            maxBelow = max(maxBelow, replyThread["below"])

    # Depth/below calc
    if not cur["replies"]:
        cur["below"] = 0 # False -> 0: hit a leaf
        if cur["depth"] < 3:
            pruned[2] += 1
            return
    else:
        cur["below"] = maxBelow + 1

    # Score calculation
    S = 1.2 if cur["spicy"] else 1   # boost controversial comment scores
    R = 1.4 if cur["ranked"] else 1  # favor "seasoned" commenters
    L = sqrt((wordCount-10)/5)       # encourage longer comments
    D = cur["depth"]                 # boost deep comment scores
    B = (cur["below"]+.5)/2          # devalue no/shallow subthreads

    cur["score"] = S*R*L*D*B*cur["votes"]
    cur["threadScore"] += cur["score"]
    if cur["threadScore"] > largest: largest = cur["threadScore"]

    # Save pairs for classification en masse
    pairs.append({
        "id": cur["id"],
        "pre_text": topic if not parent else parent["content"],
        "con_text": cur["content"],
    })

    return cur

# Walks the top-level comments (TLCs) and pre-processes each one into a thread
def traverse(comments):
    """Return the kept threads for an iterable of top-level comments.

    Placeholder `more` entries, stickied comments and comments without an
    author are skipped; threads that process() prunes (returns falsy) are
    left out of the result.
    """
    def usable(c):
        if isinstance(c, more) or c.stickied:
            return False
        return c.author is not None

    candidates = (process({}, c) for c in comments if usable(c))
    return [thread for thread in candidates if thread]

# Pre-processed threads tree: the submission is the root, kept top-level threads hang below it
tree = dict(
    root=dict(
        id=ID,
        content=topic,
        replies=traverse(comments),
    )
)


In [None]:
# THIS BLOCK IS DEPRECATED
# This was our attempt to apply a summarizer to these pruned comments

from transformers import pipeline
summarizer = pipeline("summarization", model="stevied67/pegasus-subreddit-comments-summarizer", device = 0)
summPairs = []; summIn = [] 

# Flatten to [pre_0, con_0, pre_1, con_1, ...] so the reply of pair k sits at index 2k+1
for p in pairs: # Summarizer wants a flat list of pre/con pairs
     summIn.append(p["pre_text"])
     summIn.append(p["con_text"])
     # print(p["pre_text"], p["con_text"], "\n\n")

print("Summarizing content...")
summOut = summarizer(summIn, max_length = 32)
print("Done")


# i = 0
# for p in pairs:
#     summPairs.append({
#         "id": p["id"],
#         "pre_text": summOut[i]['summary_text'],
#         "con_text": summOut[i+1]['summary_text'],
#     })
#     i += 2

# Map comment id -> summary of that comment (the odd-indexed outputs).
# NOTE(review): the trailing comma below stores each summary as a 1-tuple;
# the classification cell reads it back with `summs[ID][0]`.
i = 0
summs = {}
for p in pairs:
    summs[p['id']] = summOut[i+1]['summary_text'],
    i += 2

# for p in summPairs:
#      print(p["pre_text"], p["con_text"], "\n\n")


In [225]:
# THIS BLOCK IS DEPRECATED
# This was our attempt to apply the logbert model for inference
# Original code from here: https://github.com/yohanjo/tacl_arg_rel

# Configuration dict handed to `Trainer(cfg)` in the inference cell below
cfg = {
    "blend_warmup": True,
    "blend_type": "no",
    "blend_rate": [0] * 100,
    "train_blend_warmup": False,
    "n_epochs_blend_only": 5,

    "debug_mode": False, #True,

    "device": "cuda",
    "pt_model": "bert-base-uncased",
    "n_trials": 1,
    "n_epochs": 50,
    "learn_rate": 1e-5,
    "epsilon": 1e-8,
    "pivot_metric": "auc",
    "early_stop": 3,
    # Per-task class lists (and data paths); "causal" is configured here but is not in `tasks` below
    "tasks": {
        "main": {
            "classes": ["sup", "att", "neu"] 
        },
        "nli": {
            "data_paths": ["data/rsc/mnli-org.csv",
                           "data/rsc/antsyn-nli.csv"],
            "classes": ["ent", "con", "neu"],
        },
        "senti": {
            "data_paths": ["data/rsc/senti-irish.csv",
                           "data/rsc/senti-ldong.csv",
                           "data/rsc/senti-mm.csv",
                           "data/rsc/senti-sem17.csv",
                           "data/rsc/senti-norm.csv"],
            "classes": ["pos", "neg", "neu"],
        },
        "causal": {
            "data_paths": ["data/rsc/because-causal.csv",
                           "data/rsc/conet-causal.csv",
                           "data/rsc/pdtb-i-causal.csv",
                           "data/rsc/wiqa-causal.csv"],
            "classes": ["cause", "obstruct", "precede", "sync", "else"],
        },
        "normarg_polar": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["consist", "contrast"],
        },
        "normarg_jtype": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["norm", "conseq"],
        },
        # These classes are reused for both normarg_norm_senti and normarg_conseq_senti below
        "normarg_senti": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["positive", "negative"],
            "con_classes": ["advocate", "object"],
        },
 
    },
    "max_n_batch_tokens": 512*5,
    "max_batch_size": 6,
    "data_dir": None,
    "logs_dir": None,
    "save_model": False,
    "models_dir": "models",
    # Relation value (as a string) -> class index
    "rel2label": {
            "1": 0,  # support
            "-1": 1,  # attack
            "0": 2  # neutral
    }
}

# Tasks to register; "normarg" is shorthand that expands into four sub-tasks
tasks = ["main", "nli", "senti", "normarg"]


cfg["task2classes"] = {}
for task in tasks:
    if task != "normarg":
        cfg["task2classes"][task] = cfg["tasks"][task]["classes"]
        continue
    # Both *_senti sub-tasks share the class list of "normarg_senti"
    cfg["task2classes"].update({
        "normarg_polar": cfg["tasks"]["normarg_polar"]["classes"],
        "normarg_jtype": cfg["tasks"]["normarg_jtype"]["classes"],
        "normarg_norm_senti": cfg["tasks"]["normarg_senti"]["classes"],
        "normarg_conseq_senti": cfg["tasks"]["normarg_senti"]["classes"],
    })

In [None]:
# THIS BLOCK IS DEPRECATED
# This was our attempt to apply the logbert model for inference
# Original code from here: https://github.com/yohanjo/tacl_arg_rel


# NOTE(review): `Trainer` and `torch` are not imported by name anywhere in this
# notebook; presumably they arrive through this star import — confirm
from models import *
import os

MODEL = 'trained.model'

assert os.path.exists(MODEL), f"Model not found: {MODEL}"
# Prepare the trainer
trainer = Trainer(cfg)

# Warmup with blend tasks
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust
init_model = torch.load(MODEL, map_location='cuda')
trainer.init_model(init_model)
trainer.model = trainer.model.to('cuda')

# Prediction
keys = ["id", "con_text", "pre_text", "label_prob", "label_pred"]

print(f"Loading data...")
# batches = trainer.make_batches(summPairs, ["pre", "con"]) 
batches = trainer.make_batches(pairs, ["pre", "con"])

print(f"Classifying {sum(len(b["id"]) for b in batches)} replies over {len(batches)} batches:")

trainer.run_epoch(batches, "predict", "main")

# for batch in batches:
#     for i, id in enumerate(batch["id"]):
#         print(batch["pre_text"][i][:20],'\t',batch["con_text"][i][:20],'\t',batch["label_pred"][i])
# res: comment id -> [label_pred, label_prob, pre_text, con_text, summary]
# (`summs[ID][0]` because the summarizer cell stores each summary as a 1-tuple;
#  `ID` here is local to the comprehension and does not rebind the global `ID`)
res = {ID: [b["label_pred"][i], b["label_prob"][i], b["pre_text"][i], b["con_text"][i], summs[ID][0]] for b in batches for i, ID in enumerate(b["id"])}
# for ID in res:
#     print(f"{ID}: {res[ID]}")

In [9]:
# Using LLaMA and prompt engineering to perform argumentative relation classification (ARC)

from ctransformers import AutoModelForCausalLM

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere
PATH = "/home/bren/proj/F23_Rhetors/"
MODEL = "llama-2-13b-chat.Q5_K_M.gguf"

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-13B-chat-GGUF",
    model_file=f"{PATH}{MODEL}",
    model_type="llama",
    context_length=2048,
    gpu_layers=100,
	max_new_tokens=64,
    temperature=0.9,
)

# For every classified pair, ask the model whether the reply (P2, res[r][3])
# agrees with its parent (P1, res[r][2]) and append the raw answer to the row.
# NOTE(review): `res` comes from the deprecated classification cell above, and
# re-running this cell appends another answer to each row.
for r in res:
    prompt = f"""
    P1:
    {res[r][2]}
    P2:
    {res[r][3]}
    """
    # print(prompt)


    # Llama-2 chat wrapper: system instruction followed by the two statements
    IN = f"""
    [INST] <<SYS>>
    Compare the statements made by P1 and P2. Does P2 AGREE or DISAGREE? Respond with your decision.
    <</SYS>>
    {prompt}[/INST]
    """

    res[r].append(str(llm(IN)))


Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 21732.15it/s]
Fetching 0 files: 0it [00:00, ?it/s]


In [215]:
# Load model results previously saved to res.json
RES = None
with open("res.json", 'r') as res_file:
    RES = json.loads(res_file.read())

In [None]:
# Using LLaMA and prompt engineering to perform argumentative relation classification (ARC)

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# NOTE(review): PATH and MODEL are defined in the ctransformers cell above;
# this cell rebinds `llm` to the langchain wrapper
llm = LlamaCpp(
    model_path=f"{PATH}{MODEL}",
    n_gpu_layers=100,
    n_batch=256,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

# Same instruction as the ctransformers prompt, with {p1}/{p2} as template slots
template = """
[INST] <<SYS>>
Compare the statements made by P1 and P2. Does P2 AGREE or DISAGREE? Respond with your decision.
<</SYS>>
P1: {p1}
P2: {p2}[/INST]
"""

# Placeholder statements for a one-off manual run
p1 = """
...
"""

p2 = """
...
"""

prompt = PromptTemplate(template=template, input_variables=["p1", "p2"])

llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run({'p1':p1, 'p2':p2})

In [257]:
# Populating the tree with valid argument flows based on argumentative relation classification results

# Lower-case substrings looked for in the LLM answer.
# NOTE(review): `build` defaults to "agree" whenever no `disagrees` phrase is found,
# so this list currently has no effect on the outcome.
agrees = [
    "p2 agree",
    "both",
    "similar"
]

# Any of these substrings in the lower-cased LLM answer marks the reply as disagreeing
disagrees = [
    "p2 disagree",
    "disagrees",
    "different opinion",
    "different perspective",
    "different views",
]

# Graph that `build` draws the classified tree into
vis = Digraph(format='png')

def build(parID, cur):
    """Draw `cur` and, recursively, its replies into `vis`, and set cur["agree"].

    parID: id of the node the edge is drawn from.
    cur:   thread dict (reads "id", "threadScore", "score", "content", "replies").

    The final verdict is "disagree" when the LLM answer contains any phrase
    from `disagrees` or when the classifier's attack probability exceeds 0.75;
    otherwise it defaults to "agree". The label notes when that verdict differs
    from the classifier's own support-vs-attack comparison.
    """
    # Colour index from the thread's share of the largest threadScore; guard the
    # empty/negative cases so the fractional power never divides by zero or goes complex
    share = cur["threadScore"] / largest if largest else 0
    g = min(int(25 * (max(share, 0) * 100) ** .25), len(gradient) - 1)

    # RES row: [label_pred, label_prob, parent text, reply text, summary, LLM answer]
    _pred, prob, _parReply, _curReply, _summ, outLLM = RES[cur["id"]]
    prob = [round(x, 3) for x in prob]

    answer = outLLM.lower()
    resLLM = not any(x in answer for x in disagrees)  # Default to agree
    bertAgree = prob[0] > prob[1]
    if prob[1] > 0.75:
        resLLM = False

    content = textwrap.fill(cur["content"], 25)[:200] + ('...' if len(cur["content"]) > 200 else '')
    reclassified = f"\nReclassified {bertAgree}→{resLLM}" if bertAgree != resLLM else ""
    label = f"{cur['threadScore']:.2f} ({cur['score']:.2f}){reclassified} \n{prob}\n\n{content}"
    vis.node(cur["id"], label=label, shape='box', style='filled', fillcolor=gradient[g].hex_l)

    edge_color = 'green' if resLLM else 'red'
    edge_label = 'Agree' if resLLM else 'Disagree'

    vis.edge(parID, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)
    cur["agree"] = resLLM

    # Recurse for the replies
    for r in cur["replies"]:
        build(cur["id"], r)

# Add a root node to the graph with the topic
root = tree["root"]
vis.node(root["id"], label=topic, shape='box')

# Draw every top-level thread (and, recursively, its replies) beneath the root
for r in root['replies']:
    build(root["id"], r)

In [258]:
# Saves the current tree to tree.json and renders it using graphviz

pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)
# pp.pprint(tree)
# Context manager: tree.json is flushed and closed before a later cell reads it back
with open("tree.json", "w") as tree_file:
    json.dump(tree, tree_file, indent=4, sort_keys=False)

print(
    "Pruned comments:\n"
    f"\t{pruned[0]} with low engagement\n"
    f"\t{pruned[1]} were too short\n"
    f"\t{pruned[2]} from shallow threads\n"
)

vis.render('tree', view=True)

Pruned comments:
	43 with low engagement
	26 were too short
	49 from shallow threads



'tree.png'

F: Not sharing "/usr/share/icons" with sandbox: Path "/usr" is reserved by Flatpak


In [295]:
# Rebuild tree without "agree" nodes and crudely recalculate scores

# Load initial tree results (parsed afresh from tree.json)
pre = None
with open("tree.json", 'r') as tree_file:
    pre = json.loads(tree_file.read())
preVis = Digraph(format='png')

def rescore(parID, cur):
    """Remove agreeing replies below `cur`, lower its scores, and draw it into `preVis`.

    parID: id of the node the edge is drawn from.
    cur:   thread dict; its "replies", "threadScore" and "score" are modified in place.

    Returns the total threadScore removed from the subtree rooted at `cur`:
    the threadScore of each dropped direct reply plus the losses reported by
    the replies that were kept. The whole loss is subtracted from
    cur["threadScore"] and half of it from cur["score"].
    """
    # lookahead: total up and drop the direct replies marked as agreeing
    loss = sum(r["threadScore"] for r in cur["replies"] if r["agree"])
    cur["replies"] = [r for r in cur["replies"] if not r["agree"]]

    # Recurse for the replies
    for r in cur["replies"]:
        loss += rescore(cur["id"], r)

    cur["threadScore"] -= loss
    cur["score"] -= 0.5*loss

    # The subtraction can push threadScore below zero; a negative base under a
    # fractional power is complex and int() would raise, so clamp before shading
    share = cur["threadScore"] / largest if largest else 0
    g = min(int(25 * (max(share, 0) * 100) ** .25), len(gradient) - 1)

    content = textwrap.fill(cur["content"], 25)[:300] + ('...' if len(cur["content"]) > 300 else '')
    lossNote = f" (-{loss:.2f})" if loss != 0 else ""
    label = f"{cur['threadScore']:.2f}{lossNote}\n\n{content}"
    preVis.node(cur["id"], label=label, shape='box', style='filled', fillcolor=gradient[g].hex_l)

    edge_color = 'black' if cur["agree"] is None else ('green' if cur["agree"] else 'red')
    edge_label = '?' if cur["agree"] is None else ('Agree' if cur["agree"] else 'Disagree')

    preVis.edge(parID, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)

    return loss

# Bind `root` to the freshly loaded tree BEFORE using it, so this cell does not
# depend on a `root` left behind by an earlier cell
root = pre["root"]
preVis.node(root["id"], label=topic, shape='box')
for tlc in root["replies"]:
    rescore(root["id"], tlc)

preVis.render('pre', view=True)

'pre.png'

F: Not sharing "/usr/share/icons" with sandbox: Path "/usr" is reserved by Flatpak


In [298]:
# Finding the paths and assigning each comment to either agent A or agent B

import copy

# Deep copy: findPaths sorts reply lists and adds "agent" keys in place, and a
# shallow dict.copy() would share those nested objects with `pre`
paths = copy.deepcopy(pre)
pathVis = Digraph(format='png')

def findPaths(parID, cur):
    """Follow the best-ranked reply chain below `cur`, tagging agents and drawing into `pathVis`.

    parID: id of the node the edge is drawn from.
    cur:   thread dict; gains an "agent" key ('A' for odd depth, 'B' for even)
           and has its "replies" sorted in place, best first, by
           0.6*threadScore + 0.4*score.

    An agreeing comment is skipped unless it hangs directly off the root.
    Only the top-ranked reply of each visited comment is followed.
    """
    if cur["agree"] and parID != root["id"]:
        return
    cur["agent"] = 'A' if cur["depth"] % 2 == 1 else 'B'
    cur["replies"].sort(key=lambda x: (0.6*x["threadScore"]+0.4*x["score"]), reverse=True)

    # threadScore may be negative after rescoring; clamp so the fractional power
    # stays real and the index stays inside the gradient
    share = cur["threadScore"] / largest if largest else 0
    g = min(int(25 * (max(share, 0) * 100) ** .25), len(gradient) - 1)

    body = cur["content"]
    tail = '...' if len(body) > 2000 else ''
    label = f"{cur['agent']}:\n\n{textwrap.fill(body, 50)[:2000]}{tail}"
    pathVis.node(cur["id"], label=label, shape='box', style='filled', fillcolor=gradient[g].hex_l)

    edge_color = 'black' if cur["agree"] is None else ('green' if cur["agree"] else 'red')
    edge_label = '?' if cur["agree"] is None else ('Agree' if cur["agree"] else 'Disagree')

    pathVis.edge(parID, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)

    # Recurse into the single best reply
    if cur["replies"]:
        findPaths(cur["id"], cur["replies"][0])


# Bind `root` to the paths tree BEFORE using it, so the topic node does not
# depend on a `root` left behind by an earlier cell
root = paths["root"]
pathVis.node(root["id"], label=topic, shape='box')
root["replies"].sort(key=lambda x: (0.6*x["threadScore"]+0.4*x["score"]), reverse=True)
for tlc in root["replies"][:6]: # explore top 6 threads
    findPaths(root["id"], tlc)
    # print(tlc["threadScore"])

pathVis.render('paths', view=True)


'paths.png'

F: Not sharing "/usr/share/icons" with sandbox: Path "/usr" is reserved by Flatpak


In [300]:
# Generating a JSON script that can be fed into "TTS_bark.ipynb"

# exclude: 0-based positions in root["replies"] left out of the script
#          (NOTE(review): why position 1 is skipped is not recorded — confirm)
# debates: one nested agent/response chain per kept thread
exclude = [1]; debates = []

def build(r):
    """Return the debate chain starting at thread `r` as nested dicts.

    Each link is {"agent", "response", "next"}; "next" is the link for the
    first reply of the current thread, or None once a thread has no replies.
    NOTE: this reuses the name `build` of the graph-drawing function defined
    in an earlier cell and replaces it once this cell has run.
    """
    head = link = {"agent": r["agent"], "response": r["content"], "next": None}
    node = r
    while node["replies"]:
        node = node["replies"][0]
        link["next"] = {"agent": node["agent"], "response": node["content"], "next": None}
        link = link["next"]
    return head

# Keep the first six ranked threads, minus the positions listed in `exclude`
for i, r in enumerate(root["replies"]):
    if i == 6: break
    if i in exclude: continue
    debates.append(build(r))

script = {
    "topic": topic,
    "debates": debates
}

# pp.pprint(script)
# Context manager so script.json is flushed and closed once written
with open("script.json", "w") as script_file:
    json.dump(script, script_file, indent=4, sort_keys=False)