In [1]:
import praw
import pprint
import textwrap
import json

from math import sqrt, log
from hashlib import sha1
from graphviz import Digraph
from credentials import ID, SECRET
from praw.models import MoreComments as more

In [2]:
reddit = praw.Reddit(
    client_id=ID,
    client_secret=SECRET,
    user_agent="The Rhetor Project"
)

ID = "17llow9"
post = reddit.submission(id=ID)
topic = post.title[5:]
desc = post.selftext
comments = post.comments

In [3]:
from nltk import word_tokenize, sent_tokenize
# nltk.download()
from bs4 import BeautifulSoup as soup
from nltk.sentiment import SentimentIntensityAnalyzer
from colour import Color

# (Inadequate) sentiment analysis
def sentiment(reply):
	sents = sent_tokenize(reply)[0]
	SIA = SentimentIntensityAnalyzer()
	return SIA.polarity_scores(sents)["compound"]

def clean(raw):
	parsed = soup(raw)

	# Remove all <blockquotes>
	for b in parsed.find_all('blockquote'):
		b.extract()
	
	return parsed.get_text()

lo, hi = Color("#ffffff"), Color("#ffea80")
gradient = list(lo.range_to(hi, 101))

In [4]:
pruned = [0, 0, 0]
largest = 0
pairs = []

def process(parent, comment):
    cur = {
        "content": clean(comment.body_html),
        "id": comment.id,
        "name": sha1(f"{comment.author.name}".encode()).hexdigest(),
        "depth": comment.depth+1,
        "votes": 0 if comment.score_hidden else abs(comment.score),
        "ranked": True if comment.author_flair_text else False,
        "spicy": bool(comment.controversiality),
        "score": 0,
        "threadScore": 0,
        "below": False, # max # layers below
        "agree": None,
        "replies": []
    }
    
    # Prune
    if not cur["spicy"] and cur["votes"] < 3:
        pruned[0] += 1; return
    # if len(cur["content"]) < 50:
    wordCount = len(word_tokenize(cur["content"]))
    if wordCount < 15: pruned[1] += 1; return

    maxBelow = 0
    for r in comment.replies:
        if isinstance(r, more) or not r.author: continue
        replyThread = process(cur, r)  # Recurse
        if replyThread: 
            cur["replies"].append(replyThread)
            cur["threadScore"] += replyThread["score"]
            if replyThread["below"] is not False:
                if replyThread["below"] > maxBelow: maxBelow = replyThread["below"]

    # Depth/below calc
    if not cur["replies"]:
        cur["below"] = 0 # False -> 0: hit a leaf
        if cur["depth"] < 3: 
            pruned[2] += 1; return
    else:
        cur["below"] = maxBelow + 1

    # Score calculation
    S = 1.2 if cur["spicy"] else 1   # boost controvserial comment scores
    R = 1.4 if cur["ranked"] else 1  # favor "seasoned" commenters
    L = sqrt((wordCount-10)/5)       # encourage longer comments
    D = cur["depth"]                 # boost deep comment scores
    B = (cur["below"]+.5)/2          # devalue no/shallow subthreads

    cur["score"] = S*R*L*D*B*cur["votes"]
    cur["threadScore"] += cur["score"]
    global largest
    if cur["threadScore"] > largest: largest = cur["threadScore"]

    # (Dis)agreement
    # sen = sentiment(cur["content"])
    # cur["agree"] = False if sen < -0.7 else (True if sen > 0.9 else None)

    # Save pairs for classification en masse
    pairs.append({
        "id": cur["id"],
        "pre_text": topic if not parent else parent["content"], 
        # "pre_text": topic if not parent else ''.join(sent_tokenize(parent["content"])[:2]),
        "con_text": cur["content"],
        # "con_text": ''.join(sent_tokenize(cur["content"])[:2]),
    })

    return cur 


def traverse(comments):
    threads = []
    for c in comments: # Each TLC
        if isinstance(c, more) or c.stickied: continue
        if c.author is None: continue
        thread = process({}, c)
        if thread: 
            threads.append(thread)
            # print(f"{thread['threadScore']:0.2f}")
    return threads

tree = {
    "root": {
        "id": ID,
        "content": topic,
        "replies": traverse(comments)
    }
}


In [46]:
from transformers import pipeline
summarizer = pipeline("summarization", model="stevied67/pegasus-subreddit-comments-summarizer", device = 0)
summPairs = []; summIn = [] 

for p in pairs: # Summarizer wants a flat list of pre/con pairs
     summIn.append(p["pre_text"])
     summIn.append(p["con_text"])
     # print(p["pre_text"], p["con_text"], "\n\n")

print("Summarizing content...")
summOut = summarizer(summIn, max_length = 16)
print("Done")


i = 0
for p in pairs:
    summPairs.append({
        "id": p["id"],
        "pre_text": summOut[i]['summary_text'],
        "con_text": summOut[i+1]['summary_text'],
    })
    i += 2

# for p in summPairs:
#      print(p["pre_text"], p["con_text"], "\n\n")


Summarizing content...
Done


In [41]:
cfg = {
    "blend_warmup": True,
    "blend_type": "no",
    "blend_rate": [0] * 100,
    "train_blend_warmup": False,
    "n_epochs_blend_only": 5,

    "debug_mode": False, #True,

    "device": "cuda",
    "pt_model": "bert-base-uncased",
    "n_trials": 1,
    "n_epochs": 50,
    "learn_rate": 1e-5,
    "epsilon": 1e-8,
    "pivot_metric": "auc",
    "early_stop": 3,
    "tasks": {
        "main": {
            "classes": ["sup", "att", "neu"] 
        },
        "nli": {
            "data_paths": ["data/rsc/mnli-org.csv",
                           "data/rsc/antsyn-nli.csv"],
            "classes": ["ent", "con", "neu"],
        },
        "senti": {
            "data_paths": ["data/rsc/senti-irish.csv",
                           "data/rsc/senti-ldong.csv",
                           "data/rsc/senti-mm.csv",
                           "data/rsc/senti-sem17.csv",
                           "data/rsc/senti-norm.csv"],
            "classes": ["pos", "neg", "neu"],
        },
        "causal": {
            "data_paths": ["data/rsc/because-causal.csv",
                           "data/rsc/conet-causal.csv",
                           "data/rsc/pdtb-i-causal.csv",
                           "data/rsc/wiqa-causal.csv"],
            "classes": ["cause", "obstruct", "precede", "sync", "else"],
        },
        "normarg_polar": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["consist", "contrast"],
        },
        "normarg_jtype": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["norm", "conseq"],
        },
        "normarg_senti": {
            "data_path": "data/rsc/normarg.csv",
            "classes": ["positive", "negative"],
            "con_classes": ["advocate", "object"],
        },
 
    },
    "max_n_batch_tokens": 512*5,
    "max_batch_size": 6,
    "data_dir": None,
    "logs_dir": None,
    "save_model": False,
    "models_dir": "models",
    "rel2label": {
            "1": 0,  # support
            "-1": 1,  # attack
            "0": 2  # neutral
    }
}

tasks = ["main"] + ["nli", "senti", "normarg"]


cfg["task2classes"] = {}
for task in tasks:
    if task == "normarg":
        cfg["task2classes"]["normarg_polar"] = cfg["tasks"]["normarg_polar"]["classes"]
        cfg["task2classes"]["normarg_jtype"] = cfg["tasks"]["normarg_jtype"]["classes"]
        cfg["task2classes"]["normarg_norm_senti"] = cfg["tasks"]["normarg_senti"]["classes"]
        cfg["task2classes"]["normarg_conseq_senti"] = cfg["tasks"]["normarg_senti"]["classes"]
    else:
        cfg["task2classes"][task] = cfg["tasks"][task]["classes"]

In [47]:
from models import *
import os

MODEL = 'trained.model'

assert os.path.exists(MODEL), f"Model not found: {MODEL}"
# Prepare the trainer
trainer = Trainer(cfg)

# Warmup with blend tasks
init_model = torch.load(MODEL, map_location='cuda')
trainer.init_model(init_model)
trainer.model = trainer.model.to('cuda')

# Prediction
keys = ["id", "con_text", "pre_text", "label_prob", "label_pred"]

print(f"Loading data...")
batches = trainer.make_batches(summPairs, ["pre", "con"])

print(f"Classifying {sum(len(b["id"]) for b in batches)} replies over {len(batches)} batches:")

trainer.run_epoch(batches, "predict", "main")

# for batch in batches:
#     for i, id in enumerate(batch["id"]):
#         print(batch["pre_text"][i][:20],'\t',batch["con_text"][i][:20],'\t',batch["label_prob"][i],'\t',batch["label_pred"][i])
res = {ID: [b["label_pred"][i], b["label_prob"][i], b["pre_text"][i], b["con_text"][i]] for b in batches for i, ID in enumerate(b["id"])}
# for ID in res:
#     print(f"{ID}: {res[ID]}")

Loading pretrained model...
Loading data...
Classifying 92 replies over 16 batches:


100%|██████████| 16/16 [00:00<00:00, 75.21it/s]


In [49]:
vis = Digraph(format='png')

def build(parID, cur):
    # print(res[cur["id"]])
    g = int(25*(cur["threadScore"]/largest*100)**(.25)+0)
    
    pred, prob, parReply, curReply = res[cur["id"]]
    prob = [round(x,3) for x in prob]
    label = f"{cur['threadScore']:.2f} ({cur['score']:.2f})\n{prob}\n{textwrap.fill(cur["content"], 25)[:200]}{'...' if len(cur["content"]) > 200 else ''}\n\n{textwrap.fill(curReply, 25)}"
    vis.node(cur["id"], label=label, shape='box', style='filled', fillcolor=gradient[g].hex_l)

    # cur["agree"] = None if pred == 2 else not bool(pred)
    cur["agree"] = True if (prob[0] > prob[1]) else False
    edge_color = 'black' if cur["agree"] is None else ('green' if cur["agree"] else 'red')
    edge_label = '?' if cur["agree"] is None else ('Agree' if cur["agree"] else 'Disagree')

    vis.edge(parID, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)

    # Recurse for the replies
    for r in cur["replies"]:
        build(cur["id"], r)

# Add a root node to the graph with the topic
root = tree["root"]
vis.node(root["id"], label=topic, shape='box')

for r in root['replies']:
    build(root["id"], r)

In [50]:
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)
# pp.pprint(tree)
json.dump(tree, open("tree.json", "w"), indent=4, sort_keys=False)

print(
	"Pruned comments:\n"
	f"\t{pruned[0]} with low engagement\n"
	f"\t{pruned[1]} were too short\n"
	f"\t{pruned[2]} from shallow threads\n"
)

vis.render('tree', view=True)

Pruned comments:
	41 with low engagement
	27 were too short
	48 from shallow threads



F: Not sharing "/usr/share/icons" with sandbox: Path "/usr" is reserved by Flatpak


'tree.png'




(loupe:2): GLib-GObject-CRITICAL **: 22:47:16.826: g_object_weak_unref: couldn't find weak ref 0x7f44fe152fb0((nil))


In [51]:
script = tree
vis2 = Digraph(format='png')

def build(parID, cur):
    if cur["agree"]: return
    cur["replies"].sort(key=lambda x: x["threadScore"], reverse=True)
    g = int(25*(cur["threadScore"]/largest*100)**(.25)+0)
    
    label = f"{textwrap.fill(cur["content"], 50)[:1000]}{'...' if len(cur["content"]) > 1000 else ''}"
    vis2.node(cur["id"], label=label, shape='box', style='filled', fillcolor=gradient[g].hex_l)

    edge_color = 'black' if cur["agree"] is None else ('green' if cur["agree"] else 'red')
    edge_label = '?' if cur["agree"] is None else ('Agree' if cur["agree"] else 'Disagree')

    vis2.edge(parID, cur["id"], label=edge_label, color=edge_color, fontcolor=edge_color)

    # Recurse for the replies
    if cur["replies"]:
        build(cur["id"], cur["replies"][0])


vis2.node(root["id"], label=topic, shape='box')
root = script["root"]
root["replies"].sort(key=lambda x: x["threadScore"], reverse=True)
for tlc in root["replies"][:5]: # explore top 3 threads
    build(root["id"], tlc)
    print(tlc["threadScore"])


json.dump(script, open("script.json", "w"), indent=4, sort_keys=False)

vis2.render('script', view=True)


4418.807721995921
3497.675988525242
762.2989688884827
704.5943014568893
645.8767594107447


F: Not sharing "/usr/share/icons" with sandbox: Path "/usr" is reserved by Flatpak


'script.png'


(loupe:2): GLib-GObject-CRITICAL **: 23:06:15.803: g_object_weak_unref: couldn't find weak ref 0x7f3a8df52fb0((nil))


In [3]:
from ctransformers import AutoModelForCausalLM

PATH = "/home/bren/proj/F23_Rhetors/"
MODEL = "llama-2-13b-chat.Q5_K_M.gguf"

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-13B-chat-GGUF",
    model_file=f"{PATH}{MODEL}",
    model_type="llama",
    context_length=2048,
    gpu_layers=100,
	max_new_tokens=64,
    temperature=0.9,

)

prompt = """
P1:
...
P2:
...
"""

IN = f"""
[INST] <<SYS>>
Compare the statements made by P1 and P2. Does P2 AGREE or DISAGREE? Respond with your decision.
<</SYS>>
{prompt}[/INST]
"""

OUT = str(llm(IN))
print(OUT, end='\n\n')
res = None
if 'DISAGREE' in OUT.upper(): res = False
elif 'AGREE' in OUT.upper(): res = True


print(res)

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 20560.31it/s]
Fetching 0 files: 0it [00:00, ?it/s]


Based on the statements made by P1 and P2, it is clear that they have different perspectives on the issue of censorship on left-leaning subs on Reddit.

P1 states that left-leaning subs on Reddit regularly censor and delete posts and comments that are

None


In [6]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path=f"{PATH}{MODEL}",
    n_gpu_layers=100,
    n_batch=256,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

template = """
[INST] <<SYS>>
Compare the statements made by P1 and P2. Does P2 AGREE or DISAGREE? Respond with your decision.
<</SYS>>
P1: {p1}
P2: {p2}[/INST]
"""

p1 = """
...
"""

p2 = """
...
"""

prompt = PromptTemplate(template=template, input_variables=["p1", "p2"])

llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run({'p1':p1, 'p2':p2})

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /home/bren/proj/F23_Rhetors/llama-2-13b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K     [  5120,  

Based on the statements made by P1 and P2, I would say that P2 DISAGREES with P1. Here's why:

P2 questions the evidence provided by P1 to support their statement, and challenges the notion that leftist moderators on Reddit censor or delete posts and comments that are contrarian to their views. P2 also shares their own experience, stating that they have seen overzealous moderation but not deletion of comments based on political views.

Furthermore, P2 highlights the difference between posting a leftist opinion in a conservative subreddit (where it will get you banned)


llama_print_timings:        load time =   17891.19 ms
llama_print_timings:      sample time =      17.95 ms /   140 runs   (    0.13 ms per token,  7800.31 tokens per second)
llama_print_timings: prompt eval time =   26023.45 ms /   372 tokens (   69.96 ms per token,    14.29 tokens per second)
llama_print_timings:        eval time =   34030.77 ms /   139 runs   (  244.83 ms per token,     4.08 tokens per second)
llama_print_timings:       total time =   60418.55 ms


"Based on the statements made by P1 and P2, I would say that P2 DISAGREES with P1. Here's why:\n\nP2 questions the evidence provided by P1 to support their statement, and challenges the notion that leftist moderators on Reddit censor or delete posts and comments that are contrarian to their views. P2 also shares their own experience, stating that they have seen overzealous moderation but not deletion of comments based on political views.\n\nFurthermore, P2 highlights the difference between posting a leftist opinion in a conservative subreddit (where it will get you banned)"