In [1]:
%%time

import pandas as pd
from tqdm import tqdm
from bigtree import Node
from common.storage.azure_file_storage import AzureFileStorageAdapter

file_system: AzureFileStorageAdapter = AzureFileStorageAdapter('data').get_file_storage()

CPU times: total: 6.3 s
Wall time: 9.43 s


In [6]:
%%time

submission_data: pd.DataFrame = pd.read_parquet("data/text/AskReddit.parquet", engine='pyarrow', filesystem=file_system)
submission_data['submission_id'] = submission_data['id']
display(submission_data)

Unnamed: 0,id,subreddit,title,selftext,author,score,upvote_ratio,ups,subreddit_id,num_comments,permalink,submission_id
0,zquyhv,AskReddit,what was ruined by rich people?,,Lattethecoffeaddict,42217.0,0.85,42217.0,t5_2qh1i,34788.0,/r/AskReddit/comments/zquyhv/what_was_ruined_b...,zquyhv
1,udrbbs,AskReddit,"People of reddit, what is one thing your famil...",,Belle_Artist_Jade,30022.0,0.91,30022.0,t5_2qh1i,10835.0,/r/AskReddit/comments/udrbbs/people_of_reddit_...,udrbbs
2,11artla,AskReddit,What do you think of the parents that kick the...,,zeg685,29103.0,0.88,29103.0,t5_2qh1i,12824.0,/r/AskReddit/comments/11artla/what_do_you_thin...,11artla
3,103a4x5,AskReddit,What's a sound you heard when you were young t...,,Ms_Mosa,26439.0,0.92,26439.0,t5_2qh1i,20322.0,/r/AskReddit/comments/103a4x5/whats_a_sound_yo...,103a4x5
4,vezgus,AskReddit,if you could have sex with any fictional chara...,,vatisitgrandpapa,22774.0,0.76,22774.0,t5_2qh1i,17355.0,/r/AskReddit/comments/vezgus/if_you_could_have...,vezgus
...,...,...,...,...,...,...,...,...,...,...,...,...
21122,ykvpd0,AskReddit,What is a movie you would not recommend at all?,,Open-Yak6678,0.0,0.33,0.0,t5_2qh1i,12.0,/r/AskReddit/comments/ykvpd0/what_is_a_movie_y...,ykvpd0
21123,ykvpj5,AskReddit,What is the only bad habit you want to change ...,,Jason_explorer101,0.0,0.50,0.0,t5_2qh1i,5.0,/r/AskReddit/comments/ykvpj5/what_is_the_only_...,ykvpj5
21124,ykvpl6,AskReddit,Therapist of reddit. What is your reason to be...,,ChiefsFan_sjc,0.0,0.44,0.0,t5_2qh1i,8.0,/r/AskReddit/comments/ykvpl6/therapist_of_redd...,ykvpl6
21125,yz6kje,AskReddit,Which Pokémon did you get ? Scarlet or violet ...,,elizabethepic,0.0,0.50,0.0,t5_2qh1i,12.0,/r/AskReddit/comments/yz6kje/which_pokémon_did...,yz6kje


CPU times: total: 359 ms
Wall time: 1.03 s


In [2]:
%%time

comment_data: pd.DataFrame = pd.read_parquet("data/text/AskReddit.comments.parquet", engine='pyarrow',filesystem=file_system)
display(comment_data)

Unnamed: 0,id,parent_id,link_id,submission_id,body,author,score
0,j2gyyz3,t3_100aze7,t3_100aze7,100aze7,Male: Maynard from Tool. \nFemale: Prob Whitne...,editor_of_the_beast,2
1,j2h3240,t3_100aze7,t3_100aze7,100aze7,"Mike Patton, his range, creativity, and techni...",TheParallax2,1
2,j2h9rdl,t3_100aze7,t3_100aze7,100aze7,Male: Philip Quast.\n\nFemale: Celine Dion.,Mysterious_Drama2772,1
3,j2gzi7n,t1_j2gyyz3,t3_100aze7,100aze7,"My #1 there (Whitney). Lot of lists of seen, s...",ryan0585,1
4,j2gswjy,t1_j2gqnti,t3_100aze7,100aze7,Freddie Mercury is 100% one of the best vocali...,ryan0585,2
...,...,...,...,...,...,...,...
240570,j2eka6p,t3_zzzrfn,t3_zzzrfn,zzzrfn,What kind of balls are we talking?,SlimSour,2
240571,j2euenj,t3_zzzrfn,t3_zzzrfn,zzzrfn,I hate that I can’t dislike the guy without pe...,Ebvardh-Boss,2
240572,j2ekhgq,t3_zzzrfn,t3_zzzrfn,zzzrfn,Well you know that Stormy Daniels won't be fou...,Cnnlgns,1
240573,j2ekiv2,t3_zzzrfn,t3_zzzrfn,zzzrfn,His wife gave one of them back?,Rikkrishub,1


CPU times: total: 2.17 s
Wall time: 3.89 s


In [3]:
from bigtree import Node, tree_to_dot


def build_tree(df: pd.DataFrame, parent_id: str, submission_id: str, depth: int = 0) -> Node:
	if depth > 10:
		return None
	node = Node(parent_id)
	children = df[df['parent_id'] == parent_id]
	for index, row in children.iterrows():
		child = build_tree(df, row['id'], submission_id, depth + 1)
		if child is not None:
			node.add_child(child)
	return node

In [4]:
def map_submission(x: object, lookup) -> str:
	subreddit: str = x['subreddit'] if x['subreddit'] is not None else ""
	title: str = x['title'] if x['title'] is not None else ""
	text: str = x['selftext'] if x['selftext'] is not None else ""
	template = f"<|start|>{subreddit}<|text|>{title}"
	parent_id_local = x['parent_id']
	link_id_local = x['link_id']
	body_local = x['body']
	submission_id = x['submission_id']
	if text != "":
		template += f"{text}"

	is_submission_reply = parent_id_local == link_id_local
	if is_submission_reply:
		template += f"<|comment|>{body_local}"
	else:
		searchable_id = parent_id_local.split("_")[1]
		while searchable_id != submission_id:
			parent_comment = lookup[lookup['id_comment'] == searchable_id]
			if len(parent_comment) == 0:
				break
			else:
				parent_replies = parent_comment['body'].values[0]
				searchable_id = parent_comment['parent_id'].values[0].split("_")[1]
				template += f"<|reply|>{parent_replies}"
		pbar.update(1)
	pbar.update(1)
	return template + "<|end|>"

In [7]:
display(submission_data.head(10))

Unnamed: 0,id,subreddit,title,selftext,author,score,upvote_ratio,ups,subreddit_id,num_comments,permalink,submission_id
0,zquyhv,AskReddit,what was ruined by rich people?,,Lattethecoffeaddict,42217.0,0.85,42217.0,t5_2qh1i,34788.0,/r/AskReddit/comments/zquyhv/what_was_ruined_b...,zquyhv
1,udrbbs,AskReddit,"People of reddit, what is one thing your famil...",,Belle_Artist_Jade,30022.0,0.91,30022.0,t5_2qh1i,10835.0,/r/AskReddit/comments/udrbbs/people_of_reddit_...,udrbbs
2,11artla,AskReddit,What do you think of the parents that kick the...,,zeg685,29103.0,0.88,29103.0,t5_2qh1i,12824.0,/r/AskReddit/comments/11artla/what_do_you_thin...,11artla
3,103a4x5,AskReddit,What's a sound you heard when you were young t...,,Ms_Mosa,26439.0,0.92,26439.0,t5_2qh1i,20322.0,/r/AskReddit/comments/103a4x5/whats_a_sound_yo...,103a4x5
4,vezgus,AskReddit,if you could have sex with any fictional chara...,,vatisitgrandpapa,22774.0,0.76,22774.0,t5_2qh1i,17355.0,/r/AskReddit/comments/vezgus/if_you_could_have...,vezgus
5,xx37vs,AskReddit,What was the dumbest thing you did while losin...,,EmptyTailor6721,14043.0,0.89,14043.0,t5_2qh1i,5311.0,/r/AskReddit/comments/xx37vs/what_was_the_dumb...,xx37vs
6,ujjx84,AskReddit,What gets men in the mood for sex?,,BeforeImeturfather,13816.0,0.78,13816.0,t5_2qh1i,7386.0,/r/AskReddit/comments/ujjx84/what_gets_men_in_...,ujjx84
7,ukl24g,AskReddit,What is the one thing you should always buy new?,,Idkewokorsomthing,13401.0,0.93,13401.0,t5_2qh1i,9338.0,/r/AskReddit/comments/ukl24g/what_is_the_one_t...,ukl24g
8,wyqi1s,AskReddit,What invention would you want to see in your l...,,dramafan1,11169.0,0.95,11169.0,t5_2qh1i,7774.0,/r/AskReddit/comments/wyqi1s/what_invention_wo...,wyqi1s
9,vtb9m9,AskReddit,What does America do better than Europe?,,Einsex,10487.0,0.78,10487.0,t5_2qh1i,16593.0,/r/AskReddit/comments/vtb9m9/what_does_america...,vtb9m9


In [10]:
%%time
from tqdm import notebook
import dask.dataframe as dd

merged = pd.merge(comment_data, submission_data, on=["submission_id"], how="left", suffixes=("_comment", "_submission"))
lookup_df = merged.copy()

temp = merged.copy().head(1000)

with notebook.tqdm(total=len(merged), desc="Foo") as pbar:
	ddf = dd.from_pandas(temp, npartitions=32)
	merged['template'] = ddf.apply(lambda x: map_submission(x, lookup_df), meta=('str', object), axis=1).compute()

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
display(merged['template'].head(100))

Foo:   0%|          | 0/240575 [00:00<?, ?it/s]

0                                                                                                                                                                                                                                                                                                                                                                                                    <|start|>AskReddit<|text|>Ignoring performance flair and popularity, who is the best male or female vocalist of all time?<|comment|>Male: Maynard from Tool. \nFemale: Prob Whitney Houston.<|end|>
1                                                                                                                                                                                                                                                                                            <|start|>AskReddit<|text|>Ignoring performance flair and popularity, who is the best male or female vocalist of all time?<|comment

CPU times: total: 34.5 s
Wall time: 36.9 s


In [16]:
%%time

def my_fucking_use_case(x: object) -> str:
	template = "<|startoftext|>"
	title = x['title']
	self_text = x['selftext']
	daddy_id = x['parent_id']
	fucking_reposter_id = x['link_id']

	if daddy_id == fucking_reposter_id:
		template += f"<|context|>{title}"
		return template
	if self_text:
		template += f"<|context|>{title} - {self_text}"
	else:
		template += f"<|context|>{title}"
	return template

CPU times: total: 0 ns
Wall time: 1 ms


In [15]:
import bigtree
%%time

forest = []
my_fucking_business_ids = list(submission_data.submission_id.unique())



zquyhv
udrbbs
11artla
103a4x5
vezgus
xx37vs
ujjx84
ukl24g
wyqi1s
vtb9m9
xgkifv
y4zoje
vsisp6
vram9i
v23lad
x63pmm
uhchly
yus4r8
z2iu0o
zk65zb
vp43im
xuhbk4
u69axz
108ukpf
xwnjao
ty9i6m
109pewj
v90nq3
uaraq9
z3dwxp
xfp41f
w0jo85
xyb98n
xd5jqh
uczr5x
1183ye0
11w1vhx
1273lf0
xanku2
v7yf6p
wlf789
10hi371
xbgk75
11arzfq
wjqur2
upox4b
u9axj3
ukya17
x3ozkw
zwlmds
wg4obr
ujjsuw
v3sn9g
uyg1cm
10tgr0t
wetr1x
xnl3mc
10ak8wn
wn2bk8
v7i4xy
wn2dix
xpeo1z
ttmz3s
vrqyqw
uoztr4
u44iha
xuwkex
wjqw31
ykvhh7
uhcj85
10ny3vz
10n555e
v189su
10uwpho
x8n8st
u2mjh9
x20yce
v4wrci
ydgy5x
ztobk0
wn23yd
vp42ka
xkz5vc
uo9k1n
yog2ih
u7sxv3
tv5rhm
1202a01
xybd7h
y3baoj
w8nf2l
10w22f8
vbwcvi
10jw7c7
yr8iv0
10d49ov
10jwbci
xheqoh
xkz3y0
vqkj7v
v6c4ad
zbm81f
x4ilhf
1223pae
xkz80n
x0c7az
100b0h3
xi9tsu
zz6vlt
wnuioz
tuf7pn
wadguu
v0ixh4
xd5dvc
10gmnf3
yhr2le
101xd9t
ukye10
vfog6u
w0jqh5
ua1y6t
uu69m9
10rfxvj
v6nplm
w0jpj4
wetsjp
wadid6
10455mg
xnlbzs
x3ox75
zz6tbr
ya5z6s
y8fnef
w2w3pg
x6xsiy
10jw6xb
wgiozt
v47hs0
z8xzcu
w

In [42]:
from tqdm import notebook

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

processed_lines = []
records = merged.to_dict(orient='records')
for record in records:
	parent_id = record['parent_id']
	link_id = record['link_id']
	if parent_id == link_id:  # top level comment
		continue
	else:
		continue
	# if id_type == 't3':
	#     continue
	# print(record)
	# else:
	#     print(record['template'])
	# processed_lines.append(record['template'])

Unnamed: 0,id_comment,parent_id,link_id,submission_id,body,author_comment,score_comment,id_submission,subreddit,title,selftext,author_submission,score_submission,upvote_ratio,ups,subreddit_id,num_comments,permalink,template
0,j2gyyz3,t3_100aze7,t3_100aze7,100aze7,Male: Maynard from Tool. \nFemale: Prob Whitney Houston.,editor_of_the_beast,2,100aze7,AskReddit,"Ignoring performance flair and popularity, who is the best male or female vocalist of all time?",,ryan0585,2.0,0.75,2.0,t5_2qh1i,5.0,/r/AskReddit/comments/100aze7/ignoring_performance_flair_and_popularity_who_is/,"<|start|>AskReddit<|text|>Ignoring performance flair and popularity, who is the best male or female vocalist of all time?<|comment|>Male: Maynard from Tool. \nFemale: Prob Whitney Houston."
1,j2h3240,t3_100aze7,t3_100aze7,100aze7,"Mike Patton, his range, creativity, and techniques are out of this world. \n\nNina Simone, just a personal favorite and her vibrato sends shivers down my spine.",TheParallax2,1,100aze7,AskReddit,"Ignoring performance flair and popularity, who is the best male or female vocalist of all time?",,ryan0585,2.0,0.75,2.0,t5_2qh1i,5.0,/r/AskReddit/comments/100aze7/ignoring_performance_flair_and_popularity_who_is/,"<|start|>AskReddit<|text|>Ignoring performance flair and popularity, who is the best male or female vocalist of all time?<|comment|>Mike Patton, his range, creativity, and techniques are out of this world. \n\nNina Simone, just a personal favorite and her vibrato sends shivers down my spine."
2,j2h9rdl,t3_100aze7,t3_100aze7,100aze7,Male: Philip Quast.\n\nFemale: Celine Dion.,Mysterious_Drama2772,1,100aze7,AskReddit,"Ignoring performance flair and popularity, who is the best male or female vocalist of all time?",,ryan0585,2.0,0.75,2.0,t5_2qh1i,5.0,/r/AskReddit/comments/100aze7/ignoring_performance_flair_and_popularity_who_is/,"<|start|>AskReddit<|text|>Ignoring performance flair and popularity, who is the best male or female vocalist of all time?<|comment|>Male: Philip Quast.\n\nFemale: Celine Dion."
3,j2gzi7n,t1_j2gyyz3,t3_100aze7,100aze7,"My #1 there (Whitney). Lot of lists of seen, she's not even in the top 10...",ryan0585,1,100aze7,AskReddit,"Ignoring performance flair and popularity, who is the best male or female vocalist of all time?",,ryan0585,2.0,0.75,2.0,t5_2qh1i,5.0,/r/AskReddit/comments/100aze7/ignoring_performance_flair_and_popularity_who_is/,"<|reply|>My #1 there (Whitney). Lot of lists of seen, she's not even in the top 10..."
4,j2gswjy,t1_j2gqnti,t3_100aze7,100aze7,Freddie Mercury is 100% one of the best vocalists of the past century.,ryan0585,2,100aze7,AskReddit,"Ignoring performance flair and popularity, who is the best male or female vocalist of all time?",,ryan0585,2.0,0.75,2.0,t5_2qh1i,5.0,/r/AskReddit/comments/100aze7/ignoring_performance_flair_and_popularity_who_is/,<|reply|>Freddie Mercury is 100% one of the best vocalists of the past century.


In [None]:
# extant_data.to_parquet("data/text/AskReddit.comments.parquet", engine='pyarrow', filesystem=file_system)

In [None]:
# %%time
# foo = pandas.read_parquet("data/text/AskReddit.comments.parquet", engine='pyarrow', filesystem=file_system)
#
# display(foo)