In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
WORKING_DIR = '.'

# Both datasets are read with orient='index'; the resulting index labels are
# strings that contain a 'blog/entry/<digits>' fragment (see the regex below).
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient='index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient='index')


def _unique_round_ids(index):
    """Return the distinct 'blog/entry/<digits>' fragments of the labels, in first-seen order."""
    seen = OrderedDict()
    for label in index:
        # Same extraction as before: first capture group of the first match.
        seen[re.search(r'(blog/entry/[0-9]+)', label).groups()[0]] = None
    return list(seen)


educ_rounds = _unique_round_ids(dataset_educ.index)
div_rounds = _unique_round_ids(dataset_div.index)

In [2]:
import numpy as np
import random
RANDOM_SEED = 443
DEVICE = "cuda"

# Seed both generators before the shuffle so the split below is repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


def _strip_query(key):
    """Return the part of an index label that precedes the first '?'."""
    return key.split("?")[0]


# One (key, sub-frame) pair per distinct label prefix.
groups = [pair for pair in dataset_educ.groupby(_strip_query)]
random.shuffle(groups)

# Positional split of the shuffled pairs: first 10, next 3, everything else.
train_groups, validation_groups, test_groups = groups[:10], groups[10:13], groups[13:]

div_groups = [pair for pair in dataset_div.groupby(_strip_query)]


In [3]:
from collections import defaultdict
for round_key, round_rows in train_groups:
    print(round_key)

    # Tally how often each value of the 'label' column occurs in this group.
    labels = defaultdict(int)
    for label in round_rows['label']:
        labels[label] += 1
    print(labels)

https://codeforces.com/blog/entry/101790
defaultdict(<class 'int'>, {'Irrelevant': 16, 'HintExp': 15, 'QuestionExp': 12, 'SolutionExp': 3, 'SubQuestion': 1, 'AlgoExp': 1})
https://codeforces.com/blog/entry/103835
defaultdict(<class 'int'>, {'Irrelevant': 38, 'SubQuestion': 10, 'FixingExp': 6, 'QuestionExp': 5, 'HintExp': 8, 'SolutionExp': 5})
https://codeforces.com/blog/entry/103163
defaultdict(<class 'int'>, {'SubQuestion': 6, 'FixingExp': 5, 'Irrelevant': 35, 'HintExp': 14, 'QuestionExp': 13, 'SolutionExp': 3, 'AlgoExp': 3})
https://codeforces.com/blog/entry/106805
defaultdict(<class 'int'>, {'TimeComExp': 5, 'HintExp': 14, 'QuestionExp': 8, 'Irrelevant': 41, 'SolutionExp': 4, 'SubQuestion': 2, 'FixingExp': 4, 'TestExp': 1})
https://codeforces.com/blog/entry/101161
defaultdict(<class 'int'>, {'Irrelevant': 12, 'QuestionExp': 4, 'HintExp': 3, 'SubQuestion': 1, 'FixingExp': 1, 'TestQuestion': 2})
https://codeforces.com/blog/entry/99136
defaultdict(<class 'int'>, {'Irrelevant': 18, 'Hin

In [4]:
import re
import os
import bs4
# Matches one <a ...>...</a> element whose opening tag contains
# /contest/.../submission/... after "href". Each [^>]* keeps that part of the
# match inside the opening tag and .*? stops at the first closing </a>, so a
# single match can no longer span several links (or the text between them) the
# way the previous greedy '.*' did.
# Raw strings avoid the invalid '\/' and '\s' escape sequences.
regex_link_ful = re.compile(r'<a href[^>]*/contest/[^>]*/submission/[^>]*>.*?</a>')
# Matches one <code>...</code> span, shortest match first; re.DOTALL lets '.'
# cross newlines, which is what the former '(\s|.)*?' alternation emulated.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)

def preprocess(text, problem):
    """Reduce a comment's HTML to plain text with placeholders.

    Steps, in order:
      1. every match of ``code_regex`` is replaced by ' (code) ';
      2. every match of ``regex_link_ful`` is replaced by
         ' (link to problem {problem}) ';
      3. the remaining markup is stripped by BeautifulSoup.

    Args:
        text: HTML string to clean.
        problem: value interpolated into the link placeholder.

    Returns:
        The text content of the rewritten HTML.
    """
    t_codes = code_regex.sub(' (code) ', text)
    t_link = regex_link_ful.sub(f' (link to problem {problem}) ', t_codes)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted text could
    # differ between machines. 'html.parser' ships with Python.
    bs = bs4.BeautifulSoup(t_link, 'html.parser')

    return bs.text

def get_graph_root_nodes(round):
    """Build the reply graph of a frame of rows and list its root nodes.

    Args:
        round: DataFrame with at least the columns ``id`` and ``father_id``.

    Returns:
        ``(root_nodes, graph)`` where ``graph`` maps every ``id`` to the list
        of ids whose ``father_id`` equals it (in row order), and ``root_nodes``
        lists, in row order, the ids whose ``father_id`` is not an ``id`` of
        this frame.
    """
    # First pass: register every id so that parents appearing later in the
    # frame than their children are still recognised in the second pass.
    graph = {comment.id: [] for _, comment in round.iterrows()}
    node_indegree = dict.fromkeys(graph, 0)

    # Second pass: attach each row to its parent when the parent is present.
    for _, comment in round.iterrows():
        if comment.father_id not in graph:
            continue
        graph[comment.father_id].append(comment.id)
        node_indegree[comment.id] += 1

    root_nodes = [node for node, indegree in node_indegree.items() if indegree == 0]
    return root_nodes, graph

def dfs_pretty_print(node, graph, round, level = 0):
    """Serialise the subtree rooted at ``node`` in depth-first pre-order.

    Args:
        node: id of the row to start from.
        graph: mapping id -> list of child ids.
        round: DataFrame holding the rows; the columns ``id``, ``father_id``,
            ``text``, ``problem`` and ``label`` are read.
        level: recursion depth; passed down incremented but not otherwise used.

    Returns:
        ``(txt_input, txt_output)``: two strings with one line per visited
        row. Both lines start with the same ``(father_id, id)`` prefix; the
        input line carries the preprocessed text, the output line the label.
    """
    comment = round[round.id == node].iloc[0]
    comment_id = comment.id
    # A parent that is not a key of the graph is reported as -1.
    father_id = comment.father_id if comment.father_id in graph else -1
    preprocessed_text = preprocess(comment.text, comment.problem)

    input_chunks = [f'({father_id, comment_id}) <<===>> {preprocessed_text} [<<<new_com>>>]\n']
    output_chunks = [f'({father_id, comment_id}) <<===>> {comment.label} [<<<new_com>>>]\n']

    # A leaf has no children, so the loop is skipped and only its own line is returned.
    for child_node in graph[node]:
        child_input, child_output = dfs_pretty_print(child_node, graph, round, level + 1)
        input_chunks.append(child_input)
        output_chunks.append(child_output)

    return ''.join(input_chunks), ''.join(output_chunks)
    

def write_prompt_files(groups, base_folder):
    """Write one input/output text file pair per root node of every group.

    Replaces two copy-pasted loops. Keeping the loop variables local also
    stops the notebook from rebinding the builtin ``round`` at module level,
    which would break any later ``round(...)`` call.

    Args:
        groups: sequence of ``(key, DataFrame)`` pairs.
        base_folder: directory under which one sub-folder per group, named by
            the group's position in ``groups``, is created.
    """
    for idx, (_, comments) in enumerate(groups):
        prompt_folder = f'{base_folder}/{idx}'
        os.makedirs(prompt_folder, exist_ok=True)

        root_nodes, graph = get_graph_root_nodes(comments)
        for node in root_nodes:
            txt_input, txt_output = dfs_pretty_print(node, graph, comments, 0)
            with open(f'{prompt_folder}/{node}_input.txt', 'w', encoding='utf-8') as fp:
                fp.write(txt_input)
            with open(f'{prompt_folder}/{node}_output.txt', 'w', encoding='utf-8') as fp:
                fp.write(txt_output)


write_prompt_files(test_groups, 'prompt_generation/educ/test')
write_prompt_files(div_groups, 'prompt_generation/div')