In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
WORKING_DIR = '.'

dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')


educ_rounds = list(OrderedDict.fromkeys([re.search(r'(blog/entry/[0-9]+)', idx).groups()[0] for idx in dataset_educ.index]))
div_rounds = list(OrderedDict.fromkeys([re.search(r'(blog/entry/[0-9]+)', idx).groups()[0] for idx in dataset_div.index]))

In [2]:
import numpy as np
import random
RANDOM_SEED = 443
DEVICE = "cuda"

np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)

train_groups = groups[:10]
validation_groups = groups[10:13]
test_groups = groups[13:]


In [3]:
from collections import defaultdict
for g in train_groups:
   print(g[0])
   labels = defaultdict(int)

   for l in g[1]['label']:
      labels[l]+=1
   print(labels)

https://codeforces.com/blog/entry/101790
defaultdict(<class 'int'>, {'Irrelevant': 16, 'HintExp': 15, 'QuestionExp': 12, 'SolutionExp': 3, 'SubQuestion': 1, 'AlgoExp': 1})
https://codeforces.com/blog/entry/103835
defaultdict(<class 'int'>, {'Irrelevant': 38, 'SubQuestion': 10, 'FixingExp': 6, 'QuestionExp': 5, 'HintExp': 8, 'SolutionExp': 5})
https://codeforces.com/blog/entry/103163
defaultdict(<class 'int'>, {'SubQuestion': 6, 'FixingExp': 5, 'Irrelevant': 35, 'HintExp': 14, 'QuestionExp': 13, 'SolutionExp': 3, 'AlgoExp': 3})
https://codeforces.com/blog/entry/106805
defaultdict(<class 'int'>, {'TimeComExp': 5, 'HintExp': 14, 'QuestionExp': 8, 'Irrelevant': 41, 'SolutionExp': 4, 'SubQuestion': 2, 'FixingExp': 4, 'TestExp': 1})
https://codeforces.com/blog/entry/101161
defaultdict(<class 'int'>, {'Irrelevant': 12, 'QuestionExp': 4, 'HintExp': 3, 'SubQuestion': 1, 'FixingExp': 1, 'TestQuestion': 2})
https://codeforces.com/blog/entry/99136
defaultdict(<class 'int'>, {'Irrelevant': 18, 'Hin

In [6]:
import re
import os

REFERENCE_ROUND_INDEX = 5

def get_graph_root_nodes(round):
    graph = {}
    node_indegree = {}

    for _, comment in round.iterrows():
        graph[comment.id] = []
        node_indegree[comment.id] = 0

    for _, comment in round.iterrows():
        if(comment.father_id in graph):
            graph[comment.father_id].append(comment.id)
            node_indegree[comment.id]+=1

    root_nodes = []

    for node, indegree in node_indegree.items():
        if(indegree == 0):
            root_nodes.append(node)
    return root_nodes, graph

def dfs_pretty_print(node, graph, round, level = 0):
    comment = round[round.id == node].iloc[0]
    father_id = -1
    if(comment.father_id in graph):
        father_id = comment.father_id
    id = comment.id
    preprocessed_text = comment.text
    preprocessed_text = re.sub(r'<pre>(\s|.)*?<\/pre>', ' <code> ', preprocessed_text)
    preprocessed_text = re.sub(r'<a href.*\/contest/.*/submission/.*<\/a>', ' <submission link> ', preprocessed_text)
    text= preprocessed_text
    txt_input = f'({father_id, id}) => {text}\n'
    txt_output = f'({father_id, id}) => {comment.label}\n'

    if(len(graph[node]) == 0):
        return txt_input, txt_output
    else:
        for child_node in graph[node]:
            input, output = dfs_pretty_print(child_node, graph, round, level + 1)
            txt_input = txt_input + input
            txt_output = txt_output + output
        return txt_input, txt_output
    

for idx, test_round in enumerate(test_groups):
    PROMPT_FOLDER =f'prompt_generation/educ/test/{idx}'
    os.makedirs(PROMPT_FOLDER, exist_ok=True)

    reference_root_nodes, reference_graph = get_graph_root_nodes(train_groups[REFERENCE_ROUND_INDEX][1])

    for node in reference_root_nodes:
        txt_input, txt_output = dfs_pretty_print(node, reference_graph, train_groups[REFERENCE_ROUND_INDEX][1], 0)
        with open(f'{PROMPT_FOLDER}/ref_{node}_input.txt', 'w', encoding='utf-8') as fp:
            fp.write(txt_input)
        with open(f'{PROMPT_FOLDER}/ref_{node}_output.txt', 'w', encoding='utf-8') as fp:
            fp.write(txt_output)
        #print(txt)
        #print('#'*30)
    
    root_nodes, graph = get_graph_root_nodes(test_round[1])
    for node in root_nodes:
        txt_input,txt_output = dfs_pretty_print(node, graph, test_round[1], 0)
        with open(f'{PROMPT_FOLDER}/{node}_input.txt', 'w', encoding='utf-8') as fp:
            fp.write(txt_input)
        with open(f'{PROMPT_FOLDER}/{node}_output.txt', 'w', encoding='utf-8') as fp:
            fp.write(txt_output)
        #print(txt)
        #print('#'*30)

In [51]:
def dfs_pretty_print(node, level = 0):
    comment = dataset_round[dataset_round.id == node].iloc[0]
    father_id = -1
    if(comment.father_id in graph):
        father_id = comment.father_id
    id = comment.id
    txt = f'({father_id, id}) => {comment.label}\n'
    if(len(graph[node]) == 0):
        return txt
    else:
        for child_node in graph[node]:
            txt = txt + dfs_pretty_print(child_node, level + 1)
        return txt

for node in root_nodes:
    txt_input = dfs_pretty_print(node, 0)
    with open(f'{PROMPT_FOLDER}/{node}_labels.txt', 'w') as fp:
        fp.write(txt_input)
    print(txt_input)
    print('#'*30)

((-1, 889303)) => Irrelevant
((889303, 889318)) => Irrelevant
((889303, 901155)) => Irrelevant

##############################
((-1, 889304)) => QuestionExp
((889304, 889308)) => HintExp
((889308, 889314)) => Irrelevant
((889308, 889342)) => QuestionExp
((889342, 889528)) => HintExp
((889342, 889598)) => Irrelevant

##############################
((-1, 889315)) => Irrelevant

##############################
((-1, 889499)) => SubQuestion
((889499, 889537)) => FixingExp
((889537, 889554)) => Irrelevant
((889537, 896394)) => Irrelevant
((889537, 897491)) => QuestionExp

##############################
((-1, 889560)) => Irrelevant
((889560, 890386)) => Irrelevant

##############################
((-1, 889571)) => QuestionExp
((889571, 889745)) => Irrelevant

##############################
((-1, 889597)) => Irrelevant
((889597, 922830)) => SubQuestion
((922830, 922863)) => Irrelevant
((922863, 922895)) => Irrelevant

##############################
((-1, 889602)) => HintExp
((889602, 892309)) =

In [52]:
from sklearn.metrics import classification_report

CHATGPT_35_LABELLING = """((-1, 898006)) => Irrelevant
((-1, 898022)) => SolutionExp
((898022, 898032)) => SolutionExp
((-1, 898047)) => Irrelevant
((898047, 899199)) => Irrelevant
((-1, 898148)) => QuestionExp
((-1, 898189)) => Irrelevant
((-1, 898194)) => SubQuestion
((898194, 899136)) => SubQuestion
((899136, 899247)) => SubQuestion
((-1, 898210)) => QuestionExp
((898210, 898507)) => HintExp
((898507, 898570)) => TextExp
((-1, 898695)) => QuestionExp
((898695, 899133)) => FixingExp
((899133, 899293)) => TextExp
((-1, 898850)) => Irrelevant
((898850, 907744)) => AlgoExp
((-1, 903085)) => Irrelevant
((-1, 917278)) => TestQuestion
((-1, 943179)) => TestQuestion
((943179, 943930)) => HintExp
((943930, 952208)) => TextExp
"""

lines = CHATGPT_35_LABELLING.split('\n')
labels_re = re.compile('([-]*\d+), (\d+).* (SolutionExp|HintExp|AlgoExp|TimeComExp|FixingExp|TextExp|QuestionExp|SubQuestion|TestQuestion|Irrelevant)')

predicted_labels = {}
for line in lines:
    match = labels_re.search(line)

    if(match != None):
        predicted_labels[int(match.group(2))] = match.group(3)

for label in truth_labels.keys():
    if(label not in predicted_labels):
        predicted_labels[label] = 'Irrelevant'

for label in predicted_labels.keys():
    if label not in truth_labels:
        del predicted_labels[label]

truth_labels_list = [label for _, label in sorted(list(truth_labels.items()), key = lambda x : x[0])]
predicted_labels_list = [label for _, label in sorted(list(predicted_labels.items()), key = lambda x : x[0])]

truth_labels_list = [l if l == 'Irrelevant' else 'Relevant' for l in truth_labels_list]
predicted_labels_list = [l if l == 'Irrelevant' else 'Relevant' for l in predicted_labels_list]

assert len(truth_labels_list) == len(predicted_labels_list), f'Number truth labels: {len(truth_labels_list)} Number predicted labels: {len(predicted_labels_list)}'

print(classification_report(truth_labels_list, predicted_labels_list))

RuntimeError: dictionary changed size during iteration