In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
# Directory that holds the dataset JSON files; '.' means the current working directory.
WORKING_DIR = '.'

# Load the two datasets. With orient='index' pandas reads JSON shaped like
# {row_label: {column: value}}, so each top-level key becomes a row.
# NOTE(review): neither frame is referenced in the cells visible here — confirm they are used later.
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')

In [2]:
import re
from sklearn.metrics import classification_report
import os
import difflib

# Every label a comment can carry. print_result() uses this list both to find the
# ground-truth label in a comment and to snap a predicted label to the closest entry.
all_labels = [
    'SolutionExp',
    'HintExp',
    'AlgoExp',
    'TimeComExp',
    'FixingExp',
    'TestExp',
    'QuestionExp',
    'SubQuestion',
    'TestQuestion',
    'Irrelevant',
]

def print_result(type_tree, round_type):
    """Print a Relevant/Irrelevant classification report for one prediction run.

    Walks ``prompt_generation/{round_type}``. For every file whose name ends in
    ``output.txt`` it reads the ground-truth labels from that file and the
    predicted labels from
    ``chatgpt_predictions/{type_tree}/{round_type}/{round_id}/{prefix}_prediction.txt``,
    where ``round_id`` is the name of the directory containing the output file
    and ``prefix`` is the part of the file name before the first underscore.

    Labels are keyed by comment id. Only ids present in both the ground truth
    and the predictions are scored, and every label other than ``'Irrelevant'``
    is collapsed to ``'Relevant'`` before the report is computed.

    Args:
        type_tree: Sub-directory of ``chatgpt_predictions`` to read predictions
            from (the notebook passes ``'no_tree'`` and ``'tree'``).
        round_type: Path below ``prompt_generation`` (and below the prediction
            directory) to evaluate, e.g. ``'div'`` or ``'educ/test'``.

    Returns:
        None. The comment counts and the sklearn classification report are
        printed to stdout.

    Raises:
        IndexError: If a ground-truth comment contains no comment id or none
            of the labels in ``all_labels``.
        OSError: If an expected prediction file cannot be opened.
    """
    comment_separator = '[<<<new_com>>>]'
    label_separator = '<<===>>'
    # One or more '(', optional '-' signs, digits, a comma, one whitespace
    # character, optional '(', then the captured digits (the comment id)
    # followed by at least one ')'.
    comment_id_pattern = r'\(+-*[0-9]+,[\s]\(*([0-9]+)\)+'
    label_pattern = '|'.join(all_labels)

    predicted_labels = {}
    truth_labels = {}

    for root, _folders, files in os.walk(f'prompt_generation/{round_type}'):
        # Use the platform-aware basename: splitting on '\\' only yields the
        # directory name on Windows and returns the whole path elsewhere.
        round_id = os.path.basename(os.path.normpath(root))
        for file_name in files:
            if not file_name.endswith('output.txt'):
                continue

            with open(f"{root}/{file_name}", "r") as fp:
                output_text = fp.read()
            prediction_name = file_name.split('_')[0] + '_prediction.txt'
            with open(f"chatgpt_predictions/{type_tree}/{round_type}/{round_id}/{prediction_name}", "r") as fp:
                prediction_text = fp.read()

            # [:-1] drops whatever follows the final separator.
            for comment in output_text.split(comment_separator)[:-1]:
                # Ground truth is expected to be well formed, so a missing id
                # or label fails loudly here instead of being skipped.
                comment_id = re.findall(comment_id_pattern, comment)[0]
                truth_labels[comment_id] = re.findall(label_pattern, comment)[0]

            for comment in prediction_text.split(comment_separator)[:-1]:
                id_matches = re.findall(comment_id_pattern, comment)
                parts = comment.split(label_separator)
                # Model output may be malformed; entries without an id or
                # without a label separator are skipped and therefore show up
                # in the "Missing comments" count instead of crashing the run.
                if not id_matches or len(parts) < 2:
                    continue
                # cutoff=0 makes difflib always return its best guess, so any
                # predicted text is mapped onto some entry of all_labels.
                closest = difflib.get_close_matches(parts[1], all_labels, n=1, cutoff=0)[0]
                predicted_labels[id_matches[0]] = closest

    truth_ids = set(truth_labels)
    predicted_ids = set(predicted_labels)

    print(f"Total comments: {len(truth_labels)}")
    print(f"Missing comments: {len(truth_ids - predicted_ids)}")
    print(f"Extra comments: {len(predicted_ids - truth_ids)}")

    # Sorted only to make the evaluation order deterministic; the report
    # itself does not depend on the order of the samples.
    common_comment_ids = sorted(truth_ids & predicted_ids)

    common_predicted_labels = [
        'Irrelevant' if predicted_labels[comment_id] == 'Irrelevant' else 'Relevant'
        for comment_id in common_comment_ids
    ]
    common_truth_labels = [
        'Irrelevant' if truth_labels[comment_id] == 'Irrelevant' else 'Relevant'
        for comment_id in common_comment_ids
    ]

    print(classification_report(common_truth_labels, common_predicted_labels))
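# NOTE: leftover draft kept for reference only; print_result() now performs this
# Relevant/Irrelevant collapse itself on its own local dictionaries.
# predicted_labels = ['Relevant' if l!= 'Irrelevant' else l for l in predicted_labels]
# truth_labels = ['Relevant' if l!= 'Irrelevant' else l for l in truth_labels]
# print(classification_report(truth_labels, predicted_labels))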

In [3]:
# Score the 'no_tree' predictions against the rounds under prompt_generation/educ/test.
print_result(type_tree='no_tree', round_type='educ/test')

Total comments: 160
Missing comments: 0
Extra comments: 0
              precision    recall  f1-score   support

  Irrelevant       0.90      0.59      0.72        74
    Relevant       0.73      0.94      0.82        86

    accuracy                           0.78       160
   macro avg       0.81      0.77      0.77       160
weighted avg       0.81      0.78      0.77       160



In [4]:
# Score the 'no_tree' predictions against the rounds under prompt_generation/div.
print_result(type_tree='no_tree', round_type='div')

Total comments: 504
Missing comments: 0
Extra comments: 0
              precision    recall  f1-score   support

  Irrelevant       0.97      0.64      0.77       393
    Relevant       0.42      0.94      0.58       111

    accuracy                           0.70       504
   macro avg       0.70      0.79      0.68       504
weighted avg       0.85      0.70      0.73       504



In [5]:
# Score the 'tree' predictions against the rounds under prompt_generation/educ/test.
print_result(type_tree='tree', round_type='educ/test')

Total comments: 160
Missing comments: 2
Extra comments: 2
              precision    recall  f1-score   support

  Irrelevant       0.91      0.53      0.67        74
    Relevant       0.70      0.95      0.80        84

    accuracy                           0.75       158
   macro avg       0.80      0.74      0.74       158
weighted avg       0.79      0.75      0.74       158



In [6]:
# Score the 'tree' predictions against the rounds under prompt_generation/div.
print_result(type_tree='tree', round_type='div')

Total comments: 504
Missing comments: 7
Extra comments: 2
              precision    recall  f1-score   support

  Irrelevant       0.99      0.56      0.71       387
    Relevant       0.39      0.98      0.55       110

    accuracy                           0.65       497
   macro avg       0.69      0.77      0.63       497
weighted avg       0.86      0.65      0.68       497

