In [1]:
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Restore the code-quality DatasetDict that was previously saved to disk,
# then print it to show the available splits, features and row counts.
dataset_dir = '../data/hf_code_qual_dataset_v1'
code_qual_dataset = DatasetDict.load_from_disk(dataset_dir)
print(code_qual_dataset)

DatasetDict({
    train: Dataset({
        features: ['problem_id', 'problem', 'submission_id', 'submission', 'label'],
        num_rows: 1650
    })
    validation: Dataset({
        features: ['problem_id', 'problem', 'submission_id', 'submission', 'label'],
        num_rows: 300
    })
    test: Dataset({
        features: ['problem_id', 'problem', 'submission_id', 'submission', 'label'],
        num_rows: 300
    })
})


In [3]:
# Convert every split to a pandas DataFrame (train, validation, test order).
cq_train, cq_validation, cq_test = (
    code_qual_dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

# Stack the three splits into one frame. The per-split row indices are kept
# as-is, so index values repeat across splits.
cq_all = pd.concat([cq_train, cq_validation, cq_test])
print(cq_all.shape)

(2250, 5)


In [5]:
# Partition the pooled data by label value (2 -> "high", 1 -> "medium",
# 0 -> "low", following the variable names).
label_values = cq_all['label']
cq_all_high, cq_all_medium, cq_all_low = (
    cq_all[label_values == value] for value in (2, 1, 0)
)

# Draw 75 rows per label with a fixed seed so the sample is reproducible.
sample_high, sample_medium, sample_low = (
    subset.sample(n=75, random_state=42)
    for subset in (cq_all_high, cq_all_medium, cq_all_low)
)

# Combine the high, medium and low samples, shuffle the rows (seeded), and
# renumber them from 0.
sample_df = pd.concat([sample_high, sample_medium, sample_low])
final = (
    sample_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the shuffled sample without the index column.
final.to_csv("../data/sample_human_main_v1.csv", index=False)

final

Unnamed: 0,problem_id,problem,submission_id,submission,label
0,p03838,Snuke has a calculator. It has a display and t...,s814078068,"x,y=map(int,input().split())\n\nans=10**12\nif...",2
1,p03108,There are N islands and M bridges.\nThe i-th b...,s489040369,"class UnionFind:\n def __init__(self, n):\n...",0
2,p02789,Takahashi is participating in a programming co...,s945302680,"A, B = map(int, input().split())\n\nif (A == B...",1
3,p02729,"We have N+M balls, each of which has an intege...",s656845952,"from scipy.special import comb\nN,M = input()....",0
4,p03253,You are given positive integers N and M.\nHow ...,s396776378,import sys\nsys.setrecursionlimit(10 ** 8)\nMO...,1
...,...,...,...,...,...
220,p02853,We held two competitions: Coding Contest and R...,s309428626,"money = [0, 300000, 200000, 100000]\na,b = map...",1
221,p03946,"There are N towns located in a line, convenien...",s446886761,import sys\nfrom collections import Counter\n\...,2
222,p02972,There are N empty boxes arranged in a row from...,s400237734,"N = int(input())\nAlist = list(map(int,input()...",1
223,p03494,There are N positive integers written on a bla...,s588074493,num = int(input())\nnu_list = [int(v) for v in...,0


In [6]:
# Load the human-annotated copies of the sample.
# Naming: h<annotator>t<round>, mirroring the `_a<annotator>_t<round>` suffix
# of each file name (presumably annotator id and annotation round -- confirm).

# Files with suffix t0, annotators a0..a4
h0t0, h1t0, h2t0, h3t0, h4t0 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/sample_human_main_v1_a0_t0.csv',
        '../data/sample_human_main_v1_a1_t0.csv',
        '../data/sample_human_main_v1_a2_t0.csv',
        '../data/sample_human_main_v1_a3_t0.csv',
        '../data/sample_human_main_v1_a4_t0.csv',
    )
)

# Files with suffix t1, annotators a0..a4
h0t1, h1t1, h2t1, h3t1, h4t1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/sample_human_main_v1_a0_t1.csv',
        '../data/sample_human_main_v1_a1_t1.csv',
        '../data/sample_human_main_v1_a2_t1.csv',
        '../data/sample_human_main_v1_a3_t1.csv',
        '../data/sample_human_main_v1_a4_t1.csv',
    )
)

In [10]:
# Inter-annotator agreement report (Cohen's kappa).
#
# Inputs (defined in earlier cells):
#   h<i>t<j> : annotation DataFrames read from the `_a<i>_t<j>.csv` files; only
#              their 'human_label' column is used here.
#   final    : the shuffled sample; its 'label' column is treated as the
#              reference ("actual") label.
#
# NOTE(review): every comparison is positional -- row k of one frame is
# compared with row k of the other. This assumes all annotation files keep the
# same row order as `final`; verify before trusting the numbers.
from sklearn.metrics import cohen_kappa_score

# Kappa between neighbouring annotator pairs (h0-h1, h1-h2, h2-h3, h3-h4,
# h4-h0) on the t0 files. Only these five of the ten possible pairs are scored.
h0_h1_t0 = cohen_kappa_score(h0t0['human_label'], h1t0['human_label'])
h1_h2_t0 = cohen_kappa_score(h1t0['human_label'], h2t0['human_label'])
h2_h3_t0 = cohen_kappa_score(h2t0['human_label'], h3t0['human_label'])
h3_h4_t0 = cohen_kappa_score(h3t0['human_label'], h4t0['human_label'])
h4_h0_t0 = cohen_kappa_score(h4t0['human_label'], h0t0['human_label'])

# Same annotator pairs on the t1 files.
h0_h1_t1 = cohen_kappa_score(h0t1['human_label'], h1t1['human_label'])
h1_h2_t1 = cohen_kappa_score(h1t1['human_label'], h2t1['human_label'])
h2_h3_t1 = cohen_kappa_score(h2t1['human_label'], h3t1['human_label'])
h3_h4_t1 = cohen_kappa_score(h3t1['human_label'], h4t1['human_label'])
h4_h0_t1 = cohen_kappa_score(h4t1['human_label'], h0t1['human_label'])

# Self-agreement: each annotator's t0 labels against their own t1 labels.
h0_t0_t1 = cohen_kappa_score(h0t0['human_label'], h0t1['human_label'])
h1_t0_t1 = cohen_kappa_score(h1t0['human_label'], h1t1['human_label'])
h2_t0_t1 = cohen_kappa_score(h2t0['human_label'], h2t1['human_label'])
h3_t0_t1 = cohen_kappa_score(h3t0['human_label'], h3t1['human_label'])
h4_t0_t1 = cohen_kappa_score(h4t0['human_label'], h4t1['human_label'])

# Each annotator's t0 labels against the reference labels.
h0_t0_actual = cohen_kappa_score(h0t0['human_label'], final['label'])
h1_t0_actual = cohen_kappa_score(h1t0['human_label'], final['label'])
h2_t0_actual = cohen_kappa_score(h2t0['human_label'], final['label'])
h3_t0_actual = cohen_kappa_score(h3t0['human_label'], final['label'])
h4_t0_actual = cohen_kappa_score(h4t0['human_label'], final['label'])

# Each annotator's t1 labels against the reference labels.
h0_t1_actual = cohen_kappa_score(h0t1['human_label'], final['label'])
h1_t1_actual = cohen_kappa_score(h1t1['human_label'], final['label'])
h2_t1_actual = cohen_kappa_score(h2t1['human_label'], final['label'])
h3_t1_actual = cohen_kappa_score(h3t1['human_label'], final['label'])
h4_t1_actual = cohen_kappa_score(h4t1['human_label'], final['label'])


# print the kappa values between annotators (mean of the t0 and t1 scores)
print("Kappa between h0 and h1: ", (h0_h1_t0 + h0_h1_t1)/2)
print("Kappa between h1 and h2: ", (h1_h2_t0 + h1_h2_t1)/2)
print("Kappa between h2 and h3: ", (h2_h3_t0 + h2_h3_t1)/2)
print("Kappa between h3 and h4: ", (h3_h4_t0 + h3_h4_t1)/2)
# A former extra line labelled "h4 and h0" printed the h3/h4 average by
# mistake. It was removed: the h4/h0 pair is reported once, below.
print("Kappa between h0 and h4: ", (h4_h0_t0 + h4_h0_t1)/2)

# print self-kappa values
print("Kappa for h0 for t0 and t1: ", h0_t0_t1)
print("Kappa for h1 for t0 and t1: ", h1_t0_t1)
print("Kappa for h2 for t0 and t1: ", h2_t0_t1)
print("Kappa for h3 for t0 and t1: ", h3_t0_t1)
print("Kappa for h4 for t0 and t1: ", h4_t0_t1)

# print the kappa values between annotators and the actual labels
# (mean of the t0 and t1 scores)
print("Kappa between h0 and actual labels: ", (h0_t0_actual + h0_t1_actual)/2)
print("Kappa between h1 and actual labels: ", (h1_t0_actual + h1_t1_actual)/2)
print("Kappa between h2 and actual labels: ", (h2_t0_actual + h2_t1_actual)/2)
print("Kappa between h3 and actual labels: ", (h3_t0_actual + h3_t1_actual)/2)
print("Kappa between h4 and actual labels: ", (h4_t0_actual + h4_t1_actual)/2)

# calculate the average kappa between the annotators and the actual labels
# (unweighted mean over all ten annotator/file scores)
avg_kappa = (h0_t0_actual + h0_t1_actual + h1_t0_actual + h1_t1_actual + h2_t0_actual + h2_t1_actual + h3_t0_actual + h3_t1_actual + h4_t0_actual + h4_t1_actual)/10
print("Average Kappa between annotators and actual labels: ", avg_kappa)

Kappa between h0 and h1:  0.673378835466741
Kappa between h1 and h2:  0.6231199861302938
Kappa between h2 and h3:  0.6002255731909464
Kappa between h3 and h4:  0.6066992068609085
Kappa between h4 and h0:  0.6066992068609085
Kappa between h0 and h4:  0.6565480427046264
Kappa for h0 for t0 and t1:  0.713435223031811
Kappa for h1 for t0 and t1:  0.6263677608753669
Kappa for h2 for t0 and t1:  0.5801865837405598
Kappa for h3 for t0 and t1:  0.6063050505649634
Kappa for h4 for t0 and t1:  0.5869970690114575
Kappa between h0 and actual labels:  0.8533333333333333
Kappa between h1 and actual labels:  0.7966666666666666
Kappa between h2 and actual labels:  0.7566666666666666
Kappa between h3 and actual labels:  0.8033333333333333
Kappa between h4 and actual labels:  0.7833333333333333
Average Kappa between annotators and actual labels:  0.7986666666666666
