In [1]:
import pandas as pd
import os
import csv
import random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import statistics

In [2]:
# Load the source CSV into a DataFrame and show its row count.
df_all = pd.read_csv("./2015_100_skill_builders_main_problems.csv")
df_all.shape[0]

708631

In [3]:
# Keep only the rows whose 'correct' value equals 0.0 or 1.0, and discard
# the 'log_id' column from the result.
df_filtered = (
    df_all
    .loc[df_all['correct'].eq(0.0) | df_all['correct'].eq(1.0)]
    .drop(columns=['log_id'])
)

In [4]:
# qid (sid) mapper: gather the distinct sequence ids that occur in the data
questions = list(set(df_filtered['sequence_id']))

In [5]:
# Put the ids in ascending order, then number them consecutively from 1.
questions.sort()
qnum = snum = len(questions)
qmap = {qid: idx for idx, qid in enumerate(questions, start=1)}

In [6]:
# Show the mapping from original sequence ids to their 1-based indices.
qmap

{5898: 1,
 5918: 2,
 5945: 3,
 5961: 4,
 5962: 5,
 5965: 6,
 5968: 7,
 5969: 8,
 5976: 9,
 6009: 10,
 6018: 11,
 6022: 12,
 6039: 13,
 6402: 14,
 6465: 15,
 6473: 16,
 6849: 17,
 6851: 18,
 6891: 19,
 6910: 20,
 6913: 21,
 6921: 22,
 6937: 23,
 6943: 24,
 7012: 25,
 7014: 26,
 7020: 27,
 7035: 28,
 7155: 29,
 7156: 30,
 7157: 31,
 7158: 32,
 7159: 33,
 7160: 34,
 7165: 35,
 7166: 36,
 7167: 37,
 7179: 38,
 7182: 39,
 7183: 40,
 7184: 41,
 7185: 42,
 7192: 43,
 7195: 44,
 7196: 45,
 8585: 46,
 8741: 47,
 8864: 48,
 8928: 49,
 8946: 50,
 8949: 51,
 9180: 52,
 9222: 53,
 9244: 54,
 9245: 55,
 9423: 56,
 9424: 57,
 9428: 58,
 10195: 59,
 10264: 60,
 10265: 61,
 10293: 62,
 10597: 63,
 10763: 64,
 10765: 65,
 10767: 66,
 10833: 67,
 11829: 68,
 11831: 69,
 11836: 70,
 11893: 71,
 11898: 72,
 11899: 73,
 12450: 74,
 13731: 75,
 13935: 76,
 14168: 77,
 14211: 78,
 14247: 79,
 14442: 80,
 15528: 81,
 19362: 82,
 19610: 83,
 21257: 84,
 24173: 85,
 26902: 86,
 31260: 87,
 31277: 88,
 31825: 89,

In [7]:
# Replace each original sequence id with its index from qmap.
# NOTE: this overwrites the column, so the cell raises KeyError if it is run
# a second time without rebuilding df_filtered first.
qs = [qmap[raw_id] for raw_id in df_filtered["sequence_id"]]
df_filtered["sequence_id"] = qs

df_filtered.head(10)

Unnamed: 0,user_id,sequence_id,correct
0,50121,26,0.0
1,50121,26,1.0
2,50121,26,1.0
3,50121,26,1.0
4,50964,26,1.0
5,50964,26,1.0
6,50964,26,0.0
7,50964,26,1.0
8,50964,26,1.0
9,50964,26,1.0


In [8]:
# corrects: store the correctness flags as plain Python ints
corrects = [int(flag) for flag in df_filtered['correct']]
df_filtered['correct'] = corrects

# rename columns
df_filtered = df_filtered.rename(
    columns={'sequence_id': 'q_id', 'correct': 'correctness'}
)

df_filtered.head(10)

Unnamed: 0,user_id,q_id,correctness
0,50121,26,0
1,50121,26,1
2,50121,26,1
3,50121,26,1
4,50964,26,1
5,50964,26,1
6,50964,26,0
7,50964,26,1
8,50964,26,1
9,50964,26,1


In [9]:
def load_gc_cnt_mapper(path):
    """Load a ``key,gc,cnt`` text file into a dictionary.

    Every non-blank line must hold exactly three comma-separated fields.
    The first field becomes the dictionary key; the other two are stored
    unparsed, as strings, in a two-element list.

    Args:
        path: Path of the file to read.

    Returns:
        dict mapping each key (str) to ``[gc, cnt]`` (both str). Callers
        convert the values themselves, e.g. ``float(d[k][0])``.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            fields.
    """
    d_gc_cnt = {}
    with open(path, "r") as f:
        # Iterate the file directly rather than loading every line first.
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. an empty line at the end of the file)
                # holds no record; unpacking it would raise ValueError.
                continue
            k, gc, cnt = line.split(",")
            d_gc_cnt[k] = [gc, cnt]
    return d_gc_cnt

In [10]:
# correctness rate

# overall: mean of the 0/1 correctness flags over all responses
print(f'overall: {sum(corrects) / len(corrects)}')

# avg: read the stored entries from qid_to_gc_cnt.csv and keep the first
# value of each one as a float
qgc_dict = load_gc_cnt_mapper("qid_to_gc_cnt.csv")
gcs = [float(entry[0]) for entry in qgc_dict.values()]

print(f'avg per Q: {sum(gcs) / len(gcs)}')
print(f'max: {max(gcs)}')
print(f'min: {min(gcs)}')
print(f'med: {statistics.median(gcs)}')
print(f'std: {statistics.stdev(gcs)}')

overall: 0.7317611410337218
avg per Q: 0.7512829629239137
max: 0.9442231075697212
min: 0.5086448004068141
med: 0.745126732083469
std: 0.10238295027482883


In [11]:
# Print the full id-to-index mapping as plain text.
print(qmap)

{5898: 1, 5918: 2, 5945: 3, 5961: 4, 5962: 5, 5965: 6, 5968: 7, 5969: 8, 5976: 9, 6009: 10, 6018: 11, 6022: 12, 6039: 13, 6402: 14, 6465: 15, 6473: 16, 6849: 17, 6851: 18, 6891: 19, 6910: 20, 6913: 21, 6921: 22, 6937: 23, 6943: 24, 7012: 25, 7014: 26, 7020: 27, 7035: 28, 7155: 29, 7156: 30, 7157: 31, 7158: 32, 7159: 33, 7160: 34, 7165: 35, 7166: 36, 7167: 37, 7179: 38, 7182: 39, 7183: 40, 7184: 41, 7185: 42, 7192: 43, 7195: 44, 7196: 45, 8585: 46, 8741: 47, 8864: 48, 8928: 49, 8946: 50, 8949: 51, 9180: 52, 9222: 53, 9244: 54, 9245: 55, 9423: 56, 9424: 57, 9428: 58, 10195: 59, 10264: 60, 10265: 61, 10293: 62, 10597: 63, 10763: 64, 10765: 65, 10767: 66, 10833: 67, 11829: 68, 11831: 69, 11836: 70, 11893: 71, 11898: 72, 11899: 73, 12450: 74, 13731: 75, 13935: 76, 14168: 77, 14211: 78, 14247: 79, 14442: 80, 15528: 81, 19362: 82, 19610: 83, 21257: 84, 24173: 85, 26902: 86, 31260: 87, 31277: 88, 31825: 89, 37002: 90, 37055: 91, 37374: 92, 37570: 93, 37876: 94, 37980: 95, 39162: 96, 39885: 97,

In [12]:
# Two counters keyed by the question indices 1..qnum, all starting at zero
# (presumably total and correct response counts — confirm against later use).
qid_tot_cnt = dict.fromkeys(range(1, qnum + 1), 0)
qid_cor_cnt = dict.fromkeys(range(1, qnum + 1), 0)

In [None]:
def load_qs(path):
    """Load a mapping of integer keys to integer lists from a text file.

    Each non-blank line has the form ``key,v1;v2;...``: an integer key, a
    comma, then zero or more integers separated by semicolons. This is the
    format written by ``save_qs``, including the ``key,`` line it produces
    for an empty list. Anything after a second comma is ignored.

    Args:
        path: Path of the file to read.

    Returns:
        dict mapping each key (int) to its list of ints; the list is empty
        when the value field is empty.

    Raises:
        ValueError: If the key or any value is not a valid integer.
        IndexError: If a non-blank line contains no comma.
    """
    d = {}
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; int('') would raise.
                continue
            fields = line.split(",")
            q = int(fields[0])
            values_text = fields[1]
            # "".split(";") yields [''], which int() rejects, so an empty
            # value field has to be mapped to an empty list explicitly.
            d[q] = [int(s) for s in values_text.split(";")] if values_text else []
    return d


def save_qs(d, path):
    """Write a mapping to ``path`` as one ``key,v1;v2;...`` line per entry.

    Each value is an iterable whose items are converted with ``str`` and
    joined by semicolons. The file is opened in write mode, so any existing
    content is replaced.

    Args:
        d: Mapping from keys to iterables of values.
        path: Destination file path.
    """
    with open(path, "w") as out:
        for key, values in d.items():
            joined = ";".join(map(str, values))
            out.write(f"{key},{joined}\n")

In [None]:
def make_smaller_dataset_by_erasing_questions(df, erase_rate=0.5):
    random.seed(0)
    remain_qnum = int(qnum * (1 - erase_rate))
    df_erased = df[df.q_id <= remain_qnum]

    