# Project Codenet Mini Dataset Processing

Steps:
look at available problems
Identify accecpted solutions via problem list per problem in data
select and prune available solutions

grab problem description

In [38]:
import os
import pandas as pd
import sys
import re
from html_to_markdown import convert_to_markdown
from alive_progress import alive_bar
from tqdm.notebook import tqdm


# codenet_dir = "datasets/ProjectCodeNetMini"
codenet_dir = "datasets/Project_CodeNet"
data_output_dir = "data/human-written"
problem_output_dir = "data/ai-code"

In [None]:
def insert_df(df: pd.DataFrame, row: list): 
  df.loc[-1] = row
  df.index = df.index + 1  # shifting index
  df = df.sort_index()  # sorting by index
  return df

def chunks(lst, n):
  """Yield successive n-sized chunks from lst."""
  for i in range(0, len(lst), n):
      yield lst[i:i + n]

def get_accepted_solutions(problem: str):
  problem_metadata = pd.read_csv(codenet_dir + "/metadata/" + problem + ".csv")
  
  problem_c_metadata = problem_metadata[problem_metadata["language"] == "C"]
  problem_accepted_c = problem_c_metadata[problem_c_metadata["status"] == "Accepted"]
  return problem_accepted_c

def get_problem_question(problem):
  try:
    problem_desc = open("datasets/ProjectCodeNetMini/problem_descriptions/" + problem + ".html", 'r', encoding='utf-8').read()
    cleaned_problem_desc = convert_to_markdown(problem_desc)
    return cleaned_problem_desc
  except Exception as e:
    sys.stderr.write(problem + "\n")
    return None

def get_problem_data(problem: str, output_data: pd.DataFrame):
  problem_desc = get_problem_question(problem)
  if problem_desc == None:
    return output_data, None

  problem_accepted_c = get_accepted_solutions(problem)

  # print(f"Problem {problem} has {len(problem_accepted_c)} accepted C submissions")

  if(len(problem_accepted_c) <= 10):
    sample_problems = problem_accepted_c
  else:
    sample_problems = problem_accepted_c.sample(n=10, weights=None)
  
  for _, row in sample_problems.iterrows():
    submission_id = row["submission_id"]
    submission_code = open(codenet_dir + "/data/" + problem + "/C/" + str(submission_id) + ".c", 'r', encoding='utf-8').read()
    output_data = insert_df(output_data, [problem + "_" + submission_id, submission_code, 'human'])

  return output_data, problem_desc

def assemble_data(problems: list[str]):
  output_data = pd.DataFrame(columns=["id", "code", "actual label"])
  problem_data = pd.DataFrame(columns=["question", "identifier"])

  for problem in problems:
    output_data, problem_desc = get_problem_data(problem, output_data)
    if(problem_desc == None):
      continue
    problem_data = insert_df(problem_data, [problem_desc, problem])

  # print(output_data)
  # print(problem_data)
  return output_data, problem_data

In [30]:
problems_full = os.listdir(codenet_dir + "/data")
problem_split = list(chunks(problems_full, 40))
print(problem_split)


[['p00000', 'p00001', 'p00002', 'p00003', 'p00004', 'p00005', 'p00006', 'p00007', 'p00008', 'p00009', 'p00010', 'p00011', 'p00012', 'p00013', 'p00014', 'p00015', 'p00016', 'p00017', 'p00018', 'p00019', 'p00020', 'p00021', 'p00022', 'p00023', 'p00024', 'p00025', 'p00026', 'p00027', 'p00028', 'p00029', 'p00030', 'p00031', 'p00032', 'p00033', 'p00034', 'p00035', 'p00036', 'p00037', 'p00038', 'p00039'], ['p00040', 'p00041', 'p00042', 'p00043', 'p00044', 'p00045', 'p00046', 'p00047', 'p00048', 'p00049', 'p00050', 'p00051', 'p00052', 'p00053', 'p00054', 'p00055', 'p00056', 'p00057', 'p00058', 'p00059', 'p00060', 'p00061', 'p00062', 'p00063', 'p00064', 'p00065', 'p00066', 'p00067', 'p00068', 'p00069', 'p00070', 'p00071', 'p00072', 'p00073', 'p00074', 'p00075', 'p00076', 'p00077', 'p00078', 'p00079'], ['p00080', 'p00081', 'p00082', 'p00083', 'p00084', 'p00085', 'p00086', 'p00087', 'p00088', 'p00089', 'p00090', 'p00091', 'p00092', 'p00093', 'p00094', 'p00095', 'p00096', 'p00097', 'p00098', 'p00

In [31]:
problems_cleaned = [x for x in tqdm(problems_full) if len(get_accepted_solutions(x)) >= 10 and get_problem_question(x) is not None]

p02479
p02480
p02481
p02482
p02483
p02484
p02485
p02486
p02487
p02488
p02489
p02490
p02491
p02492
p02493
p02494
p02495
p02496
p02497
p02498
p02499
p02506
p02510
p02511
p02512
p02523
p02524
p02525
p02526
p02527
p02528
p02529
p02530
p02531
p02532
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3366/3366 [01:26<00:00, 38.85it/s]


In [36]:
print(f"{len(problems_full)} -> {len(problems_cleaned)}")
problem_split = list(chunks(problems_cleaned, 40))
print(len(problem_split))


3366 -> 1749
44


In [40]:
for idx, problems in tqdm(enumerate(problem_split)):
  output_data, problem_data = assemble_data(problems)
  output_data.to_pickle(data_output_dir + f"/codenet-full-{idx + 1}.code.pkl")
  problem_data.to_pickle(problem_output_dir + f"/codenet-full-{idx + 1}.pbm.pkl")


0it [00:00, ?it/s]

  0%|                                                                                                                                                                                                                                            | 0/6 [25:41<?, ?it/s]


In [None]:
# single dataset
problems = os.listdir(codenet_dir + "/data")
output_data, problem_data = assemble_data(problems)

output_data.to_pickle(data_output_dir + "/codenet.code.pkl")
problem_data.to_pickle(problem_output_dir + "/codenet-questions.pbm.pkl")

In [18]:
test_metadata = pd.read_csv(codenet_dir + "/metadata/p00002.csv")
c_data = test_metadata[test_metadata["filename_ext"] == "c"]
accepted_c = c_data[c_data['status'] == "Accepted"]
print(accepted_c)

    submission_id problem_id     user_id        date language  \
203    s834386135     p00002  u011621222  1556575141        C   
204    s713018173     p00002  u950683603  1405070347        C   
207    s507007344     p00002  u195954908  1406095867        C   
208    s686669942     p00002  u585391547  1408909602        C   
211    s956423205     p00002  u810660681  1411968723        C   
216    s440276090     p00002  u369381079  1424528453        C   
219    s239565507     p00002  u572046143  1432216544        C   
222    s325887322     p00002  u648717067  1432732758        C   
224    s004013345     p00002  u720827585  1434959820        C   
226    s062253940     p00002  u650859566  1436834847        C   
232    s712819818     p00002  u135516417  1443944516        C   
245    s567015739     p00002  u569098935  1460792331        C   
247    s144939890     p00002  u626943287  1463201133        C   
248    s452723612     p00002  u544325767  1463374247        C   
250    s642435313     p00