# Project Codenet Mini Dataset Processing

Steps:
Look at the available problems
Identify accepted solutions for each problem via its metadata list in the data
Select and prune the available solutions

grab problem description

In [None]:
import os
import pandas as pd
import sys
import re
from html_to_markdown import convert_to_markdown
from alive_progress import alive_bar
from tqdm.notebook import tqdm


# codenet_dir = "datasets/ProjectCodeNetMini"
codenet_dir = "datasets/Project_CodeNet"
data_output_dir = "data/human-written"
problem_output_dir = "data/ai-code"

In [None]:
def insert_df(df: pd.DataFrame, row: list):
  """Put ``row`` in front of the rows of ``df`` and return the result.

  The row is written under index label -1, every index label is then
  increased by one, and a copy sorted by index is returned. The label write
  and the shift are applied to the ``df`` object that was passed in; only the
  final sort produces a new frame, so callers should use the return value.
  """
  df.loc[-1] = row
  shifted_labels = df.index + 1
  df.index = shifted_labels
  return df.sort_index()

def chunks(lst, n):
  """Lazily produce consecutive slices of ``lst`` holding at most ``n`` items each.

  The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
  """
  starts = range(0, len(lst), n)
  yield from (lst[start:start + n] for start in starts)

def get_accepted_solutions(problem: str):
  """Return the metadata rows of ``problem`` that are accepted C submissions.

  Reads ``<codenet_dir>/metadata/<problem>.csv`` and keeps the rows whose
  ``language`` column equals "C" and whose ``status`` column equals "Accepted".
  """
  metadata = pd.read_csv(codenet_dir + "/metadata/" + problem + ".csv")

  is_c = metadata["language"] == "C"
  is_accepted = metadata["status"] == "Accepted"
  return metadata[is_c & is_accepted]

def get_problem_question(problem):
  """Return the description of ``problem`` converted to Markdown, or None.

  The HTML description is read from
  ``<codenet_dir>/problem_descriptions/<problem>.html`` and passed through
  ``convert_to_markdown``.

  Returns:
    The converted text, or None when the file cannot be read or converted.
    In that case the problem id and the failure reason are written to stderr.
  """
  # Resolve against the configured dataset root so descriptions come from the
  # same dataset as the metadata and the submission sources.
  description_path = codenet_dir + "/problem_descriptions/" + problem + ".html"
  try:
    with open(description_path, 'r', encoding='utf-8') as description_file:
      problem_desc = description_file.read()
    return convert_to_markdown(problem_desc)
  except Exception as e:
    # Deliberately best-effort: one unreadable description must not abort a
    # whole batch, but the reason is reported instead of being discarded.
    sys.stderr.write(f"{problem}: {e}\n")
    return None

def get_problem_data(problem: str, output_data: pd.DataFrame, max_samples: int = 10):
  """Add accepted C submissions of ``problem`` to ``output_data``.

  Args:
    problem: Problem id; used to locate its description, metadata and sources.
    output_data: Frame receiving one ``[id, code, 'human']`` row per submission.
    max_samples: Upper bound on submissions taken per problem. When more are
      accepted, ``max_samples`` of them are drawn at random.

  Returns:
    ``(output_data, problem_desc)``. When the description is unavailable the
    frame is returned unchanged together with None.
  """
  problem_desc = get_problem_question(problem)
  if problem_desc is None:
    return output_data, None

  problem_accepted_c = get_accepted_solutions(problem)

  if len(problem_accepted_c) <= max_samples:
    sample_problems = problem_accepted_c
  else:
    sample_problems = problem_accepted_c.sample(n=max_samples)

  for submission_id in sample_problems["submission_id"]:
    # Stringify once so the file path and the row id are built the same way.
    submission_id = str(submission_id)
    source_path = codenet_dir + "/data/" + problem + "/C/" + submission_id + ".c"
    with open(source_path, 'r', encoding='utf-8') as source_file:
      submission_code = source_file.read()
    output_data = insert_df(output_data, [problem + "_" + submission_id, submission_code, 'human'])

  return output_data, problem_desc

def assemble_data(problems: list[str]):
  """Build the submission frame and the question frame for ``problems``.

  Returns:
    ``(output_data, problem_data)``: the first has columns ``id``, ``code`` and
    ``actual label``; the second has columns ``question`` and ``identifier``.
    Problems for which ``get_problem_data`` yields no description are left out
    of both frames.
  """
  output_data = pd.DataFrame(columns=["id", "code", "actual label"])
  problem_data = pd.DataFrame(columns=["question", "identifier"])

  for problem in problems:
    output_data, problem_desc = get_problem_data(problem, output_data)
    if problem_desc is None:
      continue
    problem_data = insert_df(problem_data, [problem_desc, problem])

  return output_data, problem_data

In [None]:
# Gather the .c files in ../bin/ whose name contains `identifier` and store
# them, labelled 'human', in a single pickle under ../data/prepared/.
all_code = os.listdir('../bin/')
identifier = "dan"
output_name = "Daniel-Code"
person_code = [code for code in all_code if code.endswith('.c') and identifier in code]
print(person_code)

output_data = pd.DataFrame(columns=["id", "code", "actual label"])
for code in person_code:
  # Close each source file as soon as it has been read.
  with open("../bin/" + code, 'r', encoding='utf-8') as code_file:
    file_code = code_file.read()
  output_data = insert_df(output_data, [code, file_code, 'human'])

output_data.to_pickle('../data/prepared/' + output_name + '.code.pkl')


In [None]:
# Every entry under <codenet_dir>/data is treated as a problem id; show them
# grouped into batches of 40.
problems_full = os.listdir(f"{codenet_dir}/data")
problem_split = [*chunks(problems_full, 40)]
print(problem_split)


In [None]:
# Keep a problem only when it has at least 10 accepted C submissions and its
# description can be loaded; the description is not looked up for problems
# that already fail the submission count.
problems_cleaned = [
  problem_id
  for problem_id in tqdm(problems_full)
  if not (
    len(get_accepted_solutions(problem_id)) < 10
    or get_problem_question(problem_id) is None
  )
]

In [None]:
# Report how many problems survived the filter, then batch the survivors in
# groups of 40 and show the number of batches.
print(len(problems_full), "->", len(problems_cleaned))
problem_split = [*chunks(problems_cleaned, 40)]
print(len(problem_split))


In [None]:
# Create the output directories up front so a missing folder cannot make
# to_pickle fail after a whole batch has already been processed.
os.makedirs(data_output_dir, exist_ok=True)
os.makedirs(problem_output_dir, exist_ok=True)

# enumerate() has no length, so tqdm is given the batch count explicitly;
# without it the progress bar shows neither a total nor an ETA.
for idx, problems in tqdm(enumerate(problem_split), total=len(problem_split)):
  output_data, problem_data = assemble_data(problems)
  # Output files are numbered from 1.
  output_data.to_pickle(data_output_dir + f"/codenet-full-{idx + 1}.code.pkl")
  problem_data.to_pickle(problem_output_dir + f"/codenet-full-{idx + 1}.pbm.pkl")


In [None]:
# single dataset: run every entry under <codenet_dir>/data through
# assemble_data in one pass and write one pickle per frame
problems = os.listdir(f"{codenet_dir}/data")
output_data, problem_data = assemble_data(problems)

output_data.to_pickle(f"{data_output_dir}/codenet.code.pkl")
problem_data.to_pickle(f"{problem_output_dir}/codenet-questions.pbm.pkl")

In [None]:
# Scratch check on problem p00002: print the metadata rows whose file
# extension is "c" and whose status is "Accepted".
test_metadata = pd.read_csv(f"{codenet_dir}/metadata/p00002.csv")
c_data = test_metadata.loc[test_metadata["filename_ext"] == "c"]
accepted_c = c_data.loc[c_data["status"] == "Accepted"]
print(accepted_c)