# Project Codenet Mini Dataset Processing

Steps:
Look at the available problems
Identify accepted solutions using the metadata list of each problem in the data
Select and prune the available solutions

grab problem description

In [44]:
import os
import pandas as pd
import sys
import re
from html_to_markdown import convert_to_markdown

# Root of the local Project CodeNet Mini copy; the cells below read the
# `metadata/`, `data/` and `problem_descriptions/` folders beneath it.
codenet_dir = "datasets/ProjectCodeNetMini"
# Folder that receives the pickled table of sampled submissions.
data_output_dir = "data/human-written"
# Folder that receives the pickled table of problem descriptions.
problem_output_dir = "data/ai-code"

In [45]:
def insert_df(df: pd.DataFrame, row: list) -> pd.DataFrame:
  """Return a new frame with `row` inserted as the first row.

  The input frame is left untouched: the row is added to a copy, so a caller
  that keeps a reference to `df` never sees a half-updated, unsorted frame.

  Args:
    df: Frame to prepend to.
    row: Values for the new row, one per column of `df`, in column order.

  Returns:
    A copy of `df` whose first row is `row`; every pre-existing index label
    is shifted up by one.
  """
  result = df.copy()
  # Park the new row at label -1, then shift every label by one so that it
  # lands on 0 and sorts ahead of the existing rows.
  result.loc[-1] = row
  result.index = result.index + 1
  return result.sort_index()


def get_problem_data(problem: str, output_data: pd.DataFrame, n_samples: int = 10, random_state=None):
  """Sample accepted C submissions for one problem and load its description.

  Reads `<codenet_dir>/metadata/<problem>.csv`, keeps the rows whose
  `language` is "C" and whose `status` is "Accepted", and draws a random
  sample of them. The source of every sampled submission is read from
  `<codenet_dir>/data/<problem>/C/<submission_id>.c` and inserted into
  `output_data` as `[<problem>_<submission_id>, code, 'human']`.

  Args:
    problem: Problem identifier, e.g. "p00002".
    output_data: Frame collecting the `[id, code, label]` rows.
    n_samples: Maximum number of submissions to sample. When fewer accepted
      C submissions exist, all of them are used instead of raising.
    random_state: Forwarded to `DataFrame.sample`; pass a seed for a
      reproducible selection.

  Returns:
    A tuple `(output_data, description)`: the frame with the sampled rows
    inserted, and the problem's HTML description converted to Markdown.
  """
  problem_metadata = pd.read_csv(f"{codenet_dir}/metadata/{problem}.csv")

  is_accepted_c = (problem_metadata["language"] == "C") & (problem_metadata["status"] == "Accepted")
  problem_accepted_c = problem_metadata[is_accepted_c]

  print(f"Problem {problem} has {len(problem_accepted_c)} accepted C submissions")

  # `sample` raises ValueError when n exceeds the population, so cap it.
  sample_size = min(n_samples, len(problem_accepted_c))
  sample_problems = problem_accepted_c.sample(n=sample_size, weights=None, random_state=random_state)

  for raw_id in sample_problems["submission_id"]:
    # Normalise once so the file path and the row id are built the same way.
    submission_id = str(raw_id)
    source_path = f"{codenet_dir}/data/{problem}/C/{submission_id}.c"
    with open(source_path, 'r', encoding='UTF-8') as source_file:
      submission_code = source_file.read()
    output_data = insert_df(output_data, [f"{problem}_{submission_id}", submission_code, 'human'])

  # Read as UTF-8, like the submissions, rather than relying on the
  # platform's default encoding.
  description_path = f"{codenet_dir}/problem_descriptions/{problem}.html"
  with open(description_path, 'r', encoding='UTF-8') as description_file:
    problem_desc = description_file.read()
  cleaned_problem_desc = convert_to_markdown(problem_desc)

  return output_data, cleaned_problem_desc

In [46]:
# `os.listdir` returns entries in arbitrary order and includes stray files;
# keep only the per-problem directories and sort them so every run processes
# the problems in the same order.
problems = sorted(
  entry for entry in os.listdir(codenet_dir + "/data")
  if os.path.isdir(os.path.join(codenet_dir, "data", entry))
)
output_data = pd.DataFrame(columns=["id", "code", "label"])
problem_data = pd.DataFrame(columns=["question", "identifier"])

# Collect the sampled submissions and one description row per problem.
for problem in problems:
  output_data, problem_desc = get_problem_data(problem, output_data)
  problem_data = insert_df(problem_data, [problem_desc, problem])

print(output_data)
print(problem_data)

Problem p00002 has 49 accepted C submissions
Problem p02256 has 89 accepted C submissions
Problem p02400 has 72 accepted C submissions
Problem p02407 has 79 accepted C submissions
Problem p02971 has 80 accepted C submissions
Problem p03001 has 49 accepted C submissions
Problem p03242 has 121 accepted C submissions
Problem p04030 has 83 accepted C submissions
                   id                                               code  \
0   p04030_s178847378  #include <stdio.h>\n#include <string.h>\nint m...   
1   p04030_s194098957  #include <stdio.h>\n#include <string.h>\n\nint...   
2   p04030_s074357900  #include <stdio.h>\n#include <string.h>\n\nint...   
3   p04030_s195426041  #include <stdio.h>\n#include <string.h>\n\nint...   
4   p04030_s988693248  /* ex9_1\n   moka223711 */\n\n#include <stdio....   
..                ...                                                ...   
75  p00002_s341182640  #include<stdio.h>\n\nint main(){\n\tint a,b,c,...   
76  p00002_s719522901  #include

In [47]:
# `to_pickle` does not create missing parent folders, so make sure both
# output directories exist before writing.
os.makedirs(data_output_dir, exist_ok=True)
os.makedirs(problem_output_dir, exist_ok=True)
output_data.to_pickle(data_output_dir + "/codenet.code.pkl")
problem_data.to_pickle(problem_output_dir + "/codenet-questions.pbm.pkl")

In [18]:
# Sanity check on a single problem: list the accepted submissions whose
# source file extension is "c".
test_metadata = pd.read_csv(f"{codenet_dir}/metadata/p00002.csv")
c_data = test_metadata.loc[test_metadata["filename_ext"] == "c"]
accepted_c = c_data.loc[c_data["status"] == "Accepted"]
print(accepted_c)

    submission_id problem_id     user_id        date language  \
203    s834386135     p00002  u011621222  1556575141        C   
204    s713018173     p00002  u950683603  1405070347        C   
207    s507007344     p00002  u195954908  1406095867        C   
208    s686669942     p00002  u585391547  1408909602        C   
211    s956423205     p00002  u810660681  1411968723        C   
216    s440276090     p00002  u369381079  1424528453        C   
219    s239565507     p00002  u572046143  1432216544        C   
222    s325887322     p00002  u648717067  1432732758        C   
224    s004013345     p00002  u720827585  1434959820        C   
226    s062253940     p00002  u650859566  1436834847        C   
232    s712819818     p00002  u135516417  1443944516        C   
245    s567015739     p00002  u569098935  1460792331        C   
247    s144939890     p00002  u626943287  1463201133        C   
248    s452723612     p00002  u544325767  1463374247        C   
250    s642435313     p00