In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [92]:
import os
import json

output_folder = 'output'

all_records = []

# Traverse each folder in the output directory
for folder_name in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, folder_name)
    if os.path.isdir(folder_path):
        # Find all jsonl files in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        record = json.loads(line)
                        all_records.append(record)

In [93]:
def unpack_records(records):
    unpacked = []
    for rec in records:
        flat = {}
        for k, v in rec.items():
            if isinstance(v, dict):
                for sub_k, sub_v in v.items():
                    flat[f"{k}_{sub_k}"] = sub_v
            else:
                flat[k] = v
        unpacked.append(flat)
    return unpacked

unpacked_records = unpack_records(all_records)
df = pd.DataFrame(unpacked_records)

In [94]:
df.head()

Unnamed: 0,dataset_split,id,repo_name,repo_url,repo_commit_sha,repo_license,file_path,file_language,method_name,method_qualified_name,...,code_tokens,metrics_cyclomatic_complexity,metrics_n_ast_nodes,metrics_ast_depth,metrics_n_identifiers,metrics_vocab_size,metrics_n_whitespaces,metrics_n_words,metrics_nloc,metrics_token_counts
0,train,elasticsearch@264a885:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,264a8852cbd870e452846b7beb96e325b13b17ce,other,server/src/main/java/org/elasticsearch/cluster...,Java,removeExistingIndexBlocks,org.elasticsearch.cluster.routing.allocation.D...,...,"[void, removeExistingIndexBlocks, private, (, ...",3,314,15,41,70,402,91,31,213
1,train,elasticsearch@264a885:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,264a8852cbd870e452846b7beb96e325b13b17ce,other,server/src/main/java/org/elasticsearch/cluster...,Java,cleanUpRemovedNodes,org.elasticsearch.cluster.routing.allocation.D...,...,"[void, cleanUpRemovedNodes, private, static, (...",3,70,11,6,25,82,22,7,48
2,train,elasticsearch@264a885:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,264a8852cbd870e452846b7beb96e325b13b17ce,other,server/src/main/java/org/elasticsearch/cluster...,Java,isDedicatedFrozenNode,org.elasticsearch.cluster.routing.allocation.D...,...,"[boolean, isDedicatedFrozenNode, private, stat...",2,55,8,3,20,68,18,7,37
3,train,elasticsearch@264a885:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,264a8852cbd870e452846b7beb96e325b13b17ce,other,server/src/main/java/org/elasticsearch/node/No...,Java,prepareConstruction,org.elasticsearch.node.NodeConstruction#prepar...,...,"[NodeConstruction, prepareConstruction, static...",3,369,14,35,76,701,178,45,265
4,train,elasticsearch@264a885:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,264a8852cbd870e452846b7beb96e325b13b17ce,other,server/src/main/java/org/elasticsearch/node/No...,Java,injector,org.elasticsearch.node.NodeConstruction#injector,...,"[Injector, injector, (, ), {, }, return, injec...",1,14,5,1,8,17,4,3,9


In [95]:
len(df)

500000

In [96]:
df.columns

Index(['dataset_split', 'id', 'repo_name', 'repo_url', 'repo_commit_sha',
       'repo_license', 'file_path', 'file_language', 'method_name',
       'method_qualified_name', 'method_start_line', 'method_end_line',
       'method_signature', 'method_original_code', 'method_doc_comment',
       'code_tokens', 'metrics_cyclomatic_complexity', 'metrics_n_ast_nodes',
       'metrics_ast_depth', 'metrics_n_identifiers', 'metrics_vocab_size',
       'metrics_n_whitespaces', 'metrics_n_words', 'metrics_nloc',
       'metrics_token_counts'],
      dtype='object')

In [97]:
df = df.drop(columns=['repo_license'])

In [98]:
df.columns

Index(['dataset_split', 'id', 'repo_name', 'repo_url', 'repo_commit_sha',
       'file_path', 'file_language', 'method_name', 'method_qualified_name',
       'method_start_line', 'method_end_line', 'method_signature',
       'method_original_code', 'method_doc_comment', 'code_tokens',
       'metrics_cyclomatic_complexity', 'metrics_n_ast_nodes',
       'metrics_ast_depth', 'metrics_n_identifiers', 'metrics_vocab_size',
       'metrics_n_whitespaces', 'metrics_n_words', 'metrics_nloc',
       'metrics_token_counts'],
      dtype='object')

### Data Engineering

In [99]:
# Standardize repo names
df['repo_name'] = df['repo_name'].str.strip().str.lower()


# Standardize file path (forward slashes, no leading/trailing)
df['file_path'] = df['file_path'].str.replace('\\', '/')
df['file_path'] = df['file_path'].str.strip('/')

#### Cleaning Method Name & Signature

In [100]:
# Remove methods with invalid names
df = df[df['method_name'].str.strip() != '']
df = df[~df['method_name'].str.contains(r'[^\w_$]', regex=True, na=False)]
df=df[df['method_name'].str.len()>1]

# Remove excessive whitespace from method signatures
df['method_signature'] = df['method_signature'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [101]:
len(df)

499910

#### Method Length & Line Number Filtering

In [102]:
# Validate line numbers
df = df[(df['method_start_line'] > 0) & (df['method_end_line'] >= df['method_start_line'])]

# Remove extremely short or long methods
df = df[(df['metrics_nloc'] >= 3) & (df['metrics_nloc'] <= 100)]

#### Removing Codes With Parsing Errors 
This also removes empty codes or codes with only comments

In [103]:
df=df[df["metrics_cyclomatic_complexity"]>0]

In [104]:
len(df)

480229

#### Deduplication

In [105]:
duplicate_codes = df[df.duplicated(subset=["repo_name", "file_path", "method_signature"], keep=False)]
duplicate_codes= duplicate_codes.sort_values(by=["repo_name", "file_path", "method_signature"])


In [106]:
len(duplicate_codes)

383868

In [107]:
df = df.sort_values(by=["repo_commit_sha"])
df = df.drop_duplicates(subset=["repo_name", "file_path", "method_signature"], keep="last")

In [108]:
len(df)

185012

In [109]:
df.head()

Unnamed: 0,dataset_split,id,repo_name,repo_url,repo_commit_sha,file_path,file_language,method_name,method_qualified_name,method_start_line,...,code_tokens,metrics_cyclomatic_complexity,metrics_n_ast_nodes,metrics_ast_depth,metrics_n_identifiers,metrics_vocab_size,metrics_n_whitespaces,metrics_n_words,metrics_nloc,metrics_token_counts
410697,train,spring-boot@009bd44:core/spring-boot/src/main/...,spring-boot,https://github.com/spring-projects/spring-boot,009bd441f6194a2e70c4d602b248279863ad4909,core/spring-boot/src/main/java/org/springframe...,Java,ifNotEmpty,org.springframework.boot.DefaultPropertiesProp...,72,...,"[void, ifNotEmpty, public, static, (, ,, ), {,...",3,80,11,7,31,32,24,6,52
238208,train,elasticsearch@00d1041:x-pack/plugin/core/src/m...,elasticsearch,https://github.com/elastic/elasticsearch,00d104137379f6236648958fdd430fbea98351eb,x-pack/plugin/core/src/main/java/org/elasticse...,Java,loadVersionedResourceUTF8,org.elasticsearch.xpack.core.template.Resource...,17,...,"[loadVersionedResourceUTF8, static, (, ,, ,, ,...",1,62,9,7,25,29,19,3,41
238209,train,elasticsearch@00d1041:x-pack/plugin/core/src/m...,elasticsearch,https://github.com/elastic/elasticsearch,00d104137379f6236648958fdd430fbea98351eb,x-pack/plugin/core/src/main/java/org/elasticse...,Java,loadVersionedResourceUTF8,org.elasticsearch.xpack.core.template.Resource...,21,...,"[loadVersionedResourceUTF8, static, (, ,, ,, ,...",2,130,12,16,41,160,42,15,88
238210,train,elasticsearch@00d1041:x-pack/plugin/core/src/m...,elasticsearch,https://github.com/elastic/elasticsearch,00d104137379f6236648958fdd430fbea98351eb,x-pack/plugin/core/src/main/java/org/elasticse...,Java,loadResource,org.elasticsearch.xpack.core.template.Resource...,37,...,"[String, loadResource, public, static, (, ,, )...",2,113,14,11,40,111,38,8,76
147072,train,elasticsearch@01dca55:server/src/main/java/org...,elasticsearch,https://github.com/elastic/elasticsearch,01dca5501c5e17b0d4c3d428fd2ad835478477a9,server/src/main/java/org/elasticsearch/action/...,Java,checkBlock,org.elasticsearch.action.admin.indices.setting...,68,...,"[ClusterBlockException, checkBlock, protected,...",1,53,9,10,23,41,18,5,37


### Dataset Stratified Sampling

In [110]:
df['repo_name'].nunique()

6

In [111]:
def sample_from_large_repos(df, n_train=20000, n_test=5000, random_state=42):
    
    # Check distribution across repos
    print("Repository distribution:")
    print(df['repo_name'].value_counts())
    print()
    
    # Quality filtering
    quality_df = df[
        (df['metrics_cyclomatic_complexity'].between(2, 15)) &(df['metrics_n_ast_nodes'] > 10)].copy()
    
    print(f"After quality filter: {len(quality_df):,} methods")
    
    # Sample proportionally from each repo
    sampled_dfs = []
    
    for repo in quality_df['repo_name'].unique():
        repo_df = quality_df[quality_df['repo_name'] == repo]
        
        # Number of samples from this repo (proportional))
        repo_fraction = len(repo_df) / len(quality_df)
        n_samples_repo = int((n_train + n_test) * repo_fraction)
        
        # Sample from this repo
        if n_samples_repo > 0 and len(repo_df) >= n_samples_repo:
            repo_sample = repo_df.sample(n=n_samples_repo, random_state=random_state)
            sampled_dfs.append(repo_sample)
            print(f"  {repo}: {n_samples_repo:,} methods sampled")
    
    final_df = pd.concat(sampled_dfs, ignore_index=True)
    
    # Shuffle and split into train/test
    final_df = final_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    # Split ensuring methods from all repos in both sets
    train_df, test_df = train_test_split(final_df,test_size=n_test/(n_train + n_test),random_state=random_state,stratify=final_df['repo_name'])
    
    # splits
    train_df['dataset_split'] = 'train'
    test_df['dataset_split'] = 'test'
    result_df = pd.concat([train_df, test_df], ignore_index=True)
    
    
    print(f"\nFinal dataset:")
    print(f"  Train: {len(train_df):,} methods")
    print(f"  Test:  {len(test_df):,} methods")
    print(f"\nRepository representation in train/test:")
    for repo in result_df['repo_name'].unique():
        train_count = len(train_df[train_df['repo_name'] == repo])
        test_count = len(test_df[test_df['repo_name'] == repo])
        print(f"  {repo}: {train_count:,} train, {test_count:,} test")
    
    return result_df

In [112]:
final_df = sample_from_large_repos(df, n_train=20000, n_test=5000)
final_df.to_csv('java_methods_dataset.csv', index=False)

Repository distribution:
repo_name
spring-boot             95401
elasticsearch           85638
java-design-patterns     3807
hello-algo                103
mall                       62
advanced-java               1
Name: count, dtype: int64

After quality filter: 41,555 methods
  spring-boot: 8,489 methods sampled
  elasticsearch: 15,975 methods sampled
  java-design-patterns: 479 methods sampled
  mall: 23 methods sampled
  hello-algo: 32 methods sampled

Final dataset:
  Train: 19,998 methods
  Test:  5,000 methods

Repository representation in train/test:
  spring-boot: 6,791 train, 1,698 test
  elasticsearch: 12,780 train, 3,195 test
  java-design-patterns: 383 train, 96 test
  hello-algo: 26 train, 6 test
  mall: 18 train, 5 test
