## Java method naming script

## Setup colab environment

In [None]:
# Mount dDrive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Create project directory structure

In [None]:
# Create project folder inside Google Drive
!mkdir -p /content/drive/MyDrive/method_naming_project
!mkdir -p /content/drive/MyDrive/method_naming_project/data
!mkdir -p /content/drive/MyDrive/method_naming_project/models
!mkdir -p /content/drive/MyDrive/method_naming_project/datasets
!mkdir -p /content/drive/MyDrive/method_naming_project/scripts
!mkdir -p /content/drive/MyDrive/method_naming_project/output

# Use this as project root
import os
os.environ['PROJECT_ROOT'] = '/content/drive/MyDrive/method_naming_project'
os.chdir(os.environ['PROJECT_ROOT'])

print("Project root:", os.getcwd())


Project root: /content/drive/MyDrive/method_naming_project


## Step1. Creating the Dataset of Java Methods

### Step 1.1: Install required dependencies

In [None]:

# install_dependencies.py
#!/usr/bin/env python3
"""
Installing all requirement dependencies
"""
import subprocess
import sys

def install_packages():
    packages = [
        'tree-sitter',
        'tree_sitter_java',
        'gitpython',
        'pandas',
        'tqdm',
        'transformers',
        'torch',
        'datasets'
    ]

    for package in packages:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

if __name__ == "__main__":
    install_packages()



Installing tree-sitter...
Installing tree_sitter_java...
Installing gitpython...
Installing pandas...
Installing tqdm...
Installing transformers...
Installing torch...
Installing datasets...


### Step 1.2: Load repository list from download CSV

In [4]:
# Reading the downloaded CSV dataset from GitHub and preview it
import os
import pandas as pd

os.environ['PROJECT_ROOT'] = '/content/drive/MyDrive/method_naming_project'
os.chdir(os.environ['PROJECT_ROOT'])

PROJECT_ROOT = os.environ["PROJECT_ROOT"]
DATA_DIR     = os.path.join(PROJECT_ROOT, "data")
DATASETS_DIR = os.path.join(PROJECT_ROOT, "datasets")
MODELS_DIR   = os.path.join(PROJECT_ROOT, "models")
SCRIPTS_DIR  = os.path.join(PROJECT_ROOT, "scripts")

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DATASETS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(SCRIPTS_DIR, exist_ok=True)

# The path to the CSV file downloaded from seart-ghs
repos_csv_path = os.path.join(DATASETS_DIR, "github_repos.csv")

# Check the list name and data sample before starting build a load repo method
repos_df = pd.read_csv(repos_csv_path, engine="python")
print("Repos list previewÔºö")
print("="*20)
print(repos_df.head())
print("="*20)
print("List nameÔºö", repos_df.columns.tolist())


Repos list previewÔºö
    id                          name  isFork  commits  branches  releases  \
0   56  dustin/java-memcached-client   False      979         6         0   
1   61     davidb/scala-maven-plugin   False     1246         3         0   
2   82                   tcurdt/jdeb   False     1237         7         0   
3   97             rictic/code_swarm   False      381         4         0   
4  165            tcurdt/jdependency   False      654         3         2   

   forks mainLanguage defaultBranch                          license  ...  \
0    429         Java        master                      MIT License  ...   
1    157         Java        master                    The Unlicense  ...   
2    312         Java        master               Apache License 2.0  ...   
3     62         Java        master  GNU General Public License v3.0  ...   
4     28         Java        master               Apache License 2.0  ...   

                                             metrics

List nameÔºö ['id', 'name', 'isFork', 'commits', 'branches', 'releases', 'forks', 'mainLanguage', 'defaultBranch', 'license', 'homepage', 'watchers', 'stargazers', 'contributors', 'size', 'createdAt', 'pushedAt', 'updatedAt', 'totalIssues', 'openIssues', 'totalPullRequests', 'openPullRequests', 'blankLines', 'codeLines', 'commentLines', 'metrics', 'lastCommit', 'lastCommitSHA', 'hasWiki', 'isArchived', 'isDisabled', 'isLocked', 'languages', 'labels', 'topics']



### Step 1.3: Filter repositories based on assignment criteria

In [None]:
# Build a method to load valid repos url

import pandas as pd
import json

def load_repository_list(csv_file):
    """Load repository list exported by SEART-GHS, filter it, and construct clone URLs."""

    print(f"Loading repository CSV: {csv_file}")
    df = pd.read_csv(csv_file)

    print("CSV Columns:", df.columns.tolist())

    # ---------- Validate required column ----------
    if "name" not in df.columns:
        raise ValueError("CSV file must contain column 'name' (repo name like 'owner/repo').")

    # Make a copy so the original dataframe is untouched
    df = df.copy()

    # ---------- Language Filtering ----------
    # First priority: mainLanguage
    if "mainLanguage" in df.columns:
        df = df[df["mainLanguage"].astype(str).str.lower() == "java"]

    # Second option: languages column (may contain JSON dict)
    elif "languages" in df.columns:
        def contains_java(val):
            try:
                if isinstance(val, str):
                    val = json.loads(val)  # convert JSON str to dict
                return isinstance(val, dict) and ("Java" in val)
            except:
                return False
        df = df[df["languages"].apply(contains_java)]

    # ---------- Commit Filtering ----------
    if "commits" in df.columns:
        df = df[df["commits"].astype(int) >= 100]

    # ---------- Contributors Filtering ----------
    if "contributors" in df.columns:
        df = df[df["contributors"].astype(int) >= 10]

    # ---------- Non-Fork Filtering ----------
    if "isFork" in df.columns:
        df = df[df["isFork"] == False]

    # Reset index after filtering
    df = df.reset_index(drop=True)

    # ---------- Construct GitHub Clone URLs ----------
    df["repo_url"] = "https://github.com/" + df["name"].astype(str).str.strip() + ".git"

    print(f"Number of valid repos: {len(df)}")
    print(df[["name", "repo_url"]].head())

    # Return list of clone URLs
    return df["repo_url"].tolist()


In [None]:
# Check the load_repository_list function return
load_repository_list(repos_csv_path);

Loading repository CSV: /content/drive/MyDrive/method_naming_project/datasets/github_repos.csv
CSV Columns: ['id', 'name', 'isFork', 'commits', 'branches', 'releases', 'forks', 'mainLanguage', 'defaultBranch', 'license', 'homepage', 'watchers', 'stargazers', 'contributors', 'size', 'createdAt', 'pushedAt', 'updatedAt', 'totalIssues', 'openIssues', 'totalPullRequests', 'openPullRequests', 'blankLines', 'codeLines', 'commentLines', 'metrics', 'lastCommit', 'lastCommitSHA', 'hasWiki', 'isArchived', 'isDisabled', 'isLocked', 'languages', 'labels', 'topics']
Number of valid repos: 14786
                           name  \
0  dustin/java-memcached-client   
1     davidb/scala-maven-plugin   
2                   tcurdt/jdeb   
3             rictic/code_swarm   
4            tcurdt/jdependency   

                                            repo_url  
0  https://github.com/dustin/java-memcached-clien...  
1   https://github.com/davidb/scala-maven-plugin.git  
2                 https://github.co

### Step 1.4: Clone Valid GitHub Repository list

In [None]:
# Build Clone repo function
import subprocess

def clone_repository(url, target_dir):
    repo_name = url.split("/")[-1].replace(".git", "")
    dst = os.path.join(target_dir, repo_name)

    if os.path.exists(dst):
        print(f"[Skip] already exists ‚Üí {repo_name}")
        return dst

    print(f"[Clone] {url}")

    try:
        result = subprocess.run(
            ["git", "clone", "--depth", "1", url, dst],
            capture_output=True,
            text=True,
            timeout=300
        )
    except subprocess.TimeoutExpired:
        print(f"[Timeout] {url}")
        return None

    if result.returncode != 0:
        print(f"[Error] cloning failed: {result.stderr[:200]}")
        return None

    print(f"[Ok] cloned successfully‚Üí {repo_name}")
    return dst


### Step 1.5: Setup Tree-sitter for Java parsingParsing Java methods

In [None]:
from tree_sitter import Language, Parser
import tree_sitter_java as tsjava
import graphviz
import re



# def extract_methods_from_file(filepath):
#     with open(filepath, "r", encoding="utf8", errors="ignore") as f:
#         code = f.read()
#     tree = parser.parse(code.encode("utf8"))
#     root = tree.root_node

#     methods = []

#     def visit(node):
#         if node.type == "method_declaration":
#             # method name
#             name_node = node.child_by_field_name("name")
#             if name_node:
#                 method_name = code[name_node.start_byte:name_node.end_byte]
#                 method_body = code[node.start_byte:node.end_byte]
#                 methods.append((method_name, method_body))

#         for child in node.children:
#             visit(child)

#     visit(root)
#     return methods

JAVA_LANGUAGE = Language(tsjava.language())
parser = Parser(JAVA_LANGUAGE)

def extract_methods_with_treesitter(java_file_path):
    """Extract Java methods from a file using Tree-Sitter and tree-sitter-java"""
    methods = []

    try:
        with open(java_file_path, "r", encoding="utf-8", errors="ignore") as f:
            code = f.read()

        tree = parser.parse(bytes(code, "utf8"))
        root = tree.root_node

        # ------------------------------------------------------------------
        # Method mode types in Tree-sitter JavaÔºö
        # - method_declaration
        # - constructor_declaration
        # ------------------------------------------------------------------
        method_types = ["method_declaration", "constructor_declaration"]

        def get_text(node):
            return code[node.start_byte:node.end_byte]

        # Traverse AST
        def traverse(node):
            if node.type in method_types:
                name_node = node.child_by_field_name("name")
                body_node = node.child_by_field_name("body")

                if name_node and body_node:
                    method_name = get_text(name_node)
                    full_method = get_text(node)

                    methods.append({
                        "name": method_name,
                        "body": full_method,
                        "file": java_file_path,
                    })

            for child in node.children:
                traverse(child)

        traverse(root)

    except Exception as e:
        print(f"[Tree-sitter Error] {e}")

    return methods


In [1]:
# Test Tree-sitter extraction
sample_code = '''
public class HelloWorld {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}
'''

In [None]:
# Write the sample Java code to a file
with open("sample_input.java", "w") as file:
    file.write(sample_code)

methods = extract_methods_from_file("sample_input.java")

print(f"Total methods extracted: {len(methods)}\n")

for i, (name, body) in enumerate(methods):
    print(f"Method {i+1}: {name}")
    print("Body:")
    print(body)
    print("-" * 50)


Total methods extracted: 1

Method 1: main
Body:
public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
--------------------------------------------------


In [None]:
# Clean up test file
os.remove('sample_test.java')

### Step 1.6 Process a repository

In [None]:
def process_repository(repo_path, max_files=200):
    """Extract methods from a repository"""
    methods = []

    # FindJava files
    java_files = []
    for root, dirs, files in os.walk(repo_path):
      # Skip test directories
      if 'test' in root.lower():
          continue

      for file in files:
          if file.endswith('.java'):
              java_files.append(os.path.join(root, file))

    print(f"üîç Found {len(java_files)} Java filesÔºåprocessing first {min(max_files, len(java_files))} ")

    # Processing files
    for java_file in java_files[:max_files]:
      file_methods = extract_methods_from_file(java_file)
      for name, body in file_methods:
          methods.append({
              "name": name,
              "body": body,
              "file": java_file
          })
      # methods.extend(file_methods)

      # if self.total_methods + len(methods) >= self.target_methods:
      #     break

    return methods

### Step 1.7: Data cleaning functions

In [None]:
    # Duplicating
    import hashlib

    def deduplicate_methods(methods):
        """Remove duplicate methods based on content hash"""
        seen = set()
        unique_methods = []

        for method in methods:
            method_hash = hashlib.md5(method['body'].encode()).hexdigest()
            if method_hash not in seen:
                seen.add(method_hash)
                unique_methods.append(method)

        return unique_methods

In [None]:
# Filter long token methods
from transformers import AutoTokenizer

def filter_long_methods(methods, max_tokens=256):
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B")
    filtered = []

    for method in methods:
        # tokens = tokenizer.encode(method['body'])
        tokens = tokenizer.encode(method['body'], add_special_tokens=False)

        if len(tokens) <= max_tokens:
            filtered.append(method)

    return filtered

### Step 1.8: Split and save dataset

In [None]:
import json
import os
from sklearn.model_selection import train_test_split

def save_jsonl(data, path):
  """Save data in JSON format"""
    with open(path, "w", encoding="utf8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


def split_and_save(methods, output_dir, test_ratio=0.2):
  """Split dataset and save to files"""
    train_set, test_set = train_test_split(
        methods,
        test_size=test_ratio,
        random_state=42,
        shuffle=True
    )

    save_jsonl(train_set, os.path.join(output_dir, "train_dataset.jsonl"))
    save_jsonl(test_set, os.path.join(output_dir, "test_dataset.jsonl"))

    print(f"Saved {len(train_set)} train and {len(test_set)} test")




### Step 1.9: Run the complete data mining pipeline

In [None]:
class GitHubJavaMiner:
    def __init__(self, csv_file, output_dir="data"):
        self.csv_file = csv_file
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.repos_dir = os.path.join(output_dir, "repos")
        os.makedirs(self.repos_dir, exist_ok=True)
        self.methods = []

    def run(self):
        urls = load_repository_list(self.csv_file)

        for url in urls:
            repo_path = clone_repository(url, self.repos_dir)
            if repo_path:
                self.methods.extend(collect_methods_from_repo(repo_path))
            if len(self.methods) >= 50000:
                break

        print("Raw methods:", len(self.methods))
        self.methods = deduplicate_methods(self.methods)
        print("After dedup:", len(self.methods))
        self.methods = filter_by_token_limit(self.methods)
        print("After token ‚â§256:", len(self.methods))

        split_and_save(self.methods, self.output_dir)

        print("DONE!")


### Step 1.10: Creating Java method mining python script (github_miner.py)

In [5]:
print("üì¶ Creating data mining script...")

data_mining_script = '''# scripts/github_miner.py
"""
GitHub Java Method Miner - Assignment 1 Step 1
Extract Java methods from repositories listed in SEART-GHS CSV to create <method_body, method_name> pairs
"""

import pandas as pd
import subprocess
import os
import json
import re
import shutil
from tqdm import tqdm
import hashlib
from datetime import datetime
from sklearn.model_selection import train_test_split

# Install required packages for Tree-sitter
import subprocess
import sys

def install_package(package_name):
    """Install a Python package if not available"""
    try:
        __import__(package_name.split('-')[0].replace('_', ''))
        return True
    except ImportError:
        print(f"Installing {package_name}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
            return True
        except:
            print(f"Failed to install {package_name}")
            return False

# Install tree-sitter and java parser
install_package("tree-sitter")
install_package("tree-sitter-java")

# Now import after installation
from tree_sitter import Language, Parser
import tree_sitter_java as tsjava

class GitHubJavaMiner:
    """
    GitHub Java method miner
    Extracts Java methods from repositories listed in SEART-GHS CSV
    """

    def __init__(self, csv_path):
        """
        Initialize the miner

        Args:
            csv_path: Path to SEART-GHS CSV file
        """
        # Verify CSV path
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found: {csv_path}")
        self.csv_path = csv_path

        # Use project root from environment
        project_root = os.environ.get('PROJECT_ROOT', os.getcwd())

        # Setup directories
        self.data_dir = os.path.join(project_root, "data")
        self.repos_dir = os.path.join(self.data_dir, "repositories")
        self.methods_dir = os.path.join(self.data_dir, "methods")

        for dir_path in [self.data_dir, self.repos_dir, self.methods_dir]:
            os.makedirs(dir_path, exist_ok=True)

        # Target number of methods
        self.target_methods = 50000

        # Setup Tree-sitter parser
        self.setup_parser()

        # Statistics - initialize properly
        self.stats = {
            'total_repos_in_csv': 0,
            'repos_after_filtering': 0,
            'repos_processed': 0,
            'total_methods_extracted': 0,
            'methods_after_deduplication': 0,
            'methods_after_length_filter': 0,
            'train_methods': 0,
            'test_methods': 0,
            'target_reached': False
        }

    def setup_parser(self):
        """Setup Tree-sitter Java parser"""
        try:
            self.JAVA_LANGUAGE = Language(tsjava.language())
            self.parser = Parser(self.JAVA_LANGUAGE)
            print("‚úÖ Tree-sitter Java parser initialized")
        except Exception as e:
            print(f"‚ùå Tree-sitter initialization failed: {e}")
            self.parser = None

    def load_repository_list(self):
        """
        Load and filter repository list from CSV

        Returns:
            List of repository URLs
        """
        print(f"\nüìä Loading repository list: {self.csv_path}")

        try:
            df = pd.read_csv(self.csv_path)
            self.stats['total_repos_in_csv'] = len(df)
            print(f"  Original repositories in CSV: {self.stats['total_repos_in_csv']}")

            # Apply filters as specified in assignment
            if "mainLanguage" in df.columns:
                df = df[df["mainLanguage"].astype(str).str.lower() == "java"]
                print(f"  After Java language filter: {len(df)}")

            if "commits" in df.columns:
                df = df[df["commits"] >= 100]
                print(f"  After commits >= 100 filter: {len(df)}")

            if "contributors" in df.columns:
                df = df[df["contributors"] >= 10]
                print(f"  After contributors >= 10 filter: {len(df)}")

            if "isFork" in df.columns:
                df = df[df["isFork"] == False]
                print(f"  After non-fork filter: {len(df)}")

            # Construct GitHub URLs
            if "name" not in df.columns:
                raise ValueError("CSV must contain 'name' column")

            repo_urls = [
                f"https://github.com/{repo_name.strip()}.git"
                for repo_name in df["name"].astype(str)
            ]

            self.stats['repos_after_filtering'] = len(repo_urls)
            print(f"  Total repositories after filtering: {self.stats['repos_after_filtering']}")

            # Process more repositories to reach 50k methods
            # Based on previous results, 50 repos gave 35k methods, so changed to ~70 repos
            needed_repos = min(70, len(repo_urls))
            print(f"  Will process {needed_repos} repositories to reach target")
            return repo_urls[:needed_repos]

        except Exception as e:
            print(f"‚ùå Failed to load CSV: {e}")
            return []

    def clone_repository(self, url):
        """Clone a single repository"""
        repo_name = url.split("/")[-1].replace(".git", "")
        dst = os.path.join(self.repos_dir, repo_name)

        # Skip if already exists
        if os.path.exists(dst):
            print(f"  ‚è≠Ô∏è  Already exists: {repo_name}")
            return dst

        print(f"  üì• Cloning: {repo_name}")

        try:
            result = subprocess.run(
                [
                    "git", "clone",
                    "--depth", "1",
                    "--single-branch",
                    url, dst
                ],
                capture_output=True,
                text=True,
                timeout=300
            )

            if result.returncode == 0:
                print(f"    ‚úÖ Cloned successfully")
                return dst
            else:
                print(f"    ‚ùå Clone failed: {result.stderr[:200]}")
                return None

        except subprocess.TimeoutExpired:
            print(f"    ‚è±Ô∏è  Timeout")
            return None

    def extract_methods_from_file(self, file_path):
        """Extract methods from a Java file using Tree-sitter"""
        methods = []

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                code = f.read()

            if not code.strip():
                return methods

            tree = self.parser.parse(bytes(code, "utf8"))
            root = tree.root_node

            # Method node types in Tree-sitter Java
            method_types = ["method_declaration", "constructor_declaration"]

            def get_text(node):
                return code[node.start_byte:node.end_byte]

            def traverse(node):
                if node.type in method_types:
                    # Extract method name
                    name_node = node.child_by_field_name("name")
                    if not name_node:
                        return

                    method_name = get_text(name_node).strip()

                    # Skip very short method names (likely incomplete)
                    if len(method_name) < 2:
                        return

                    # Extract complete method
                    full_method = get_text(node).strip()

                    methods.append({
                        "name": method_name,
                        "body": full_method,
                        "file": file_path
                    })

                for child in node.children:
                    traverse(child)

            traverse(root)

        except Exception as e:
            print(f"    ‚ö†Ô∏è  Parsing failed {file_path}: {e}")

        return methods

    def process_repository(self, repo_path):
        """Process a single repository to extract methods"""
        methods = []

        # Find Java files
        java_files = []
        for root, dirs, files in os.walk(repo_path):
            # Skip test directories
            skip_dirs = ['test', 'tests', 'Test', 'Tests']
            dirs[:] = [d for d in dirs if d not in skip_dirs]

            for file in files:
                if file.endswith('.java'):
                    # Skip test files
                    if 'test' in file.lower() or 'Test' in file:
                        continue
                    java_files.append(os.path.join(root, file))

        if not java_files:
            print(f"    ‚ÑπÔ∏è  No Java files found")
            return methods

        print(f"    üìÑ Found {len(java_files)} Java files")

        # Process files - increase from 100 to 200 to get more methods
        max_files = min(200, len(java_files))
        processed_files = 0

        for i, java_file in enumerate(java_files[:max_files]):
            file_methods = self.extract_methods_from_file(java_file)
            methods.extend(file_methods)
            processed_files += 1

            # Progress update
            if processed_files % 50 == 0:
                print(f"    üìù Processed {processed_files}/{max_files} files, extracted {len(methods)} methods so far")

        print(f"    ‚úÖ Extracted {len(methods)} methods from {processed_files} files")
        return methods

    def deduplicate_methods(self, methods):
        """Remove duplicate methods based on method body hash"""
        seen = set()
        unique_methods = []

        for method in methods:
            method_hash = hashlib.md5(method['body'].encode()).hexdigest()
            if method_hash not in seen:
                seen.add(method_hash)
                unique_methods.append(method)

        return unique_methods

    def filter_long_methods(self, methods):
        """Filter methods longer than 256 tokens"""
        try:
            # Try to import transformers for tokenization
            from transformers import AutoTokenizer

            # Use Qwen tokenizer as specified in assignment
            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B")

            filtered = []
            print(f"  Filtering methods with > 256 tokens...")

            for i, method in enumerate(tqdm(methods, desc="Token filtering")):
                tokens = tokenizer.encode(method['body'], add_special_tokens=False)
                if len(tokens) <= 256:
                    filtered.append(method)

                # Progress update
                if i % 10000 == 0 and i > 0:
                    print(f"    Processed {i}/{len(methods)} methods, {len(filtered)} passed filter")

            return filtered

        except Exception as e:
            print(f"‚ö†Ô∏è  Could not use tokenizer for filtering: {e}")
            print("Using simple character count filter instead")

            # Simple fallback: filter methods with more than 2000 characters
            filtered = []
            for method in methods:
                if len(method['body']) <= 2000:
                    filtered.append(method)

            return filtered

    def save_dataset(self, data, filename):
        """Save dataset as JSONL format"""
        output_path = os.path.join(self.methods_dir, filename)

        with open(output_path, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\\n')

        print(f"üíæ Saved {len(data)} items to {filename}")
        return output_path

    def run(self):
        """
        Run the complete data mining pipeline

        Returns:
            (train_methods, test_methods) or None if failed
        """
        print("=" * 60)
        print("üöÄ Starting GitHub Java method mining")
        print("=" * 60)

        start_time = datetime.now()

        try:
            # 1. Load repository list
            repo_urls = self.load_repository_list()

            if not repo_urls:
                print("‚ùå No valid repository URLs found")
                return None

            # 2. Process repositories
            all_methods = []
            print(f"\nüîç Processing {len(repo_urls)} repositories...")

            for i, repo_url in enumerate(repo_urls):
                # Check if we've reached target
                if len(all_methods) >= self.target_methods:
                    print(f"üéØ Reached target of {self.target_methods} methods")
                    self.stats['target_reached'] = True
                    break

                # Clone repository
                repo_path = self.clone_repository(repo_url)
                if not repo_path:
                    continue

                # Extract methods
                try:
                    methods = self.process_repository(repo_path)
                    all_methods.extend(methods)
                    self.stats['repos_processed'] += 1

                    current_total = len(all_methods)
                    remaining = self.target_methods - current_total
                    print(f"  üìä Repo {i+1}/{len(repo_urls)}: +{len(methods)} methods, Total: {current_total}, Remaining: {max(0, remaining)}")

                except Exception as e:
                    print(f"  ‚ùå Processing failed: {e}")

                # Clean up
                try:
                    shutil.rmtree(repo_path, ignore_errors=True)
                except:
                    pass

            # Record total extracted methods
            self.stats['total_methods_extracted'] = len(all_methods)

            # 3. Data cleaning
            print(f"\nüßπ Data cleaning...")
            print(f"  Total methods extracted: {self.stats['total_methods_extracted']}")

            if not all_methods:
                print("‚ùå No methods extracted")
                return None

            # Deduplication
            unique_methods = self.deduplicate_methods(all_methods)
            self.stats['methods_after_deduplication'] = len(unique_methods)
            print(f"  After deduplication: {self.stats['methods_after_deduplication']}")

            # Filter long methods (max 256 tokens)
            filtered_methods = self.filter_long_methods(unique_methods)
            self.stats['methods_after_length_filter'] = len(filtered_methods)
            print(f"  After length filtering: {self.stats['methods_after_length_filter']}")

            if not filtered_methods:
                print("‚ùå No methods after filtering")
                return None

            # Check if we have enough methods
            if self.stats['methods_after_length_filter'] < 10000:
                print(f"‚ö†Ô∏è  Warning: Only {self.stats['methods_after_length_filter']} methods after filtering")
                print("   Consider processing more repositories or adjusting filters")

            # 4. Split dataset (80% train, 20% test)
            print(f"\nüìä Splitting dataset (80% train, 20% test)...")
            train_methods, test_methods = train_test_split(
                filtered_methods,
                test_size=0.2,
                random_state=42,
                shuffle=True
            )

            self.stats['train_methods'] = len(train_methods)
            self.stats['test_methods'] = len(test_methods)

            print(f"  Training set: {self.stats['train_methods']} methods")
            print(f"  Test set: {self.stats['test_methods']} methods")

            # 5. Save datasets
            print(f"\nüíæ Saving datasets...")
            train_path = self.save_dataset(train_methods, "train_dataset.jsonl")
            test_path = self.save_dataset(test_methods, "test_dataset.jsonl")

            # 6. Save metadata
            metadata = {
                "statistics": self.stats,
                "target_methods": self.target_methods,
                "train_size": len(train_methods),
                "test_size": len(test_methods),
                "start_time": start_time.isoformat(),
                "end_time": datetime.now().isoformat(),
                "duration": str(datetime.now() - start_time),
                "note": "Assignment 1 Step 1: Data collection for Java method naming"
            }

            metadata_path = os.path.join(self.methods_dir, "metadata.json")
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=2)

            # 7. Print summary
            print("=" * 60)
            print("‚úÖ Data mining completed!")
            print("=" * 60)
            print(f"‚è±Ô∏è  Total time: {datetime.now() - start_time}")
            print(f"üìä Final statistics:")
            print(f"  - Repositories in CSV: {self.stats['total_repos_in_csv']}")
            print(f"  - Repositories after filtering: {self.stats['repos_after_filtering']}")
            print(f"  - Repositories processed: {self.stats['repos_processed']}")
            print(f"  - Total methods extracted: {self.stats['total_methods_extracted']}")
            print(f"  - After deduplication: {self.stats['methods_after_deduplication']}")
            print(f"  - After length filtering: {self.stats['methods_after_length_filter']}")
            print(f"  - Training set: {self.stats['train_methods']}")
            print(f"  - Test set: {self.stats['test_methods']}")
            print(f"  - Target reached: {self.stats['target_reached']}")
            print(f"üìÅ Output location: {self.methods_dir}")
            print("=" * 60)

            return train_methods, test_methods

        except Exception as e:
            print(f"‚ùå Error during mining: {e}")
            import traceback
            traceback.print_exc()
            return None

# Main execution when run as script
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='GitHub Java Method Miner')
    parser.add_argument('--csv', required=True, help='Path to SEART-GHS CSV file')

    args = parser.parse_args()

    # Create miner and run
    miner = GitHubJavaMiner(csv_path=args.csv)
    result = miner.run()

    if result:
        print("‚úÖ Mining completed successfully!")
    else:
        print("‚ùå Mining failed")
        sys.exit(1)
'''

# Save the script file
scripts_dir = os.environ.get('SCRIPTS_DIR', os.path.join(os.environ['PROJECT_ROOT'], 'scripts'))
script_path = os.path.join(scripts_dir, 'github_miner.py')

# Ensure scripts directory exists
os.makedirs(scripts_dir, exist_ok=True)

with open(script_path, 'w', encoding='utf-8') as f:
    f.write(data_mining_script)

print(f"‚úÖ Data mining script created: {script_path}")


üì¶ Creating data mining script...
‚úÖ Data mining script created: /content/drive/MyDrive/method_naming_project/scripts/github_miner.py


### Step 1.11: Run and Test Java method mining function

In [None]:
# Import and run
from scripts.github_miner import GitHubJavaMiner

miner = GitHubJavaMiner(
    csv_path=os.path.join(DATASETS_DIR, "github_repos.csv")
)

train_methods, test_methods = miner.run()

if train_methods:
    print("‚úÖ Data mining completed successfully!")
    # Proceed with Assignment 1 Step 2 (model fine-tuning)
else:
    print("‚ùå Check the CSV file and internet connection")

‚úÖ Tree-sitter Java parser initialized
üöÄ Starting GitHub Java method mining
üìä Loading repository list: /content/drive/MyDrive/method_naming_project/datasets/github_repos.csv
  Original repositories in CSV: 14786
  After Java language filter: 14786
  After commits >= 100 filter: 14786
  After contributors >= 10 filter: 14786
  After non-fork filter: 14786
  Total repositories after filtering: 14786
  Will process 70 repositories to reach target
üîç Processing 70 repositories...
  üì• Cloning: java-memcached-client
    ‚úÖ Cloned successfully
    üìÑ Found 202 Java files
    üìù Processed 50/200 files, extracted 765 methods so far
    üìù Processed 100/200 files, extracted 965 methods so far
    üìù Processed 150/200 files, extracted 1259 methods so far
    üìù Processed 200/200 files, extracted 1608 methods so far
    ‚úÖ Extracted 1608 methods from 200 files
  üìä Repo 1/70: +1608 methods, Total: 1608, Remaining: 48392
  üì• Cloning: scala-maven-plugin
    ‚úÖ Cloned su

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

  Filtering methods with > 256 tokens...


Token filtering:  21%|‚ñà‚ñà‚ñè       | 10254/47832 [00:04<00:20, 1824.06it/s]

    Processed 10000/47832 methods, 9682 passed filter


Token filtering:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 20153/47832 [00:08<00:11, 2387.47it/s]

    Processed 20000/47832 methods, 19229 passed filter


Token filtering:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 30397/47832 [00:13<00:08, 2016.53it/s]

    Processed 30000/47832 methods, 28330 passed filter


Token filtering:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 36815/47832 [00:18<00:14, 742.26it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (41547 > 32768). Running this sequence through the model will result in indexing errors
Token filtering:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 40408/47832 [00:20<00:02, 3084.70it/s]

    Processed 40000/47832 methods, 37540 passed filter


Token filtering: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47832/47832 [00:23<00:00, 1999.15it/s]


  After length filtering: 44851
üìä Splitting dataset (80% train, 20% test)...
  Training set: 35880 methods
  Test set: 8971 methods
üíæ Saving datasets...
üíæ Saved 35880 items to train_dataset.jsonl
üíæ Saved 8971 items to test_dataset.jsonl
‚úÖ Data mining completed!
‚è±Ô∏è  Total time: 0:13:02.799242
üìä Final statistics:
  - Repositories in CSV: 14786
  - Repositories after filtering: 14786
  - Repositories processed: 45
  - Total methods extracted: 50246
  - After deduplication: 47832
  - After length filtering: 44851
  - Training set: 35880
  - Test set: 8971
  - Target reached: True
üìÅ Output location: /content/drive/MyDrive/method_naming_project/data/methods
‚úÖ Data mining completed successfully!
