In [1]:
import re
import nbformat
from nbformat.v4 import new_notebook, new_code_cell
import os
import pandas as pd


### Detect improper preprocessing before train_test_split
def detect_pipeline_wrapping(lines):
    """Report whether preprocessing appears before the first ``train_test_split`` call.

    Args:
        lines: Source lines of the script being inspected.

    Returns:
        True if any non-comment line above the first ``train_test_split(...)``
        call contains one of the preprocessing keywords; False otherwise,
        including when no call to ``train_test_split`` is found.
    """
    preprocessing_keywords = (
        "StandardScaler(", "MinMaxScaler(", "LabelEncoder(",
        "fit_transform(", "transform(", "normalize(",
    )

    def is_comment(line):
        return line.lstrip().startswith("#")

    split_index = -1
    for i, line in enumerate(lines):
        if is_comment(line):
            continue
        # Require an opening parenthesis so that the usual
        # "from ... import train_test_split" line is not mistaken for the split
        # itself; spaces are dropped to tolerate "train_test_split (".
        if "train_test_split(" in line.replace(" ", ""):
            split_index = i
            break
    if split_index == -1:
        return False  # train_test_split is never called

    # Look for preprocessing on the code lines above the split.
    return any(
        keyword in line
        for line in lines[:split_index]
        if not is_comment(line)
        for keyword in preprocessing_keywords
    )

### Fix Strategy Recommendation
def recommend_fix_strategy(lines, pipeline_leakage=False):
    """Choose the name of the remediation strategy to suggest for a script.

    Args:
        lines: Source lines of the script being inspected.
        pipeline_leakage: Whether preprocessing-before-split leakage was found.

    Returns:
        "Pipeline Wrapping" when ``pipeline_leakage`` is truthy; otherwise
        "Train/Validation/Test Split" if any line shows a train/test split,
        and "K-Fold Cross Validation" if none does.
    """
    if pipeline_leakage:
        return "Pipeline Wrapping"

    # A split is recognised by a train_test_split call or by train/test
    # variable pairs appearing on the same line.
    split_marker = re.compile(r"(train_test_split\(|X_train.*X_test|y_train.*y_test)")
    uses_split = any(split_marker.search(text) for text in lines)

    if uses_split:
        return "Train/Validation/Test Split"
    return "K-Fold Cross Validation"

### Annotation Core
def annotate_file(source_path, output_folder):
    """Annotate one script with a test-leakage warning and save the result.

    The file is scanned line by line for uses of test data. A warning and a
    quick-fix comment are inserted once, directly above the first matching
    line. The annotated text is written to ``output_folder`` both as
    ``<name>_annotated.py`` and as ``<name>_annotated.ipynb``.

    Args:
        source_path: Path of the UTF-8 encoded ``.py`` file to inspect.
        output_folder: Directory that receives the annotated files; it is
            created if it does not exist.

    Returns:
        A dict with the keys ``"filename"``, ``"leakage_detected"``
        ("Yes"/"No") and ``"recommended_fix"`` (strategy name, or "N/A" when
        nothing was detected).
    """
    with open(source_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    pipeline_leakage = detect_pipeline_wrapping(lines)

    # Compiled once; every line is tested against each of these.
    leakage_patterns = [re.compile(p) for p in (
        r"validation_data\s*=\s*\(\s*test_X\s*,\s*test_y\s*\)",
        r"validation_data\s*=\s*\(\s*\w*[xX]_test\s*,\s*\w*[yY]_test\s*\)",
        r"validation_data\s*=\s*\(.*_test.*\)",
        r"\.predict\s*\(\s*X_test\s*\)",
        r"\.predict_proba\s*\(\s*X_test\s*\)",
        r"\.score\s*\(\s*X_test\s*,\s*y_test\s*\)",
        r"classification_report\s*\(\s*y_test\s*,.*\)",
        r"confusion_matrix\s*\(\s*y_test\s*,.*\)",
        r"roc_auc_score\s*\(\s*y_test\s*,.*\)",
        r"roc_curve\s*\(\s*y_test\s*,.*\)",
        r"mean_squared_error\s*\(\s*y_test\s*,",
        r"r2_score\s*\(\s*y_test\s*,",
        r"mean_absolute_error\s*\(\s*y_test\s*,",
        r"explained_variance_score\s*\(\s*y_test\s*,",
        r"for\s+\w+,\s*\w+\s+in\s+test_loader",
        r"outputs\s*=\s*model\s*\(\s*\w+\s*\)",
        r"loss\s*=\s*criterion\s*\(\s*outputs\s*,\s*\w*labels?\s*\)",
        r"\.eval\s*\(",
        r"\(\s*y_test\s*,.*\)",
        r"\(\s*.*,\s*y_test\s*\)",
    )]

    # One pass over the file: the flags give both the count and the position
    # of the first hit.
    flagged = [_matches_leakage(line, leakage_patterns) for line in lines]
    test_usage_count = sum(flagged)

    fix_type = recommend_fix_strategy(lines, pipeline_leakage)
    folds = max(2, min(test_usage_count, 10))

    # Keep the inline advice consistent with the strategy reported in the summary.
    if fix_type == "K-Fold Cross Validation":
        quick_fix = f"# QUICK FIX: Use K-Fold Cross Validation with n_splits={folds} (based on {test_usage_count} uses of test data).\n"
    elif fix_type == "Pipeline Wrapping":
        quick_fix = "# QUICK FIX: Wrap preprocessing in a Pipeline that is fitted on the training split only, so nothing is learned from test data.\n"
    else:
        quick_fix = "# QUICK FIX: Use a proper Train/Validation/Test split to ensure test data is untouched until final evaluation.\n"

    cleaned_lines = list(lines)
    if test_usage_count:
        # The annotation is added only once, above the first flagged line.
        # NOTE(review): a file flagged solely for pipeline leakage (no test
        # usage match) gets no inline annotation, only a summary entry.
        first_hit = flagged.index(True)
        cleaned_lines[first_hit:first_hit] = [
            "# WARNING: Multi Test Leakage — test data is being used improperly before final evaluation.\n",
            quick_fix,
        ]

    base_filename = os.path.splitext(os.path.basename(source_path))[0]
    os.makedirs(output_folder, exist_ok=True)
    py_output_path = os.path.join(output_folder, base_filename + "_annotated.py")
    ipynb_output_path = os.path.join(output_folder, base_filename + "_annotated.ipynb")

    # Save .py
    with open(py_output_path, "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

    # Save .ipynb
    cells = [new_code_cell(source) for source in _group_into_cells(cleaned_lines)]
    nb = new_notebook(cells=cells, metadata={"language": "python"})
    with open(ipynb_output_path, "w", encoding="utf-8") as f:
        nbformat.write(nb, f)

    print(f"Annotated: {base_filename}.py → saved in {output_folder}")
    leakage_found = test_usage_count > 0 or pipeline_leakage
    return {
        "filename": base_filename + ".py",
        "leakage_detected": "Yes" if leakage_found else "No",
        "recommended_fix": fix_type if leakage_found else "N/A"
    }


def _matches_leakage(line, patterns):
    """Return True if *line* matches any of the compiled leakage *patterns*."""
    # The raw line lets patterns that require whitespace (``\s+``) match; the
    # space-stripped form keeps every match the stripped-only check produced.
    compact = line.replace(" ", "")
    return any(p.search(line) or p.search(compact) for p in patterns)


def _group_into_cells(lines):
    """Join *lines* into notebook cell sources, splitting on blank lines."""
    sources, current = [], []
    for line in lines:
        if line.strip():
            current.append(line)
        elif current:
            sources.append("".join(current))
            current = []
        # A blank line with no cell in progress is dropped, so runs of blank
        # lines do not produce whitespace-only cells.
    if current:
        sources.append("".join(current))
    return sources

### Folder-Wide Runner
def run_leakage_detector_on_folder(folder_path=None):
    """Annotate every ``.py`` file in a folder and write a summary workbook.

    Annotated files and ``leakage_summary.xlsx`` are written to an
    ``annotated_output`` sub-folder, which is created only when there is at
    least one script to process. Sub-folders are not searched.

    Args:
        folder_path: Folder containing the ``.py`` files. When empty or None
            the path is requested interactively via ``input``.

    Returns:
        None. Progress and error messages are printed.
    """
    if not folder_path:
        print("Enter the path to a folder containing .py files:")
        folder_path = input("Folder path: ").strip()

    if not os.path.isdir(folder_path):
        print("Folder not found. Please check the path and try again.")
        return

    # Only regular files count (a directory named "x.py" cannot be opened),
    # and sorting makes the processing and summary order deterministic.
    py_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith(".py") and os.path.isfile(os.path.join(folder_path, name))
    )
    if not py_files:
        print("No .py files found in the folder.")
        return

    # Created after the checks above so an empty run leaves the folder untouched.
    output_folder = os.path.join(folder_path, "annotated_output")
    os.makedirs(output_folder, exist_ok=True)

    summary_records = [
        annotate_file(os.path.join(folder_path, name), output_folder)
        for name in py_files
    ]

    # Export summary to Excel
    summary_path = os.path.join(output_folder, "leakage_summary.xlsx")
    pd.DataFrame(summary_records).to_excel(summary_path, index=False)
    print(f"\n📄 Summary report saved as: {summary_path}")
    print(f"✅ All annotated files saved to: {output_folder}")


# Raw string: "\m" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning; the resulting path is unchanged.
run_leakage_detector_on_folder(r"D:\my_scripts")


✅ Annotated: 2021-09-01-nb_604.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-03-nb_2630.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-03-nb_891.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-04-nb_2749.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-05-nb_1162.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-05-nb_1630.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-05-nb_2770.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-05-nb_2816.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-05-nb_773.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-07-nb_2921.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-07-nb_949.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-08-nb_678.py → saved in D:\my_scripts\annotated_output
✅ Annotated: 2021-09-09-nb_1559.py → saved in D:\my_scripts\annotated_output
✅ An