In [None]:
import os
from Bio import SeqIO

# --- Configuration ---
# Directory that is scanned for ".gbk" files; the query file is looked up
# inside this directory as well.
TARGET_DIRECTORY = "split_gbk_regions_output/" 
# File name (relative to TARGET_DIRECTORY) of the query GenBank file whose
# names/labels every other file is compared against. It is never deleted.
QUERY_GBK_FILENAME = "X65195.gbk"

# List of GenBank feature qualifier keys to check for names/labels
# NOTE(review): "codon_tag" does not look like a usual GenBank qualifier
# name -- confirm it is intentional.
QUALIFIER_KEYS_TO_CHECK = [
    "gene",
    "product",
    "locus_tag",
    "label",
    "protein_id",
    "note",
    "old_locus_tag",
    "codon_tag"
]

# Qualifier values containing any of these substrings (case-insensitive) are
# ignored when collecting names/labels, so they never count as an overlap.
EXCLUDE_LABEL_SUBSTRINGS = [
    "region",
    "hypothetical protein"
]

# DRY_RUN = True  -> only print what *would* be deleted (safer).
# DRY_RUN = False -> actually delete the files.
# NOTE: the value below is False, so running this script removes files.
DRY_RUN = False
# --------------------

def get_feature_names_or_labels(gbk_filepath, qualifier_keys, exclude_substrings=None):
    """
    Collect the unique names/labels carried by the features of a GenBank file.

    Every record in the file is scanned, so multi-record GenBank files are
    handled as well as single-record ones; the names of all records are
    pooled into one set.

    Args:
        gbk_filepath: Path of the GenBank file to parse.
        qualifier_keys: Iterable of feature qualifier keys (e.g. "gene",
            "product") whose values are collected.
        exclude_substrings: Optional iterable of substrings. A value that
            contains any of them (case-insensitive) is ignored.

    Returns:
        A set of whitespace-stripped, non-empty string values. The set is
        empty when nothing qualifies, when the file holds no records, or
        when the file cannot be read/parsed (a warning is printed for the
        last two cases).
    """
    # Lower-case the exclusion substrings once rather than once per value.
    lowered_exclusions = [sub.lower() for sub in (exclude_substrings or [])]

    names_set = set()
    record_count = 0
    try:
        # SeqIO.parse yields each record in turn; SeqIO.read would raise on
        # any file that does not contain exactly one record.
        for record in SeqIO.parse(gbk_filepath, "genbank"):
            record_count += 1
            for feature in record.features:
                for key in qualifier_keys:
                    for value in feature.qualifiers.get(key, []):
                        # Only string values can be stripped and compared.
                        if not isinstance(value, str):
                            continue

                        stripped_value = value.strip()
                        # An empty label carries no information and would
                        # make unrelated files appear to share a name.
                        if not stripped_value:
                            continue

                        lowered_value = stripped_value.lower()
                        if any(sub in lowered_value for sub in lowered_exclusions):
                            continue

                        names_set.add(stripped_value)
    except Exception as e:
        # Deliberately broad: this is a best-effort scan, and any read or
        # parse failure should skip the file rather than abort the run.
        print(f"Warning: Could not read or parse '{gbk_filepath}': {e}. Skipping this file.")
        return set()

    if record_count == 0:
        print(f"Warning: Could not read or parse '{gbk_filepath}': no records found. Skipping this file.")
    return names_set

def main():
    """
    Delete (or, in dry-run mode, list) GenBank files that share no name/label
    with the query file.

    The query file's names/labels are loaded first. Every other ".gbk" file
    in TARGET_DIRECTORY is then compared against them; a file is kept when at
    least one name/label is shared and is marked for deletion otherwise.
    Marked files are removed only when DRY_RUN is False.

    The run stops early, deleting nothing, when the query file is missing or
    yields no names/labels: with an empty query set no file could overlap,
    so every other file would be marked for deletion.
    """
    query_gbk_path = os.path.join(TARGET_DIRECTORY, QUERY_GBK_FILENAME)

    if not os.path.exists(query_gbk_path):
        print(f"Error: Query file '{query_gbk_path}' not found. Please check the path and filename.")
        return

    excluded_desc = ', '.join(EXCLUDE_LABEL_SUBSTRINGS)
    print(f"Loading names/labels from query file: {QUERY_GBK_FILENAME} (excluding '{excluded_desc}' related labels)...")
    query_names = get_feature_names_or_labels(query_gbk_path, QUALIFIER_KEYS_TO_CHECK, EXCLUDE_LABEL_SUBSTRINGS)

    if not query_names:
        # An empty set also results from a query file that failed to parse;
        # continuing would mark every other file for deletion.
        print(f"Warning: Query file contains no identifiable names/labels (after excluding '{excluded_desc}' related ones) based on the specified keys.")
        print("Aborting: without query names/labels every other GenBank file would be marked for deletion.")
        return

    print(f"Found {len(query_names)} unique names/labels in the query file (after exclusion).")

    gbk_files_to_delete = []

    # Sorted so the report is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(TARGET_DIRECTORY)):
        if not filename.endswith(".gbk") or filename == QUERY_GBK_FILENAME:
            continue

        filepath = os.path.join(TARGET_DIRECTORY, filename)
        print(f"\nChecking file: {filename}")

        target_names = get_feature_names_or_labels(filepath, QUALIFIER_KEYS_TO_CHECK, EXCLUDE_LABEL_SUBSTRINGS)

        if not target_names:
            # NOTE(review): an unparseable file is indistinguishable from a
            # file without names here, so it is marked for deletion too.
            print(f"  No names/labels found in {filename} (after exclusion) or file is unparseable. Marking for deletion.")
            gbk_files_to_delete.append(filepath)
            continue

        # Check for any common names/labels
        common_names = query_names.intersection(target_names)

        if common_names:
            # Show at most five names, in a stable order, and add the
            # ellipsis only when some names were actually left out.
            shown = sorted(common_names)[:5]
            suffix = "..." if len(common_names) > len(shown) else ""
            print(f"  Overlap found! Common names/labels: {', '.join(shown)}{suffix}")
            print(f"  Keeping {filename} due to shared names/labels.")
        else:
            print(f"  No common names/labels found with {QUERY_GBK_FILENAME} (after exclusion). Marking for deletion.")
            gbk_files_to_delete.append(filepath)

    print("\n--- Summary of Files to Delete ---")
    if not gbk_files_to_delete:
        print("No files found to delete (all other files have an overlap in names/labels with the query, after exclusion).")
        return

    for f in gbk_files_to_delete:
        print(f"- {f}")

    if DRY_RUN:
        print("\nThis was a DRY RUN. No files were deleted.")
        print("To actually delete these files, set DRY_RUN = False at the top of the script.")
        return

    print("\n--- Performing Deletion ---")
    for f in gbk_files_to_delete:
        # A failure on one file is reported and must not stop the others.
        try:
            os.remove(f)
            print(f"Successfully deleted: {f}")
        except OSError as e:
            print(f"Error deleting {f}: {e}")
    print("\nDeletion process complete.")

# Run the comparison/deletion only when executed as a script (or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    main()

Loading names/labels from query file: X65195.gbk (excluding 'region, hypothetical protein' related labels)...
Found 114 unique names/labels in the query file (after exclusion).

Checking file: NC_018681_1_region_1.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_10.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_11.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_12.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_13.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_14.gbk
  No common names/labels found with X65195.gbk (after exclusion). Marking for deletion.

Checking file: NC_018681_1_region_15.gbk
  