In [1]:
import os

# Path to your local 'labels' folder
labels_folder = '../brain_tumor_classification/data/labels/'

# List all the files in the 'labels' folder to confirm
labels_files = os.listdir(labels_folder)
print(labels_files[:5])  # Display the first 5 files to confirm



['00054_145.txt', '00054_164.txt', '00056_110.txt', '00056_129.txt', '00056_147.txt']


In [4]:
# Loop through all files in the 'labels' folder and read their content
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.read()

    # Print the content of each file
    print(f"Content of {label_file}:")
    print(label_data)
    print("\n" + "-"*50 + "\n")  # Print a separator for better readability


Content of 00054_145.txt:
1 0.344484 0.342723 0.221831 0.176056

--------------------------------------------------

Content of 00054_164.txt:
1 0.347418 0.335681 0.192488 0.190141

--------------------------------------------------

Content of 00056_110.txt:
1 0.579225 0.439554 0.111502 0.083333
1 0.464789 0.468897 0.117371 0.106808

--------------------------------------------------

Content of 00056_129.txt:
1 0.520540 0.448944 0.233568 0.132629

--------------------------------------------------

Content of 00056_147.txt:
1 0.518779 0.416667 0.150235 0.070423

--------------------------------------------------

Content of 00056_239.txt:
1 0.638498 0.432512 0.068075 0.066901

--------------------------------------------------

Content of 00056_92.txt:
1 0.483568 0.477700 0.145540 0.068075

--------------------------------------------------

Content of 00058_102.txt:
1 0.694249 0.450117 0.088028 0.066901

--------------------------------------------------

Content of 00058_122.txt:
1

In [5]:
# Initialize a counter for invalid or null lines
invalid_line_count = 0

# Loop through all files in the 'labels' folder and check for invalid lines
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # Loop through each line in the file
    for line in label_data:
        # Strip any leading/trailing whitespace
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Split the line by spaces
        parts = line.split()

        # Check if the line has exactly 5 values
        if len(parts) != 5:
            invalid_line_count += 1
            continue

        # Validate each part of the line
        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)
            if classification not in [0, 1]:  # Check if classification is either 0 or 1
                invalid_line_count += 1
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_line_count += 1
            continue

# Print the result
print(f"Total number of invalid or null lines across all files: {invalid_line_count}")


Total number of invalid or null lines across all files: 0


In [6]:
# Initialize a counter for files that contain exactly two valid lines
valid_files_count = 0
valid_file_contents = []

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines
    valid_lines = []

    # Loop through each line in the file
    for line in label_data:
        # Strip any leading/trailing whitespace
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Split the line by spaces
        parts = line.split()

        # Check if the line has exactly 5 values
        if len(parts) == 5:
            try:
                classification = int(parts[0])  # The first value is the classification (1 or 0)
                if classification not in [0, 1]:  # Check if classification is either 0 or 1
                    continue

                # The next four values should be floats (bounding box coordinates and dimensions)
                bounding_box = [float(parts[i]) for i in range(1, 5)]

                # If we reach here, the line is valid
                valid_lines.append(line)

            except ValueError:
                # If there is a ValueError, it means one of the values is not valid
                continue

    # Check if the file contains exactly two valid lines
    if len(valid_lines) == 2:
        valid_files_count += 1
        valid_file_contents.append(f"Content of {label_file}:\n" + "\n".join(valid_lines))

# Print the result
print(f"Total number of .txt files with exactly 2 valid lines: {valid_files_count}")
print("\n".join(valid_file_contents))  # Print content of valid files


Total number of .txt files with exactly 2 valid lines: 41
Content of 00056_110.txt:
1 0.579225 0.439554 0.111502 0.083333
1 0.464789 0.468897 0.117371 0.106808
Content of 00059_84.txt:
1 0.473005 0.393192 0.077465 0.075117
1 0.515258 0.466549 0.044601 0.038732
Content of 00060_69.txt:
1 0.653756 0.425469 0.049296 0.055164
1 0.519366 0.460681 0.132629 0.099765
Content of 00060_70.txt:
1 0.513498 0.455986 0.125587 0.118545
1 0.660798 0.431925 0.046948 0.056338
Content of 00064_101.txt:
0 0.348005 0.400822 0.076291 0.076291
0 0.447183 0.381455 0.068075 0.077465
Content of 00105_132.txt:
1 0.633216 0.332746 0.102113 0.113850
1 0.630282 0.438380 0.084507 0.102113
Content of 00105_151.txt:
1 0.620305 0.336268 0.106808 0.127934
1 0.551643 0.411972 0.110329 0.107981
Content of 00112_10.txt:
0 0.752113 0.740023 0.090141 0.073944
0 0.713380 0.814554 0.108451 0.077465
Content of 00142_107.txt:
0 0.355634 0.401995 0.089202 0.073944
0 0.523474 0.534624 0.197183 0.111502
Content of 00142_126.txt:
0 

In [7]:
# Initialize a list to store the filenames of files with more than 2 valid lines
files_with_more_than_2_lines = []

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines
    valid_lines = []

    # Loop through each line in the file
    for line in label_data:
        # Strip any leading/trailing whitespace
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Split the line by spaces
        parts = line.split()

        # Check if the line has exactly 5 values
        if len(parts) == 5:
            try:
                classification = int(parts[0])  # The first value is the classification (1 or 0)
                if classification not in [0, 1]:  # Check if classification is either 0 or 1
                    continue

                # The next four values should be floats (bounding box coordinates and dimensions)
                bounding_box = [float(parts[i]) for i in range(1, 5)]

                # If we reach here, the line is valid
                valid_lines.append(line)

            except ValueError:
                # If there is a ValueError, it means one of the values is not valid
                continue

    # Check if the file contains more than 2 valid lines
    if len(valid_lines) > 2:
        files_with_more_than_2_lines.append(label_file)

# Print the result: the filenames of .txt files with more than 2 valid lines
print("Files with more than 2 valid lines:")
print("\n".join(files_with_more_than_2_lines))


Files with more than 2 valid lines:
00061_127.txt
63 (7).txt
67 (7).txt


In [8]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize counters for invalid class 0 and class 1 files
invalid_class_0_files = 0
invalid_class_1_files = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines and flag to track if the file is invalid
    valid_lines = []
    invalid_data_found = False
    file_classification = None  # Variable to store the classification of the file

    # Loop through each line in the file
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the line has exactly 5 values (classification + 4 bounding box values)
        if len(parts) != 5:
            invalid_data_found = True
            continue

        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)

            if classification not in [0, 1]:  # If classification is not 0 or 1, mark as invalid
                invalid_data_found = True
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

            # If the line is valid, add it to the valid lines list
            valid_lines.append(line)

            # Check if all lines in the file have the same classification
            if file_classification is None:
                file_classification = classification
            elif file_classification != classification:
                invalid_data_found = True
                continue

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_data_found = True
            continue

    # If the file contains more than 1 valid line, mark it as invalid
    if len(valid_lines) > 1:
        invalid_data_found = True

    # If the file is invalid, count it under class 0 or class 1
    if invalid_data_found:
        if file_classification == 0:
            invalid_class_0_files += 1
        elif file_classification == 1:
            invalid_class_1_files += 1

        # Print the file name and its labels
        print(f"\nInvalid file: {label_file}")
        for line in label_data:
            print(line.strip())

# Print the result: Count of invalid class 0 and class 1 files
print(f"\nNumber of invalid class 0 files: {invalid_class_0_files}")
print(f"Number of invalid class 1 files: {invalid_class_1_files}")



Invalid file: 00056_110.txt
1 0.579225 0.439554 0.111502 0.083333
1 0.464789 0.468897 0.117371 0.106808

Invalid file: 00059_84.txt
1 0.473005 0.393192 0.077465 0.075117
1 0.515258 0.466549 0.044601 0.038732

Invalid file: 00060_69.txt
1 0.653756 0.425469 0.049296 0.055164
1 0.519366 0.460681 0.132629 0.099765

Invalid file: 00060_70.txt
1 0.513498 0.455986 0.125587 0.118545
1 0.660798 0.431925 0.046948 0.056338

Invalid file: 00061_127.txt
0 0.569249 0.485329 0.091549 0.085681
0 0.480634 0.538732 0.048122 0.046948
0 0.524061 0.461268 0.038732 0.037559

Invalid file: 00064_101.txt
0 0.348005 0.400822 0.076291 0.076291
0 0.447183 0.381455 0.068075 0.077465

Invalid file: 00105_132.txt
1 0.633216 0.332746 0.102113 0.113850
1 0.630282 0.438380 0.084507 0.102113

Invalid file: 00105_151.txt
1 0.620305 0.336268 0.106808 0.127934
1 0.551643 0.411972 0.110329 0.107981

Invalid file: 00112_10.txt
0 0.752113 0.740023 0.090141 0.073944
0 0.713380 0.814554 0.108451 0.077465

Invalid file: 00142_

In [9]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize a counter for files with inconsistent classifications
inconsistent_classification_files_count = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # If the file has more than 1 line, check classification consistency
    if len(label_data) > 1:
        classifications = set()  # Set to track unique classifications

        # Check all lines for the same classification
        for line in label_data:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 5:
                    classification = int(parts[0])  # First value is classification
                    classifications.add(classification)

        # If there are multiple different classifications in the file, print and count it
        if len(classifications) > 1:
            print(f"File with inconsistent classifications: {label_file}")
            inconsistent_classification_files_count += 1

# Print the result: Number of files with inconsistent classifications
print(f"\nTotal number of files with inconsistent classifications: {inconsistent_classification_files_count}")



Total number of files with inconsistent classifications: 0


In [10]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Counter for how many files we've printed
files_printed = 0
max_files_to_print = 5  # Limit the number of files to print

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # Flag to track if the file contains class 0
    file_modified = False
    old_content = label_data  # Store the old content

    # Loop through each line in the file
    modified_lines = []
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the classification is 0
        if len(parts) == 5 and int(parts[0]) == 0:
            # Change the line to "0 0 0 0 0"
            modified_lines.append("0 0 0 0 0")
            file_modified = True
        else:
            # If it's not class 0, keep the line unchanged
            modified_lines.append(line)

    # If the file was modified (contains class 0), overwrite it with the new content
    if file_modified:
        with open(file_path, 'w') as file:
            file.write("\n".join(modified_lines))

        # Print only the first few modified files
        if files_printed < max_files_to_print:
            print(f"\nFile modified: {label_file}")

            # Print the old content
            print("Old Content:")
            print("".join(old_content))

            # Print the new content
            print("New Content:")
            print("\n".join(modified_lines))

            files_printed += 1

# Optionally, if you want to ensure we stop printing after reaching the limit:
if files_printed == max_files_to_print:
    print("\nLimited number of files printed.")



File modified: 00061_108.txt
Old Content:
0 0.568662 0.474765 0.064554 0.050469
New Content:
0 0 0 0 0

File modified: 00061_127.txt
Old Content:
0 0.569249 0.485329 0.091549 0.085681
0 0.480634 0.538732 0.048122 0.046948
0 0.524061 0.461268 0.038732 0.037559
New Content:
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0

File modified: 00061_146.txt
Old Content:
0 0.575704 0.484155 0.069249 0.055164
New Content:
0 0 0 0 0

File modified: 00061_184.txt
Old Content:
0 0.376174 0.250000 0.092723 0.091549
New Content:
0 0 0 0 0

File modified: 00064_101.txt
Old Content:
0 0.348005 0.400822 0.076291 0.076291
0 0.447183 0.381455 0.068075 0.077465
New Content:
0 0 0 0 0
0 0 0 0 0

Limited number of files printed.


In [11]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize counters for invalid class 0 and class 1 files
invalid_class_0_files = 0
invalid_class_1_files = 0
deleted_class_1_files = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines and flag to track if the file is invalid
    valid_lines = []
    invalid_data_found = False
    file_classification = None  # Variable to store the classification of the file

    # Loop through each line in the file
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the line has exactly 5 values (classification + 4 bounding box values)
        if len(parts) != 5:
            invalid_data_found = True
            continue

        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)

            if classification not in [0, 1]:  # If classification is not 0 or 1, mark as invalid
                invalid_data_found = True
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

            # If the line is valid, add it to the valid lines list
            valid_lines.append(line)

            # Check if all lines in the file have the same classification
            if file_classification is None:
                file_classification = classification
            elif file_classification != classification:
                invalid_data_found = True
                continue

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_data_found = True
            continue

    # If the file contains more than 1 valid line, mark it as invalid
    if len(valid_lines) > 1:
        invalid_data_found = True

    # If the file is invalid, count it under class 0 or class 1
    if invalid_data_found:
        if file_classification == 0:
            invalid_class_0_files += 1
        elif file_classification == 1:
            invalid_class_1_files += 1

        # Print the file name and its labels
        print(f"\nInvalid file: {label_file}")
        for line in label_data:
            print(line.strip())

        # If the classification is 1, delete the file
        if file_classification == 1:
            os.remove(file_path)
            deleted_class_1_files += 1

# Print the result: Count of invalid class 0 and class 1 files
print(f"\nNumber of invalid class 0 files: {invalid_class_0_files}")
print(f"Number of invalid class 1 files: {invalid_class_1_files}")
print(f"Number of class 1 files deleted: {deleted_class_1_files}")



Invalid file: 00056_110.txt
1 0.579225 0.439554 0.111502 0.083333
1 0.464789 0.468897 0.117371 0.106808

Invalid file: 00059_84.txt
1 0.473005 0.393192 0.077465 0.075117
1 0.515258 0.466549 0.044601 0.038732

Invalid file: 00060_69.txt
1 0.653756 0.425469 0.049296 0.055164
1 0.519366 0.460681 0.132629 0.099765

Invalid file: 00060_70.txt
1 0.513498 0.455986 0.125587 0.118545
1 0.660798 0.431925 0.046948 0.056338

Invalid file: 00061_127.txt
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0

Invalid file: 00064_101.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00105_132.txt
1 0.633216 0.332746 0.102113 0.113850
1 0.630282 0.438380 0.084507 0.102113

Invalid file: 00105_151.txt
1 0.620305 0.336268 0.106808 0.127934
1 0.551643 0.411972 0.110329 0.107981

Invalid file: 00112_10.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_107.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_126.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_144.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00147_170.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00159_

In [12]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize counters for invalid class 0 and class 1 files
invalid_class_0_files = 0
invalid_class_1_files = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines and flag to track if the file is invalid
    valid_lines = []
    invalid_data_found = False
    file_classification = None  # Variable to store the classification of the file

    # Loop through each line in the file
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the line has exactly 5 values (classification + 4 bounding box values)
        if len(parts) != 5:
            invalid_data_found = True
            continue

        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)

            if classification not in [0, 1]:  # If classification is not 0 or 1, mark as invalid
                invalid_data_found = True
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

            # If the line is valid, add it to the valid lines list
            valid_lines.append(line)

            # Check if all lines in the file have the same classification
            if file_classification is None:
                file_classification = classification
            elif file_classification != classification:
                invalid_data_found = True
                continue

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_data_found = True
            continue

    # If the file contains more than 1 valid line, mark it as invalid
    if len(valid_lines) > 1:
        invalid_data_found = True

    # If the file is invalid, count it under class 0 or class 1
    if invalid_data_found:
        if file_classification == 0:
            invalid_class_0_files += 1
        elif file_classification == 1:
            invalid_class_1_files += 1

        # Print the file name and its labels
        print(f"\nInvalid file: {label_file}")
        for line in label_data:
            print(line.strip())

# Print the result: Count of invalid class 0 and class 1 files
print(f"\nNumber of invalid class 0 files: {invalid_class_0_files}")
print(f"Number of invalid class 1 files: {invalid_class_1_files}")



Invalid file: 00061_127.txt
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0

Invalid file: 00064_101.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00112_10.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_107.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_126.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00142_144.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 00147_170.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 125.txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 128 (2).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 64 (3).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 71 (3).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 72 (3).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 73 (2).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 74 (3).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 76 (4).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 78 (2).txt
0 0 0 0 0
0 0 0 0 0

Invalid file: 84.txt
0 0 0 0 0
0 0 0 0 0

Number of invalid class 0 files: 17
Number of invalid class 1 files: 0


In [13]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize counters for invalid class 0 and class 1 files
invalid_class_0_files = 0
invalid_class_1_files = 0
modified_class_0_files = 0
deleted_class_1_files = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines and flag to track if the file is invalid
    valid_lines = []
    invalid_data_found = False
    file_classification = None  # Variable to store the classification of the file

    # Loop through each line in the file
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the line has exactly 5 values (classification + 4 bounding box values)
        if len(parts) != 5:
            invalid_data_found = True
            continue

        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)

            if classification not in [0, 1]:  # If classification is not 0 or 1, mark as invalid
                invalid_data_found = True
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

            # If the line is valid, add it to the valid lines list
            valid_lines.append(line)

            # Check if all lines in the file have the same classification
            if file_classification is None:
                file_classification = classification
            elif file_classification != classification:
                invalid_data_found = True
                continue

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_data_found = True
            continue

    # If the file contains more than 1 valid line, mark it as invalid
    if len(valid_lines) > 1:
        invalid_data_found = True

    # If the file is invalid and it's class 0, modify it
    if invalid_data_found and file_classification == 0:
        # Only keep one valid line and discard the rest
        valid_lines = [valid_lines[0]]  # Keep only the first valid line

        # Overwrite the file with just the one valid line
        with open(file_path, 'w') as file:
            file.write("\n".join(valid_lines))

        modified_class_0_files += 1
        print(f"File modified (class 0, invalid data): {label_file}")

    # If the file is invalid and class 1, delete the file
    if invalid_data_found and file_classification == 1:
        os.remove(file_path)
        deleted_class_1_files += 1
        print(f"File deleted (class 1, invalid data): {label_file}")

    # Count invalid class 0 and class 1 files
    if file_classification == 0:
        invalid_class_0_files += 1
    elif file_classification == 1:
        invalid_class_1_files += 1

# Print the result: Count of invalid class 0 and class 1 files
print(f"\nNumber of invalid class 0 files: {invalid_class_0_files}")
print(f"Number of invalid class 1 files: {invalid_class_1_files}")
print(f"Number of class 0 files modified (kept 1 valid line): {modified_class_0_files}")
print(f"Number of class 1 files deleted: {deleted_class_1_files}")


File modified (class 0, invalid data): 00061_127.txt
File modified (class 0, invalid data): 00064_101.txt
File modified (class 0, invalid data): 00112_10.txt
File modified (class 0, invalid data): 00142_107.txt
File modified (class 0, invalid data): 00142_126.txt
File modified (class 0, invalid data): 00142_144.txt
File modified (class 0, invalid data): 00147_170.txt
File modified (class 0, invalid data): 125.txt
File modified (class 0, invalid data): 128 (2).txt
File modified (class 0, invalid data): 64 (3).txt
File modified (class 0, invalid data): 71 (3).txt
File modified (class 0, invalid data): 72 (3).txt
File modified (class 0, invalid data): 73 (2).txt
File modified (class 0, invalid data): 74 (3).txt
File modified (class 0, invalid data): 76 (4).txt
File modified (class 0, invalid data): 78 (2).txt
File modified (class 0, invalid data): 84.txt

Number of invalid class 0 files: 419
Number of invalid class 1 files: 432
Number of class 0 files modified (kept 1 valid line): 17
Numb

In [14]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Initialize counters for invalid class 0 and class 1 files
invalid_class_0_files = 0
invalid_class_1_files = 0

# Loop through all files in the 'labels' folder
for label_file in labels_files:
    # Create the full path to each file
    file_path = os.path.join(labels_folder, label_file)

    # Open and read the file
    with open(file_path, 'r') as file:
        label_data = file.readlines()  # Read lines from the file

    # List to hold valid lines and flag to track if the file is invalid
    valid_lines = []
    invalid_data_found = False
    file_classification = None  # Variable to store the classification of the file

    # Loop through each line in the file
    for line in label_data:
        line = line.strip()

        if not line:
            continue

        parts = line.split()

        # Check if the line has exactly 5 values (classification + 4 bounding box values)
        if len(parts) != 5:
            invalid_data_found = True
            continue

        try:
            classification = int(parts[0])  # The first value is the classification (1 or 0)

            if classification not in [0, 1]:  # If classification is not 0 or 1, mark as invalid
                invalid_data_found = True
                continue

            # The next four values should be floats (bounding box coordinates and dimensions)
            bounding_box = [float(parts[i]) for i in range(1, 5)]

            # If the line is valid, add it to the valid lines list
            valid_lines.append(line)

            # Check if all lines in the file have the same classification
            if file_classification is None:
                file_classification = classification
            elif file_classification != classification:
                invalid_data_found = True
                continue

        except ValueError:
            # If there is a ValueError, it means one of the values is not valid
            invalid_data_found = True
            continue

    # If the file contains more than 1 valid line, mark it as invalid
    if len(valid_lines) > 1:
        invalid_data_found = True

    # If the file is invalid, count it under class 0 or class 1
    if invalid_data_found:
        if file_classification == 0:
            invalid_class_0_files += 1
        elif file_classification == 1:
            invalid_class_1_files += 1

        # Print the file name and its labels
        print(f"\nInvalid file: {label_file}")
        for line in label_data:
            print(line.strip())

# Print the result: Count of invalid class 0 and class 1 files
print(f"\nNumber of invalid class 0 files: {invalid_class_0_files}")
print(f"Number of invalid class 1 files: {invalid_class_1_files}")



Number of invalid class 0 files: 0
Number of invalid class 1 files: 0


In [16]:
import os

# Path to your labels folder
labels_folder = '../brain_tumor_classification/data/labels/'

# Get the list of all .txt files in the labels folder
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.txt')]

# Get the count of all .txt files
total_txt_files = len(labels_files)

# Print the total count
print(f"Total number of .txt files: {total_txt_files}")


Total number of .txt files: 851
