### Box Plot Strategy

In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def process_csvs_with_iqr(root_dir, n_splits=5, whisker=1.5):
    """Remove high-valued outliers from the folder 0/1 rows of each split CSV.

    For every ``data_split{i}.csv`` (``i`` in ``range(n_splits)``) in
    ``root_dir`` the header-less CSV is loaded and the rows whose column 2
    equals 0 or 1 are selected. The upper Tukey fence ``Q3 + whisker * IQR``
    is computed from column 5 of those rows (the code treats this column as a
    standard deviation), and every selected row above the fence is dropped.
    Rows with any other value in column 2 are always kept.

    Only the upper fence is applied: small values are never flagged.

    Side effects per file:
      * a summary (folder 0 / folder 1 / combined) is printed,
      * the filtered frame is written, without header or index, to
        ``<root_dir>_outlier_free/<filename>``,
      * ``plot_std_boxplot`` is called with the unfiltered column-5 values of
        the selected rows.

    Args:
        root_dir: Directory containing the split CSVs; trailing ``/`` or ``\\``
            characters are stripped before the output suffix is appended.
        n_splits: Number of ``data_split{i}.csv`` files to process.
        whisker: IQR multiplier that defines the upper fence.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when an expected
            split file does not exist.
    """
    new_root = root_dir.rstrip('/\\') + '_outlier_free'
    os.makedirs(new_root, exist_ok=True)

    for i in range(n_splits):
        filename = f"data_split{i}.csv"
        df = pd.read_csv(os.path.join(root_dir, filename), header=None)

        # Column 2 selects the rows of interest; the two groups are kept
        # separately because the summary reports them individually.
        folder0 = df[df[2] == 0]
        folder1 = df[df[2] == 1]
        desired_rows = pd.concat([folder0, folder1])
        has_desired_rows = not desired_rows.empty

        if has_desired_rows:
            std_vals = desired_rows[5]

            # Upper Tukey fence over folder 0 and folder 1 combined.
            q1 = std_vals.quantile(0.25)
            q3 = std_vals.quantile(0.75)
            upper_bound = q3 + whisker * (q3 - q1)

            outliers = desired_rows[std_vals > upper_bound]

            # Outlier counts
            out0 = len(outliers[outliers[2] == 0])
            out1 = len(outliers[outliers[2] == 1])
            out_total = len(outliers)

            # Total desired rows
            total0 = len(folder0)
            total1 = len(folder1)
            total = total0 + total1

            # Percentages (guarded so an empty group reports 0 instead of dividing by zero)
            pct0 = (out0 / total0 * 100) if total0 else 0
            pct1 = (out1 / total1 * 100) if total1 else 0
            pct_total = (out_total / total * 100) if total else 0

            print(f"File: {filename}")
            print(f"Outliers: {out0} / {out1} / {out_total}")
            print(f"Total desired rows: {total0} / {total1} / {total}")
            print(f"Percentage of outliers: {pct0:.2f}% / {pct1:.2f}% / {pct_total:.2f}%\n")

            # Drop by index label so rows outside folder 0/1 are untouched.
            df_filtered = df.drop(outliers.index)
        else:
            print(f"File: {filename} has no rows with folder 0 or 1.\n")
            df_filtered = df

        # Save filtered CSV
        df_filtered.to_csv(os.path.join(new_root, filename), header=False, index=False)

        # The plot shows the distribution before filtering, so the removed
        # points remain visible as fliers.
        if has_desired_rows:
            plot_std_boxplot(desired_rows[5], filename)

def plot_std_boxplot(std_series, title_suffix):
    """Render a horizontal box plot of ``std_series`` and save it as a PNG.

    The image is written to the current working directory as
    ``std_boxplot_<title_suffix without '.csv'>.png``; the figure is closed
    afterwards, so nothing is displayed inline.

    Args:
        std_series: Values to plot (passed straight to ``boxplot``).
        title_suffix: Text appended to the plot title and used to build the
            output file name.
    """
    box_style = dict(facecolor='lightblue', color='blue')
    median_style = dict(color='red')

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.boxplot(
        std_series,
        vert=False,
        patch_artist=True,
        boxprops=box_style,
        medianprops=median_style,
    )
    ax.set_title(f"Box Plot of Std - {title_suffix}")
    ax.set_xlabel("Standard Deviation")
    fig.tight_layout()
    fig.savefig(f"std_boxplot_{title_suffix.replace('.csv', '')}.png")
    # Close explicitly: this is called once per split and figures would pile up otherwise.
    plt.close(fig)

# Example usage: reads data_split0.csv .. data_split4.csv from the directory below,
# writes the filtered copies to a sibling directory with the "_outlier_free" suffix,
# and saves one std_boxplot_data_split<i>.png per split in the working directory.
process_csvs_with_iqr("../facebase/data/Adience_256x256_resnet50_imagenet_noisy_dldl_v2_clean_corrected")


File: data_split0.csv
Outliers: 74 / 19 / 93
Total desired rows: 10954 / 3069 / 14023
Percentage of outliers: 0.68% / 0.62% / 0.66%

File: data_split1.csv
Outliers: 70 / 17 / 87
Total desired rows: 11333 / 3300 / 14633
Percentage of outliers: 0.62% / 0.52% / 0.59%

File: data_split2.csv
Outliers: 81 / 21 / 102
Total desired rows: 11128 / 3274 / 14402
Percentage of outliers: 0.73% / 0.64% / 0.71%

File: data_split3.csv
Outliers: 53 / 26 / 79
Total desired rows: 10048 / 4380 / 14428
Percentage of outliers: 0.53% / 0.59% / 0.55%

File: data_split4.csv
Outliers: 72 / 43 / 115
Total desired rows: 9643 / 3679 / 13322
Percentage of outliers: 0.75% / 1.17% / 0.86%



### Constant Standard Deviation Strategy

In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def process_csvs_with_iqr(root_dir, upper_bound=1.0, output_suffix='_outlier_free_c100', n_splits=5):
    """Remove folder 0/1 rows whose column-5 value exceeds a constant threshold.

    NOTE: despite the name, no IQR is computed here. The name is kept so that
    existing calls keep working, but this definition replaces the IQR-based
    function of the same name defined in the earlier "Box Plot Strategy" cell
    once this cell has been run.

    For every ``data_split{i}.csv`` (``i`` in ``range(n_splits)``) in
    ``root_dir`` the header-less CSV is loaded and the rows whose column 2
    equals 0 or 1 are selected. Selected rows whose column 5 (treated by the
    code as a standard deviation) is strictly greater than ``upper_bound`` are
    dropped; a value equal to the threshold is kept, as are all rows with any
    other value in column 2.

    Side effects per file:
      * a summary (folder 0 / folder 1 / combined) is printed,
      * the filtered frame is written, without header or index, to
        ``<root_dir><output_suffix>/<filename>``.

    Args:
        root_dir: Directory containing the split CSVs; trailing ``/`` or ``\\``
            characters are stripped before ``output_suffix`` is appended.
        upper_bound: Constant cut-off applied to column 5.
        output_suffix: Suffix appended to ``root_dir`` to form the output
            directory. Pass a different suffix when using a non-default
            ``upper_bound`` so results from different thresholds do not
            overwrite each other.
        n_splits: Number of ``data_split{i}.csv`` files to process.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when an expected
            split file does not exist.
    """
    new_root = root_dir.rstrip('/\\') + output_suffix
    os.makedirs(new_root, exist_ok=True)

    for i in range(n_splits):
        filename = f"data_split{i}.csv"
        df = pd.read_csv(os.path.join(root_dir, filename), header=None)

        # Column 2 selects the rows of interest; the two groups are kept
        # separately because the summary reports them individually.
        folder0 = df[df[2] == 0]
        folder1 = df[df[2] == 1]
        desired_rows = pd.concat([folder0, folder1])

        if not desired_rows.empty:
            std_vals = desired_rows[5]

            # Strict comparison: rows exactly at the threshold are retained.
            outliers = desired_rows[std_vals > upper_bound]

            # Outlier counts
            out0 = len(outliers[outliers[2] == 0])
            out1 = len(outliers[outliers[2] == 1])
            out_total = len(outliers)

            # Total desired rows
            total0 = len(folder0)
            total1 = len(folder1)
            total = total0 + total1

            # Percentages (guarded so an empty group reports 0 instead of dividing by zero)
            pct0 = (out0 / total0 * 100) if total0 else 0
            pct1 = (out1 / total1 * 100) if total1 else 0
            pct_total = (out_total / total * 100) if total else 0

            print(f"File: {filename}")
            print(f"Outliers: {out0} / {out1} / {out_total}")
            print(f"Total desired rows: {total0} / {total1} / {total}")
            print(f"Percentage of outliers: {pct0:.2f}% / {pct1:.2f}% / {pct_total:.2f}%\n")

            # Drop by index label so rows outside folder 0/1 are untouched.
            df_filtered = df.drop(outliers.index)
        else:
            print(f"File: {filename} has no rows with folder 0 or 1.\n")
            df_filtered = df

        # Save filtered CSV
        df_filtered.to_csv(os.path.join(new_root, filename), header=False, index=False)

        # Box plot
        # if not desired_rows.empty:
        #     plot_std_boxplot(desired_rows[5], filename)

# def plot_std_boxplot(std_series, title_suffix):
#     plt.figure(figsize=(6, 4))
#     plt.boxplot(std_series, vert=False, patch_artist=True,
#                 boxprops=dict(facecolor='lightblue', color='blue'),
#                 medianprops=dict(color='red'))
#     plt.title(f"Box Plot of Std - {title_suffix}")
#     plt.xlabel("Standard Deviation")
#     plt.tight_layout()
#     plt.savefig(f"std_boxplot_{title_suffix.replace('.csv', '')}.png")
#     plt.close()

# Example usage: process_csvs_with_iqr here resolves to the constant-threshold
# definition in this cell (it redefines the name from the box-plot cell).
# One dataset directory is processed per run; the commented-out pairs below are
# other dataset directories (presumably run separately; their output is not
# captured in this notebook).
# print('Adience_256x256_resnet50_imagenet_noisy_dldl_v2_clean_corrected----------------------------------')
# process_csvs_with_iqr("../facebase/data/Adience_256x256_resnet50_imagenet_noisy_dldl_v2_clean_corrected")
# print('Adience_256x256_resnet50_imagenet_noisy_dldl_v2_nr2_corrected------------------------------------')
# process_csvs_with_iqr("../facebase/data/Adience_256x256_resnet50_imagenet_noisy_dldl_v2_nr2_corrected")
print('Adience_256x256_resnet50_imagenet_noisy_dldl_v2_corrected----------------------------------------')
process_csvs_with_iqr("../facebase/data/Adience_256x256_resnet50_imagenet_noisy_dldl_v2_corrected")


Adience_256x256_resnet50_imagenet_noisy_dldl_v2_corrected----------------------------------------
File: data_split0.csv
Outliers: 292 / 50 / 342
Total desired rows: 10954 / 3069 / 14023
Percentage of outliers: 2.67% / 1.63% / 2.44%

File: data_split1.csv
Outliers: 269 / 108 / 377
Total desired rows: 11333 / 3300 / 14633
Percentage of outliers: 2.37% / 3.27% / 2.58%

File: data_split2.csv
Outliers: 252 / 67 / 319
Total desired rows: 11128 / 3274 / 14402
Percentage of outliers: 2.26% / 2.05% / 2.21%

File: data_split3.csv
Outliers: 243 / 117 / 360
Total desired rows: 10048 / 4380 / 14428
Percentage of outliers: 2.42% / 2.67% / 2.50%

File: data_split4.csv
Outliers: 225 / 85 / 310
Total desired rows: 9643 / 3679 / 13322
Percentage of outliers: 2.33% / 2.31% / 2.33%

