In [237]:
import pandas as pd

# Truncate long DataFrame reprs to 10 rows.
pd.options.display.max_rows = 10

# Read the raw results from data_5k.csv; the file's first column becomes the index.
# `df` and `data` are two names for the same DataFrame object (no copy is made).
data = df = pd.read_csv("data_5k.csv", index_col=0)

# Display the loaded frame.
data


Unnamed: 0,model,prior,duplicates,distortion,m1,m2,m3,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1067,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1068,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1069,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1070,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [238]:
def _to_numeric_if_possible(column):
    """Return `column` parsed as numbers, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Replace every missing value with the label "Neither". The filled frame is
# re-bound to `data` rather than filled in place, so the object originally
# loaded from the CSV is not mutated and re-running this cell is harmless.
# NOTE(review): this fills NaN in *every* column, numeric ones included; a
# numeric column containing NaN would end up holding the string "Neither".
# Confirm that only label columns can have missing values.
data = data.fillna("Neither")

# Convert each column to a numeric dtype when all of its values parse as
# numbers; columns that do not parse are kept unchanged. This is the explicit
# equivalent of `pd.to_numeric(..., errors='ignore')`, which pandas deprecates.
data_filled = data.apply(_to_numeric_if_possible)

# Display the result.
data_filled


Unnamed: 0,model,prior,duplicates,distortion,m1,m2,m3,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1067,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1068,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1069,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1070,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [239]:


# Give the metric columns descriptive names: m1 -> precision, m2 -> recall,
# m3 -> f1score. The result is a new frame bound to `data_renamed`.
data_renamed = data_filled.rename(
    columns=dict(m1='precision', m2='recall', m3='f1score')
)

# Display the renamed frame.
data_renamed


Unnamed: 0,model,prior,duplicates,distortion,precision,recall,f1score,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1067,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1068,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1069,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1070,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [240]:

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    """Translate a raw 'duplicates' value into its level label.

    Matching is by exact equality: 0.1 -> "Low", 1.0 -> "Medium",
    8.0 -> "High", 100.0 -> "Very High". Any other value yields "Unknown".
    """
    level_by_value = (
        (0.1, "Low"),
        (1.0, "Medium"),
        (8.0, "High"),
        (100.0, "Very High"),
    )
    for known_value, level in level_by_value:
        if duplicates_value == known_value:
            return level
    return "Unknown"  # no exact match among the known values

# Label every 'duplicates' value and store the labels in a new 'duplicates_level' column
data_renamed['duplicates_level'] = data_renamed['duplicates'].map(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    """Translate a raw 'distortion' flag into its level label.

    0 -> "Low" and 1 -> "High" (exact equality); any other value -> "Unknown".
    """
    if distortion_value == 0:
        return "Low"
    # Not 0: it is either the "High" flag or something unrecognised
    return "High" if distortion_value == 1 else "Unknown"

# Label every 'distortion' value and store the labels in a new 'distortion_level' column
data_renamed['distortion_level'] = data_renamed['distortion'].map(map_distortion_to_level)


# Keep only the columns used from here on, in this order
data_final = data_renamed.loc[
    :,
    ['model', 'prior', 'duplicates_level', 'distortion_level',
     'precision', 'recall', 'f1score', "date"]
]

# Display the trimmed frame
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,Pitman,High,Low,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,High,High,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,Very High,Low,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,Low,High,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,Low,Low,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1067,No Empiri,BNBD4,Low,Low,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1068,No Empiri,BNBD4,Low,High,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1069,No Empiri,BNBD4,Medium,Low,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1070,No Empiri,BNBD4,Very High,High,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [241]:
# Truncate long DataFrame/Series reprs to 10 rows
pd.options.display.max_rows = 10

In [242]:
# Number of rows for each (model, prior, duplicates_level, distortion_level) combination
data_final[['model', "prior", 'duplicates_level', 'distortion_level']].value_counts()

model      prior    duplicates_level  distortion_level
Both       Pitman   Medium            Low                 16
                    Low               Low                 15
                                      High                14
No Empiri  Uniform  Low               Low                 13
           BNBD4    Very High         Low                 13
                                                          ..
Neither    Uniform  Very High         High                 9
                    Low               High                 9
                                      Low                  9
                    Medium            High                 9
           BNBD4    Low               High                 9
Name: count, Length: 96, dtype: int64

In [243]:
# The four columns that together identify one group
columns_to_consider = ['model', "prior", 'duplicates_level', 'distortion_level']

# Within each group keep at most the first 10 rows
data_final = data_final.groupby(columns_to_consider).head(10)

# Row count per group after truncation
data_final.value_counts(subset=columns_to_consider)

model    prior    duplicates_level  distortion_level
Both     BNBD4    High              High                10
                                    Low                 10
No Diri  Uniform  Very High         High                10
                  Medium            Low                 10
                                    High                10
                                                        ..
Neither  Uniform  Low               High                 9
                                    Low                  9
                  Medium            High                 9
         BNBD4    Low               High                 9
         Uniform  Very High         High                 9
Name: count, Length: 96, dtype: int64

In [244]:
import pandas as pd
import numpy as np

def append_missing_rows(group, target_size=10):
    """Pad `group` with synthetic rows until it has `target_size` rows.

    Every padding row is identical: numeric columns hold the group's
    column-wise median, and each remaining column repeats the value found in
    the group's first row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group.
    target_size : int, default 10
        Number of rows the returned frame should have at minimum.

    Returns
    -------
    pandas.DataFrame
        `group` itself when it already has at least `target_size` rows, or
        when it has no rows (there is nothing to derive a padding row from).
        Otherwise a new frame: the original rows followed by the padding
        rows. Padding rows carry their own 0..k-1 index, so index labels may
        repeat; reset the index after combining groups.
    """
    missing_rows_count = target_size - len(group)
    if missing_rows_count <= 0 or len(group) == 0:
        return group

    # Medians exist only for the numeric columns
    medians = group.median(numeric_only=True)

    # Build the padding row once: median where available, otherwise the
    # first row's value for that column
    padding_row = {
        col: medians[col] if col in medians.index else group[col].iloc[0]
        for col in group.columns
    }

    # Create all padding rows in one frame and concatenate once, instead of
    # growing the group one row per loop iteration
    padding = pd.DataFrame([padding_row] * missing_rows_count, columns=group.columns)
    return pd.concat([group, padding])

# Run every group through append_missing_rows, then stitch the results back
# together under a fresh 0..n-1 index
data_final = pd.concat(
    [
        append_missing_rows(group_rows)
        for _key, group_rows in data_final.groupby(columns_to_consider)
    ],
    ignore_index=True,
)

# Row count per group after padding
data_final.value_counts(subset=columns_to_consider)


       model  prior duplicates_level distortion_level  precision    recall  \
189  Neither  BNBD4              Low             High   0.979866  0.698565   
198  Neither  BNBD4              Low             High   0.986487  0.698565   
199  Neither  BNBD4              Low             High   0.986487  0.698565   
206  Neither  BNBD4              Low             High   0.979866  0.698565   
219  Neither  BNBD4              Low             High   0.986487  0.698565   
232  Neither  BNBD4              Low             High   1.000000  0.750000   
235  Neither  BNBD4              Low             High   0.979866  0.698565   
247  Neither  BNBD4              Low             High   0.986395  0.693780   
260  Neither  BNBD4              Low             High   0.980000  0.703349   

      f1score                      date  
189  0.815643  Wed_Mar__6_21_55_50_2024  
198  0.817927  Sat_Mar__9_07_15_11_2024  
199  0.817927  Tue_Feb_20_10_07_50_2024  
206  0.815643  Wed_Mar__6_21_56_01_2024  
219  0.81

model      prior    duplicates_level  distortion_level
Both       BNBD4    High              High                10
                                      Low                 10
No Diri    Uniform  Very High         High                10
                    Medium            Low                 10
                                      High                10
                                                          ..
Neither    BNBD4    Medium            Low                 10
                                      High                10
                    Low               Low                 10
                                      High                10
No Empiri  Uniform  Very High         Low                 10
Name: count, Length: 96, dtype: int64

In [245]:
# Truncate long DataFrame/Series reprs to 10 rows
pd.options.display.max_rows = 10

In [246]:
# Columns whose combined values define a group
subset_columns = ['model', 'prior', 'duplicates_level', 'distortion_level']

# True for every row whose group key occurs more than once
# (keep=False flags all occurrences, not just the repeats)
duplicates = data_final.duplicated(subset=subset_columns, keep=False)

# Complementary mask: True for rows whose group key occurs exactly once
unique_rows = ~duplicates

# Drop the singleton rows, i.e. keep exactly the rows flagged in `duplicates`
data_final = data_final.loc[duplicates]



In [247]:
# Lift the row limit on reprs.
# NOTE(review): nothing in this cell is displayed at full length and the
# following cell sets the limit back to 10; kept so the display state after
# this cell is unchanged.
pd.set_option("display.max_rows", None)

# Keep at most the first 50 rows of every group. GroupBy.head is used instead
# of apply(lambda x: x.head(50)): it needs no Python-level callback and avoids
# the deprecated behaviour of apply operating on the grouping columns.
# Rows keep their current order; the next cell sorts by these columns.
data_final = data_final.groupby(subset_columns).head(50)

# (rows, columns) after capping
data_final.shape

(960, 8)

In [248]:
# Back to truncated reprs (10 rows)
pd.options.display.max_rows = 10

# Order the rows by the grouping columns, left to right
data_final = data_final.sort_values(subset_columns)

data_final

Unnamed: 0,model,prior,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,BNBD4,High,High,0.995028,0.846526,0.914789,Wed_Mar__6_22_03_27_2024
1,Both,BNBD4,High,High,0.992545,0.844713,0.912682,Thu_Mar__7_08_20_15_2024
2,Both,BNBD4,High,High,0.994297,0.842749,0.912272,Mon_Feb_12_10_11_16_2024
3,Both,BNBD4,High,High,0.992545,0.844713,0.912682,Wed_Mar__6_22_03_43_2024
4,Both,BNBD4,High,High,0.993772,0.843656,0.912582,Thu_Mar__7_14_27_15_2024
...,...,...,...,...,...,...,...,...
955,No Empiri,Uniform,Very High,Low,1.000000,0.952095,0.975460,Thu_Mar__7_15_03_46_2024
956,No Empiri,Uniform,Very High,Low,1.000000,0.953029,0.975950,Wed_Mar__6_22_27_51_2024
957,No Empiri,Uniform,Very High,Low,1.000000,0.953029,0.975950,Sat_Mar_16_01_31_07_2024
958,No Empiri,Uniform,Very High,Low,1.000000,0.952896,0.975880,Thu_Mar_14_15_17_13_2024


In [249]:
# Write data_final to clean_data_5k.csv in the working directory
data_final.to_csv(path_or_buf="clean_data_5k.csv")

In [250]:
data_cleaned = data_final

# The three score columns that are summarised per group
metrics = ['precision', 'recall', 'f1score']

# One group per (model, prior, duplicates_level, distortion_level) combination
grouped_data = data_cleaned.groupby(['model', 'prior', 'duplicates_level', 'distortion_level'])

# Per-group median of each metric. add_suffix runs before reset_index so only
# the metric columns are renamed (precision -> precision_median, ...).
median_values = grouped_data[metrics].median().add_suffix('_median').reset_index()

# Per-group standard deviation of each metric (precision -> precision_std, ...)
std_dev_values = grouped_data[metrics].std().add_suffix('_std').reset_index()

# Per-group 2.5% and 97.5% quantiles of each metric
bound_frames = []
for metric in metrics:
    # quantile() returns the two quantiles as the innermost index level;
    # unstack moves them into two columns
    bounds = grouped_data[metric].quantile([0.025, 0.975]).unstack(level=-1)

    # Name the columns after the metric and the bound they hold
    bounds.columns = [f'{metric}_lower_bound', f'{metric}_upper_bound']
    bound_frames.append(bounds)

# Every frame is indexed by the same group keys, so a single column-wise
# concat lines them up; reset_index turns the keys back into columns
bounds_df = pd.concat(bound_frames, axis=1).reset_index()

# Lower and upper bounds for every metric, one row per group
bounds_df



Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD4,High,High,0.992223,0.994906,0.842783,0.846322,0.911603,0.914512
1,Both,BNBD4,High,Low,0.999841,1.000000,0.950023,0.951552,0.974371,0.975099
2,Both,BNBD4,Low,High,0.920556,0.922222,0.776196,0.794258,0.842234,0.853470
3,Both,BNBD4,Low,Low,0.971637,0.985058,0.947115,0.951923,0.961018,0.967684
4,Both,BNBD4,Medium,High,0.960531,0.968642,0.817044,0.824029,0.884073,0.889011
...,...,...,...,...,...,...,...,...,...,...
91,No Empiri,Uniform,Low,Low,0.192498,0.195823,0.956731,0.956731,0.320509,0.325104
92,No Empiri,Uniform,Medium,High,0.941981,0.948851,0.823520,0.831788,0.880174,0.885234
93,No Empiri,Uniform,Medium,Low,0.994321,0.995658,0.956869,0.959914,0.975641,0.977356
94,No Empiri,Uniform,Very High,High,0.999835,1.000000,0.805935,0.810025,0.892540,0.895043


In [251]:
# Join the per-group medians with the per-group quantile bounds on the four
# grouping columns, then relabel any literal "None" value as "Neither"
combined_data = (
    median_values
    .merge(bounds_df, on=['model', 'prior', 'duplicates_level', 'distortion_level'])
    .replace({"None": "Neither"})
)

# Display the combined dataframe
combined_data

Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_median,recall_median,f1score_median,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD4,High,High,0.993327,0.844562,0.912682,0.992223,0.994906,0.842783,0.846322,0.911603,0.914512
1,Both,BNBD4,High,Low,1.000000,0.951022,0.974896,0.999841,1.000000,0.950023,0.951552,0.974371,0.975099
2,Both,BNBD4,Low,High,0.921348,0.784689,0.847545,0.920556,0.922222,0.776196,0.794258,0.842234,0.853470
3,Both,BNBD4,Low,Low,0.975370,0.951923,0.963504,0.971637,0.985058,0.947115,0.951923,0.961018,0.967684
4,Both,BNBD4,Medium,High,0.964633,0.820768,0.887002,0.960531,0.968642,0.817044,0.824029,0.884073,0.889011
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,No Empiri,Uniform,Low,Low,0.193957,0.956731,0.322529,0.192498,0.195823,0.956731,0.956731,0.320509,0.325104
92,No Empiri,Uniform,Medium,High,0.945078,0.827012,0.881939,0.941981,0.948851,0.823520,0.831788,0.880174,0.885234
93,No Empiri,Uniform,Medium,Low,0.995166,0.958159,0.976309,0.994321,0.995658,0.956869,0.959914,0.975641,0.977356
94,No Empiri,Uniform,Very High,High,0.999918,0.808513,0.894053,0.999835,1.000000,0.805935,0.810025,0.892540,0.895043


In [252]:
# Write combined_data to plot_data_5k.csv in the working directory
combined_data.to_csv(path_or_buf="plot_data_5k.csv")