In [81]:
import pandas as pd

# Truncate long table displays to 10 rows.
pd.options.display.max_rows = 10

# Load the results from CSV, using the first column as the row index.
# `df` and `data` are two names for the same DataFrame object (no copy is made).
df = data = pd.read_csv("data_5k.csv", index_col=0)

# Display the loaded frame.
data


Unnamed: 0,model,prior,duplicates,distortion,m1,m2,m3,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1054,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1055,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1056,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1057,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [82]:
# Replace missing values with the label "Neither". `fillna` returns a new frame
# that is rebound to `data`, so the object loaded from disk is not mutated and
# re-running this cell gives the same result.
data = data.fillna("Neither")


def _to_numeric_if_possible(column):
    """Return `column` parsed as numbers, or `column` unchanged if parsing fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Same outcome as the deprecated `errors='ignore'`: leave the column as it was.
        return column


# Convert every column that parses cleanly to a numeric dtype; the rest are kept as-is.
data_filled = data.apply(_to_numeric_if_possible)

# Display the converted frame.
data_filled


Unnamed: 0,model,prior,duplicates,distortion,m1,m2,m3,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1054,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1055,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1056,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1057,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [83]:


# Give the metric columns descriptive names: m1 -> precision, m2 -> recall, m3 -> f1score.
data_renamed = data_filled.rename(
    {'m1': 'precision', 'm2': 'recall', 'm3': 'f1score'},
    axis='columns',
)

# Display the frame to confirm the new column names.
data_renamed


Unnamed: 0,model,prior,duplicates,distortion,precision,recall,f1score,date
0,Both,Pitman,8.0,0,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,8.0,1,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,100.0,0,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,0.1,1,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,0.1,0,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1054,No Empiri,BNBD4,0.1,0,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1055,No Empiri,BNBD4,0.1,1,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1056,No Empiri,BNBD4,1.0,0,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1057,No Empiri,BNBD4,100.0,1,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [84]:

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    """Translate a raw `duplicates` value into its level label.

    0.1 -> "Low", 1.0 -> "Medium", 8.0 -> "High", 100.0 -> "Very High";
    any other value yields "Unknown".
    """
    level_by_setting = (
        (0.1, "Low"),
        (1.0, "Medium"),
        (8.0, "High"),
        (100.0, "Very High"),
    )
    # Compare with == in the listed order; the first match wins.
    for setting, level in level_by_setting:
        if duplicates_value == setting:
            return level
    return "Unknown"

# Derive 'duplicates_level' by labelling each value of the 'duplicates' column.
data_renamed['duplicates_level'] = data_renamed['duplicates'].map(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    """Label a `distortion` value: 0 -> "Low", 1 -> "High", anything else -> "Unknown"."""
    if distortion_value == 0:
        return "Low"
    return "High" if distortion_value == 1 else "Unknown"

# Derive 'distortion_level' by labelling each value of the 'distortion' column.
data_renamed['distortion_level'] = data_renamed['distortion'].map(map_distortion_to_level)


# Keep only the columns listed below, in this order.
data_final = data_renamed.loc[
    :,
    [
        'model',
        'prior',
        'duplicates_level',
        'distortion_level',
        'precision',
        'recall',
        'f1score',
        "date",
    ],
]

# Display the resulting frame.
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,Pitman,High,Low,1.000000,0.952612,0.975731,Wed_Mar__6_21_53_11_2024
1,Both,Pitman,High,High,0.998676,0.683384,0.811480,Sat_Mar__9_05_49_38_2024
2,Both,Pitman,Very High,Low,1.000000,0.955298,0.977138,Sat_Mar__9_05_49_23_2024
3,Both,Pitman,Low,High,0.973333,0.698565,0.813370,Wed_Mar__6_21_53_19_2024
4,Both,Pitman,Low,Low,0.975490,0.956731,0.966019,Sat_Mar__9_05_49_09_2024
...,...,...,...,...,...,...,...,...
1054,No Empiri,BNBD4,Low,Low,0.980296,0.956731,0.968370,Fri_Mar_15_21_23_04_2024
1055,No Empiri,BNBD4,Low,High,0.961783,0.722488,0.825137,Wed_Mar_13_00_52_22_2024
1056,No Empiri,BNBD4,Medium,Low,0.997576,0.956764,0.976744,Wed_Mar_13_00_52_12_2024
1057,No Empiri,BNBD4,Very High,High,0.997732,0.821724,0.901215,Thu_Mar_14_11_41_46_2024


In [85]:
# Truncate long table displays to 10 rows.
pd.options.display.max_rows = 10

In [86]:
# Count the rows for each (model, prior, duplicates_level, distortion_level) combination.
data_final[['model', 'prior', 'duplicates_level', 'distortion_level']].value_counts()

model      prior    duplicates_level  distortion_level
Both       Pitman   Medium            Low                 16
                    Low               Low                 15
                                      High                14
No Empiri  BNBD4    High              Low                 13
                    Very High         Low                 13
                                                          ..
Neither    Uniform  Low               Low                  9
                    Very High         High                 9
                    Low               High                 9
Both       Pitman   Very High         High                 8
                    Medium            High                 6
Name: count, Length: 96, dtype: int64

In [87]:
# Key columns: two rows count as duplicates when they agree on all four.
subset_columns = ['model', 'prior', 'duplicates_level', 'distortion_level']

# True for every row whose key combination occurs more than once
# (keep=False marks all occurrences, not just the repeats).
duplicates = data_final.duplicated(subset_columns, keep=False)

# Complementary mask: True for rows whose key combination occurs exactly once.
unique_rows = ~duplicates

# Drop the singletons; only rows that share their key combination with another row remain.
data_final = data_final.loc[duplicates]



In [88]:
# Cap every key combination at its first 50 rows. `GroupBy.head` keeps all columns
# (grouping columns included) and the original index, and avoids the deprecated
# pattern of `GroupBy.apply` operating on the grouping columns.
# Rows stay in their existing order here rather than being regrouped.
data_final = data_final.groupby(subset_columns).head(50)

# Rows and columns remaining after the cap.
data_final.shape

(1059, 8)

In [89]:
# Truncate long table displays to 10 rows.
pd.options.display.max_rows = 10

# Order the rows by the key columns so each combination is contiguous.
data_final = data_final.sort_values(subset_columns)

data_final

Unnamed: 0,model,prior,duplicates_level,distortion_level,precision,recall,f1score,date
712,Both,BNBD4,High,High,0.995028,0.846526,0.914789,Wed_Mar__6_22_03_27_2024
715,Both,BNBD4,High,High,0.992545,0.844713,0.912682,Thu_Mar__7_08_20_15_2024
720,Both,BNBD4,High,High,0.994297,0.842749,0.912272,Mon_Feb_12_10_11_16_2024
726,Both,BNBD4,High,High,0.992545,0.844713,0.912682,Wed_Mar__6_22_03_43_2024
741,Both,BNBD4,High,High,0.993772,0.843656,0.912582,Thu_Mar__7_14_27_15_2024
...,...,...,...,...,...,...,...,...
681,No Empiri,Uniform,Very High,Low,1.000000,0.953029,0.975950,Sat_Mar_16_01_31_07_2024
684,No Empiri,Uniform,Very High,Low,1.000000,0.952896,0.975880,Thu_Mar_14_15_17_13_2024
686,No Empiri,Uniform,Very High,Low,1.000000,0.953429,0.976160,Wed_Mar__6_22_28_17_2024
691,No Empiri,Uniform,Very High,Low,1.000000,0.954097,0.976509,Thu_Mar__7_15_03_53_2024


In [90]:
# Write the cleaned rows (row index included) to a CSV file.
data_final.to_csv(path_or_buf="clean_data_5k.csv")

In [91]:
data_cleaned = data_final

# Shared inputs for every summary below: the grouping keys and the metric columns.
group_columns = ['model', 'prior', 'duplicates_level', 'distortion_level']
metrics = ['precision', 'recall', 'f1score']

# One group per key combination.
grouped_data = data_cleaned.groupby(group_columns)

# Per-group median of each metric; columns are suffixed with "_median".
median_values = (
    grouped_data[metrics]
    .median()
    .reset_index()
    .rename(columns={metric: f'{metric}_median' for metric in metrics})
)

# Per-group standard deviation of each metric; columns are suffixed with "_std".
std_dev_values = (
    grouped_data[metrics]
    .std()
    .reset_index()
    .rename(columns={metric: f'{metric}_std' for metric in metrics})
)

# Per-group 2.5% and 97.5% quantiles of each metric.
bounds_per_metric = []
for metric in metrics:
    # Rows are key combinations; unstack turns the two quantile levels into two columns.
    bounds = grouped_data[metric].quantile([0.025, 0.975]).unstack(level=-1)

    # Name the columns after the metric and the bound they hold.
    bounds.columns = [f'{metric}_lower_bound', f'{metric}_upper_bound']
    bounds_per_metric.append(bounds)

# Every piece is indexed by the same key combinations, so a single column-wise
# concat lines them up; reset_index turns the keys back into ordinary columns.
bounds_df = pd.concat(bounds_per_metric, axis=1).reset_index()

# Lower and upper bounds for every metric, one row per key combination.
bounds_df



Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD4,High,High,0.992187,0.995024,0.842791,0.846745,0.911613,0.914510
1,Both,BNBD4,High,Low,0.999841,1.000000,0.950053,0.951552,0.974387,0.975099
2,Both,BNBD4,Low,High,0.920567,0.922222,0.776316,0.794258,0.842309,0.853470
3,Both,BNBD4,Low,Low,0.971870,0.985054,0.947115,0.951923,0.961028,0.967566
4,Both,BNBD4,Medium,High,0.960620,0.968584,0.817091,0.823982,0.884090,0.888989
...,...,...,...,...,...,...,...,...,...,...
91,No Empiri,Uniform,Low,Low,0.192508,0.196146,0.953245,0.956731,0.320522,0.325549
92,No Empiri,Uniform,Medium,High,0.942026,0.948844,0.823543,0.831984,0.880183,0.885959
93,No Empiri,Uniform,Medium,Low,0.993034,0.995658,0.956892,0.959891,0.974977,0.977321
94,No Empiri,Uniform,Very High,High,0.999835,1.000000,0.805945,0.810015,0.892546,0.895037


In [92]:
# Join the per-group medians with the per-group quantile bounds on the grouping keys.
combined_data = median_values.merge(
    bounds_df,
    on=['model', 'prior', 'duplicates_level', 'distortion_level'],
)

# Relabel any cell holding the literal string "None" as "Neither".
combined_data = combined_data.replace({"None": "Neither"})


# Display the combined frame.
combined_data

Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_median,recall_median,f1score_median,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD4,High,High,0.993327,0.844562,0.912848,0.992187,0.995024,0.842791,0.846745,0.911613,0.914510
1,Both,BNBD4,High,Low,1.000000,0.951022,0.974896,0.999841,1.000000,0.950053,0.951552,0.974387,0.975099
2,Both,BNBD4,Low,High,0.921348,0.784689,0.847545,0.920567,0.922222,0.776316,0.794258,0.842309,0.853470
3,Both,BNBD4,Low,Low,0.975370,0.951923,0.963504,0.971870,0.985054,0.947115,0.951923,0.961028,0.967566
4,Both,BNBD4,Medium,High,0.963780,0.819611,0.886996,0.960620,0.968584,0.817091,0.823982,0.884090,0.888989
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,No Empiri,Uniform,Low,Low,0.194132,0.956731,0.322633,0.192508,0.196146,0.953245,0.956731,0.320522,0.325549
92,No Empiri,Uniform,Medium,High,0.945300,0.827475,0.882455,0.942026,0.948844,0.823543,0.831984,0.880183,0.885959
93,No Empiri,Uniform,Medium,Low,0.994932,0.957927,0.976199,0.993034,0.995658,0.956892,0.959891,0.974977,0.977321
94,No Empiri,Uniform,Very High,High,1.000000,0.808247,0.893890,0.999835,1.000000,0.805945,0.810015,0.892546,0.895037


In [93]:
# Write the combined summary (row index included) to a CSV file.
combined_data.to_csv(path_or_buf="plot_data_5k.csv")