In [185]:
import pandas as pd

# Show at most 10 rows whenever a DataFrame is rendered
pd.options.display.max_rows = 10

# Read both raw result files, using the first CSV column as the row index.
# NOTE(review): df2 (data20.csv) is not referenced again in the visible cells.
df, df2 = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("data50.csv", "data20.csv")
)

# `data` is a second name for the same object as `df` (no copy is made)
data = df


In [186]:
def _to_numeric_if_possible(column):
    """Return ``column`` parsed by ``pd.to_numeric``, or unchanged when parsing fails.

    Stands in for ``pd.to_numeric(column, errors='ignore')``, which is deprecated
    in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Columns holding non-numeric text are deliberately left as they are
        return column


# Replace NaN with 0 in every column, then give each column a numeric dtype
# wherever all of its values parse as numbers.
data_filled = data.fillna(0).apply(_to_numeric_if_possible)

# Only the last expression of a cell is rendered, so the frame is shown once here
data_filled


Unnamed: 0,round,replicates,duplicates,distortion,m1,m2,m3,date
0,-1,_21,100.0,1,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024
1,-1,_21,8.0,1,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024
2,-1,_21,1.0,0,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024
3,-1,_21,0.1,0,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024
4,-1,_21,100.0,0,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024
...,...,...,...,...,...,...,...,...
2911,-12,_50,100.0,1,0.997545,0.817572,0.898636,Thu_Feb_29_00_22_02_2024
2912,-12,_50,0.1,1,1.000000,0.783333,0.878505,Thu_Feb_29_00_21_57_2024
2913,-12,_50,1.0,1,0.987113,0.786448,0.875429,Thu_Feb_29_15_18_15_2024
2914,-12,_50,0.1,0,0.967742,0.952381,0.960000,Thu_Feb_29_00_21_41_2024


In [187]:
# Remove every "_" from the 'replicates' labels and parse them as numbers;
# labels that still cannot be parsed become 0.
# Note: this writes back into data_filled (the frame is modified in place).
data_filled['replicates'] = pd.to_numeric(
    data_filled['replicates'].astype(str).str.replace("_", ""),
    errors='coerce',
).fillna(0)

# Store round numbers as positive values
data_filled['round'] = data_filled['round'].abs()

# Give the metric columns descriptive names: m1 -> precision, m2 -> recall, m3 -> f1score
data_renamed = data_filled.rename(
    columns=dict(m1='precision', m2='recall', m3='f1score')
)

# Render the renamed frame
data_renamed


Unnamed: 0,round,replicates,duplicates,distortion,precision,recall,f1score,date
0,1,21,100.0,1,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024
1,1,21,8.0,1,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024
2,1,21,1.0,0,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024
3,1,21,0.1,0,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024
4,1,21,100.0,0,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024
...,...,...,...,...,...,...,...,...
2911,12,50,100.0,1,0.997545,0.817572,0.898636,Thu_Feb_29_00_22_02_2024
2912,12,50,0.1,1,1.000000,0.783333,0.878505,Thu_Feb_29_00_21_57_2024
2913,12,50,1.0,1,0.987113,0.786448,0.875429,Thu_Feb_29_15_18_15_2024
2914,12,50,0.1,0,0.967742,0.952381,0.960000,Thu_Feb_29_00_21_41_2024


In [188]:
# Define a function to map 'round' values to model names
def map_round_to_model(round_number):
    """Translate a round number into the label of its model variant.

    Rounds 1-3 map to "Both", 4-6 to "No Diri", 7-9 to "None" and 10-12 to
    "No Empirical". Any other value, including one that falls between two
    bands, maps to "Unknown".

    NOTE(review): "None" is returned as a plain string. Recent pandas versions
    treat the text ``None`` as a missing value in ``read_csv`` by default, so
    confirm this label survives any CSV round trip downstream.
    """
    bands = (
        (1, 3, "Both"),
        (4, 6, "No Diri"),
        (7, 9, "None"),
        (10, 12, "No Empirical"),
    )
    for lowest, highest, label in bands:
        if lowest <= round_number <= highest:
            return label
    # Outside every band
    return "Unknown"

# Derive the 'model' label of every row element-wise from its 'round' value
data_renamed['model'] = data_renamed['round'].map(map_round_to_model)



# Define a function to map 'round' values to prior categories
def map_round_to_prior(round_number):
    """Translate a round number into the name of the prior used in that round.

    Rounds 1, 4, 7, 10 map to "Pitman"; 2, 5, 8, 11 to "uniform";
    3, 6, 9, 12 to "Bounded NBD". Every other value maps to "Unknown".
    """
    rounds_by_prior = (
        ("Pitman", (1, 4, 7, 10)),
        ("uniform", (2, 5, 8, 11)),
        ("Bounded NBD", (3, 6, 9, 12)),
    )
    for prior_name, rounds in rounds_by_prior:
        if round_number in rounds:
            return prior_name
    # Not one of the twelve listed rounds
    return "Unknown"

# Derive the 'prior' label of every row element-wise from its 'round' value
data_renamed['prior'] = data_renamed['round'].map(map_round_to_prior)

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    """Translate a 'duplicates' setting into a named level.

    0.1 -> "Low", 1.0 -> "Medium", 8.0 -> "High", 100.0 -> "Very High".
    The comparison is exact equality, so any other value maps to "Unknown".
    """
    known_levels = (
        (0.1, "Low"),
        (1.0, "Medium"),
        (8.0, "High"),
        (100.0, "Very High"),
    )
    # First exact match wins; fall back to "Unknown" when nothing matches
    return next(
        (level for known_value, level in known_levels if duplicates_value == known_value),
        "Unknown",
    )

# Derive 'duplicates_level' element-wise from the numeric 'duplicates' setting
data_renamed['duplicates_level'] = data_renamed['duplicates'].map(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    """Translate a 'distortion' flag into a named level.

    0 -> "Low", 1 -> "High"; every other value maps to "Unknown".
    """
    if distortion_value == 0:
        return "Low"
    # Anything that is neither 0 nor 1 is reported as "Unknown"
    return "High" if distortion_value == 1 else "Unknown"

# Derive 'distortion_level' element-wise from the 0/1 'distortion' flag
data_renamed['distortion_level'] = data_renamed['distortion'].map(map_distortion_to_level)

def map_iter_to_num(iter):
    """Translate an iteration-count label into its integer value.

    '10k' -> 10000, '50k' -> 50000; every other input yields 0.

    The parameter name shadows the built-in ``iter``; it is kept so that
    keyword callers keep working. No call to this function appears in the
    surrounding cells.
    """
    for label, count in (('10k', 10000), ('50k', 50000)):
        if iter == label:
            return count
    # Any label other than '10k' / '50k'
    return 0


# Label columns, replicate number and timestamp only (no metric columns)
data_selected_columns = data_renamed.loc[
    :,
    ['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates', "date"],
]

# Same columns plus the three metrics, in the order used from here on
data_final = data_renamed.loc[
    :,
    ['model', 'prior', 'duplicates_level', 'distortion_level',
     'replicates', 'precision', 'recall', 'f1score', "date"],
]

# Render the result
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score,date
0,Both,Pitman,Very High,High,21,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024
1,Both,Pitman,High,High,21,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024
2,Both,Pitman,Medium,Low,21,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024
3,Both,Pitman,Low,Low,21,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024
4,Both,Pitman,Very High,Low,21,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024
...,...,...,...,...,...,...,...,...,...
2911,No Empirical,Bounded NBD,Very High,High,50,0.997545,0.817572,0.898636,Thu_Feb_29_00_22_02_2024
2912,No Empirical,Bounded NBD,Low,High,50,1.000000,0.783333,0.878505,Thu_Feb_29_00_21_57_2024
2913,No Empirical,Bounded NBD,Medium,High,50,0.987113,0.786448,0.875429,Thu_Feb_29_15_18_15_2024
2914,No Empirical,Bounded NBD,Low,Low,50,0.967742,0.952381,0.960000,Thu_Feb_29_00_21_41_2024


In [189]:
# Plain alias, not a copy: data_cleaned and data_final are the same object here
data_cleaned = data_final

# Rows that agree on all of these columns are treated as duplicates of each other
duplicate_key = ['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates']

# Repeated rows, leaving out the first / the last occurrence of each key respectively
duplicates_to_show = data_cleaned[data_cleaned.duplicated(subset=duplicate_key, keep='first')]
duplicates_to_show2 = data_cleaned[data_cleaned.duplicated(subset=duplicate_key, keep='last')]

# Every row whose key occurs more than once, each listed exactly once.
# (Concatenating the two frames above would list the middle rows of any key
# with three or more occurrences twice.)
duplicates_to_show3 = data_cleaned[data_cleaned.duplicated(subset=duplicate_key, keep=False)]

# Sort so that rows sharing a key sit next to each other
duplicates_to_show3 = duplicates_to_show3.sort_values(by=duplicate_key)

pd.set_option('display.max_rows', 10)

# Keep only the duplicated rows whose precision and recall are both at least 0.3
duplicates_to_show3 = duplicates_to_show3[
    (duplicates_to_show3['precision'] >= 0.3) & (duplicates_to_show3['recall'] >= 0.3)
]

# Preview only: the result is rendered as the cell output but not assigned,
# so data_cleaned itself is unchanged by this line.
data_cleaned.drop_duplicates(subset=duplicate_key, keep=False)

# duplicates_to_show3


Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score,date
0,Both,Pitman,Very High,High,21,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024
1,Both,Pitman,High,High,21,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024
2,Both,Pitman,Medium,Low,21,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024
3,Both,Pitman,Low,Low,21,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024
4,Both,Pitman,Very High,Low,21,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024
...,...,...,...,...,...,...,...,...,...
2911,No Empirical,Bounded NBD,Very High,High,50,0.997545,0.817572,0.898636,Thu_Feb_29_00_22_02_2024
2912,No Empirical,Bounded NBD,Low,High,50,1.000000,0.783333,0.878505,Thu_Feb_29_00_21_57_2024
2913,No Empirical,Bounded NBD,Medium,High,50,0.987113,0.786448,0.875429,Thu_Feb_29_15_18_15_2024
2914,No Empirical,Bounded NBD,Low,Low,50,0.967742,0.952381,0.960000,Thu_Feb_29_00_21_41_2024


In [190]:
# Inspect the rows whose precision or recall is at most 0.3.
# NOTE(review): the filtering step that follows keeps rows with precision and
# recall >= 0.2, so rows between 0.2 and 0.3 listed here are retained -- confirm
# that the two thresholds are meant to differ.
data_cleaned[(data_cleaned['precision'] <= 0.3) | (data_cleaned['recall'] <= 0.3)]

Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score,date
41,Both,Pitman,Very High,High,25,0.002955,1.000000,0.005893,Wed_Feb_28_11_14_53_2024
43,Both,Pitman,Low,High,25,0.000123,1.000000,0.000245,Wed_Feb_28_11_14_46_2024
48,Both,Pitman,Medium,High,25,0.000975,1.000000,0.001948,Wed_Feb_28_11_14_48_2024
51,Both,Pitman,High,High,25,0.002608,1.000000,0.005202,Wed_Feb_28_11_14_51_2024
54,Both,Pitman,High,High,26,0.002608,1.000000,0.005202,Wed_Feb_28_11_15_31_2024
...,...,...,...,...,...,...,...,...,...
1925,,uniform,Low,High,47,0.258883,0.850000,0.396887,Wed_Feb_28_23_05_35_2024
1935,,uniform,Low,High,48,0.265625,0.850000,0.404762,Wed_Feb_28_23_06_36_2024
1940,,uniform,Low,High,49,0.274193,0.850000,0.414634,Wed_Feb_28_23_07_00_2024
1943,,uniform,Low,Low,49,0.298507,0.952381,0.454545,Wed_Feb_28_23_06_43_2024


In [191]:
# 1. keep rows whose precision and recall are both at least 0.2
# 2. keep only the first row for each (model, prior, duplicates_level,
#    distortion_level, replicates) combination
# 3. renumber the rows 0..n-1
data_cleaned = (
    data_cleaned
    .loc[lambda frame: (frame['precision'] >= 0.2) & (frame['recall'] >= 0.2)]
    .drop_duplicates(
        subset=['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates'],
        keep="first",
    )
    .reset_index(drop=True)
)

# (rows, columns) after cleaning
data_cleaned.shape

(2880, 9)

In [192]:
# Load the previously cleaned results stored in clean_data20.csv
data_cleaned_20 = pd.read_csv("clean_data20.csv", index_col=0)

# Append them below the current rows and renumber the combined frame 0..n-1.
# Note: data_cleaned is rebound to the combined frame, so re-running this cell
# without re-running the earlier ones appends the same rows again.
data_cleaned = pd.concat([data_cleaned, data_cleaned_20], ignore_index=True)

# Persist the combined table
data_cleaned.to_csv("clean_data50.csv")

In [193]:
# # Select the columns of interest
# columns_of_interest = ['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates']
# df_selected = data_cleaned[columns_of_interest]


# # Option 2: Count across selected columns (if you want a total count regardless of the column)
# total_count = df_selected[['model', 'prior', 'replicates']].value_counts() # Sorting by index (the number) for better readability
# print("Total count across selected columns:")
# print(total_count)

In [194]:
# Relabel "Bounded NBD" as "BNBD4". The replacement is applied to every cell
# of the frame that equals the old label, not just to the 'prior' column.
data_cleaned = data_cleaned.replace(to_replace="Bounded NBD", value="BNBD4")

data_cleaned

Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score,date,num_iter
0,Both,Pitman,Very High,High,21,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024,
1,Both,Pitman,High,High,21,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024,
2,Both,Pitman,Medium,Low,21,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024,
3,Both,Pitman,Low,Low,21,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024,
4,Both,Pitman,Very High,Low,21,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024,
...,...,...,...,...,...,...,...,...,...,...
4795,No Empirical,BNBD4,Medium,High,16,0.984456,0.780288,0.870561,,50000.0
4796,No Empirical,BNBD4,Medium,High,17,0.994609,0.757700,0.860140,,50000.0
4797,No Empirical,BNBD4,Medium,High,18,0.991758,0.741273,0.848414,,50000.0
4798,No Empirical,BNBD4,Medium,High,19,0.981959,0.782341,0.870857,,50000.0


In [195]:
# Load the BNBD results, build the 'prior' label as "BNBD<cluster_size>",
# and drop the 'cluster_size' column once it has been folded into the label.
data_BNBD = pd.read_csv("clean_data_BNBD.csv", index_col=0)
data_BNBD = (
    data_BNBD
    .assign(prior='BNBD' + data_BNBD['cluster_size'].astype(str))
    .drop(columns='cluster_size')
)

data_BNBD

Unnamed: 0,model,num_iter,duplicates_level,distortion_level,precision,recall,f1score,date,prior
0,Both,10000,High,Low,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024,BNBD2
3,Both,10000,High,Low,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024,BNBD2
5,Both,10000,High,Low,1.000000,0.337859,0.505075,Wed_Mar__6_12_56_55_2024,BNBD2
7,Both,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_14_05_23_2024,BNBD2
20,Both,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_13_31_31_2024,BNBD2
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,50000,Medium,High,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024,BNBD10
3823,No Empirical,50000,Medium,High,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024,BNBD10
3824,No Empirical,50000,Medium,High,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024,BNBD10
3825,No Empirical,50000,Medium,High,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024,BNBD10


In [196]:
# Stack the BNBD rows below the existing ones and renumber the rows 0..n-1.
# Columns present in only one of the two frames are filled with NaN for the other.
# Note: data_cleaned is rebound here, so re-running this cell on its own
# appends the BNBD rows a second time.
data_cleaned = pd.concat([data_cleaned, data_BNBD], ignore_index=True)

data_cleaned

Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score,date,num_iter
0,Both,Pitman,Very High,High,21.0,1.000000,0.794769,0.885650,Fri_Feb__9_08_51_26_2024,
1,Both,Pitman,High,High,21.0,1.000000,0.688249,0.815341,Fri_Feb__9_08_51_30_2024,
2,Both,Pitman,Medium,Low,21.0,0.997807,0.919192,0.956888,Tue_Feb__6_20_11_45_2024,
3,Both,Pitman,Low,Low,21.0,0.967742,0.952381,0.960000,Tue_Feb__6_20_11_41_2024,
4,Both,Pitman,Very High,Low,21.0,1.000000,0.921762,0.959288,Tue_Feb__6_20_11_52_2024,
...,...,...,...,...,...,...,...,...,...,...
7995,No Empirical,BNBD10,Medium,High,,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024,50000.0
7996,No Empirical,BNBD10,Medium,High,,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024,50000.0
7997,No Empirical,BNBD10,Medium,High,,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024,50000.0
7998,No Empirical,BNBD10,Medium,High,,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024,50000.0


In [198]:
# Limit DataFrame rendering to at most 10 rows
pd.set_option('display.max_rows', 10)

In [199]:
# Columns that define one experimental configuration, and the scores summarised per configuration
group_keys = ['model', 'prior', 'duplicates_level', 'distortion_level']
metrics = ['precision', 'recall', 'f1score']

# NOTE(review): groupby drops rows whose key columns are NaN by default -- confirm
# that no 'model' / 'prior' label was read back from CSV as a missing value.
grouped_data = data_cleaned.groupby(group_keys)

# Per-group mean and standard deviation; the suffix marks which statistic a column holds
mean_values = grouped_data[metrics].mean().add_suffix('_mean').reset_index()
std_dev_values = grouped_data[metrics].std().add_suffix('_std').reset_index()

# Per-group 2.5% and 97.5% empirical quantiles of each metric
bounds_per_metric = []
for metric in metrics:
    # Rows are the groups, columns are the two requested quantiles
    bounds = grouped_data[metric].quantile([0.025, 0.975]).unstack(level=-1)
    bounds.columns = [f'{metric}_lower_bound', f'{metric}_upper_bound']
    bounds_per_metric.append(bounds)

# All pieces share the same group index, so they are placed side by side in one
# step. This avoids using an "is the frame still empty?" check as a first-iteration
# flag, which would drop earlier metrics whenever there are no groups at all.
bounds_df = pd.concat(bounds_per_metric, axis=1).reset_index()

# One row per group with lower / upper bound columns for every metric
bounds_df



Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD10,High,High,0.996025,1.000000,0.690028,0.760012,0.816115,0.862860
1,Both,BNBD10,High,Low,1.000000,1.000000,0.934066,0.940535,0.965717,0.969356
2,Both,BNBD10,Low,High,1.000000,1.000000,0.733333,0.750000,0.846154,0.857143
3,Both,BNBD10,Low,Low,0.967742,1.000000,0.952381,0.952381,0.960000,0.975610
4,Both,BNBD10,Medium,High,0.988872,0.991758,0.733522,0.742864,0.842288,0.848689
...,...,...,...,...,...,...,...,...,...,...
155,,uniform,Low,Low,0.292292,0.323060,0.952381,0.952381,0.447303,0.482462
156,,uniform,Medium,High,0.974249,0.988896,0.847485,0.864476,0.908617,0.922111
157,,uniform,Medium,Low,0.997203,1.000000,0.918636,0.925253,0.955857,0.961175
158,,uniform,Very High,High,0.999331,1.000000,0.819215,0.834591,0.900625,0.909838


In [200]:
# (number of groups, number of columns) of the per-group mean table
mean_values.shape

(160, 7)

In [201]:
# Join the per-group means with the per-group lower / upper quantile bounds.
# (std_dev_values is not part of this table.)
combined_data = mean_values.merge(
    bounds_df,
    on=['model', 'prior', 'duplicates_level', 'distortion_level'],
)

# Lift the row limit so the whole summary table is rendered below
pd.set_option('display.max_rows', None)

# Render the combined table
combined_data


Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_mean,recall_mean,f1score_mean,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,BNBD10,High,High,0.999202,0.709289,0.829368,0.996025,1.0,0.690028,0.760012,0.816115,0.86286
1,Both,BNBD10,High,Low,0.999949,0.936917,0.967407,1.0,1.0,0.934066,0.940535,0.965717,0.969356
2,Both,BNBD10,Low,High,1.0,0.746,0.854501,1.0,1.0,0.733333,0.75,0.846154,0.857143
3,Both,BNBD10,Low,Low,0.978562,0.952381,0.965272,0.967742,1.0,0.952381,0.952381,0.96,0.97561
4,Both,BNBD10,Medium,High,0.989709,0.738439,0.845804,0.988872,0.991758,0.733522,0.742864,0.842288,0.848689
5,Both,BNBD10,Medium,Low,0.999476,0.923556,0.960014,0.995628,1.0,0.919192,0.929293,0.956888,0.963351
6,Both,BNBD10,Very High,High,0.998466,0.785687,0.8793,0.997407,1.0,0.772938,0.828672,0.871601,0.905314
7,Both,BNBD10,Very High,Low,1.0,0.921683,0.959245,1.0,1.0,0.91979,0.923077,0.958219,0.96
8,Both,BNBD2,High,High,0.992885,0.294404,0.454145,0.989173,0.994638,0.291767,0.297982,0.450617,0.458081
9,Both,BNBD2,High,Low,1.0,0.337045,0.504163,1.0,1.0,0.335463,0.338658,0.502392,0.505967


In [202]:
# Restore the 10-row display limit after the full table was rendered
pd.set_option('display.max_rows', 10)

In [203]:
# Write the combined per-group summary (means plus quantile bounds) to CSV
combined_data.to_csv("plot_data_BNBD.csv")