In [26]:
import numpy as np
import pandas as pd

# Load the raw results from data.csv, using the first CSV column as the index.
# `data` and `df` are two names for the same DataFrame object (no copy is made).
data = df = pd.read_csv("data.csv", index_col=0)


In [27]:
# Replace NaN values with 0 (returns a new frame; `data` itself is not modified)
data_filled = data.fillna(0)


def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if it cannot be parsed.

    Explicit replacement for the deprecated ``pd.to_numeric(..., errors='ignore')``:
    a column that fails to parse is handed back untouched instead of raising.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric content (e.g. 'replicates' values such as "_19"): keep as-is.
        return column


# Convert every column that parses cleanly as numbers; leave the others unchanged
data_filled = data_filled.apply(_to_numeric_if_possible)

# Display the cleaned frame
data_filled


Unnamed: 0,round,split,replicates,duplicates,distortion,m1,m2,m3
0,-1,0.0,0,8.0,1,1.000000,0.745803,0.854396
1,-1,0.0,0,100.0,1,0.997500,0.802817,0.889632
2,-1,0.0,0,0.1,0,0.967742,0.952381,0.960000
3,-1,0.0,0,1.0,0,1.000000,0.919192,0.957895
4,-1,0.0,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...
1907,-12,-2.0,_19,1.0,1,0.987047,0.782341,0.872852
1908,-12,-2.0,_19,0.1,1,1.000000,0.783333,0.878505
1909,-12,-2.0,_19,8.0,0,1.000000,0.937700,0.967848
1910,-12,-2.0,_19,8.0,1,0.998973,0.777778,0.874607


In [28]:
# Strip "_" from the 'replicates' labels, then parse them as numbers;
# anything that still fails to parse becomes NaN and is replaced by 0.
replicate_labels = data_filled['replicates'].astype(str).str.replace("_", "")
data_filled['replicates'] = pd.to_numeric(replicate_labels, errors='coerce').fillna(0)

# Store 'round' and 'split' as absolute values (note: this updates data_filled itself)
for sign_column in ('round', 'split'):
    data_filled[sign_column] = data_filled[sign_column].abs()

# New frame with the metric columns given descriptive names:
# m1 -> precision, m2 -> recall, m3 -> f1score
data_renamed = data_filled.rename(
    columns=dict(m1='precision', m2='recall', m3='f1score')
)

# Display the renamed frame
data_renamed


Unnamed: 0,round,split,replicates,duplicates,distortion,precision,recall,f1score
0,1,0.0,0,8.0,1,1.000000,0.745803,0.854396
1,1,0.0,0,100.0,1,0.997500,0.802817,0.889632
2,1,0.0,0,0.1,0,0.967742,0.952381,0.960000
3,1,0.0,0,1.0,0,1.000000,0.919192,0.957895
4,1,0.0,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...
1907,12,2.0,19,1.0,1,0.987047,0.782341,0.872852
1908,12,2.0,19,0.1,1,1.000000,0.783333,0.878505
1909,12,2.0,19,8.0,0,1.000000,0.937700,0.967848
1910,12,2.0,19,8.0,1,0.998973,0.777778,0.874607


In [30]:
# Define a function to map 'round' values to model names
def map_round_to_model(round_number):
    """Return the model label for a round number.

    Rounds 1-3 map to "Both", 4-6 to "No Diri", 7-9 to "None" and 10-12 to
    "No Empirical" (bounds inclusive). Any other value yields "Unknown".
    """
    # NOTE: "None" below is a string label, not Python's None object.
    round_bands = (
        (1, 3, "Both"),
        (4, 6, "No Diri"),
        (7, 9, "None"),
        (10, 12, "No Empirical"),
    )
    for lowest, highest, label in round_bands:
        if lowest <= round_number <= highest:
            return label
    return "Unknown"  # outside every band (including values between bands, e.g. 3.5)

# Derive the 'model' column by passing every 'round' value through map_round_to_model
data_renamed['model'] = data_renamed['round'].map(map_round_to_model)



# Define a function to map 'round' values to prior categories
def map_round_to_prior(round_number):
    """Return the prior label for a round number.

    Rounds 1, 4, 7, 10 map to "Pitman"; 2, 5, 8, 11 to "uniform";
    3, 6, 9, 12 to "Bounded NBD". Any other value yields "Unknown".
    """
    prior_rounds = (
        ("Pitman", (1, 4, 7, 10)),
        ("uniform", (2, 5, 8, 11)),
        ("Bounded NBD", (3, 6, 9, 12)),
    )
    for prior_name, rounds in prior_rounds:
        if round_number in rounds:
            return prior_name
    return "Unknown"  # round number not listed for any prior

# Derive the 'prior' column by passing every 'round' value through map_round_to_prior
data_renamed['prior'] = data_renamed['round'].map(map_round_to_prior)

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    """Return the duplicates-level label for a 'duplicates' value.

    0.1 -> "Low", 1.0 -> "Medium", 8.0 -> "High", 100.0 -> "Very High".
    The comparison is exact equality; any other value yields "Unknown".
    """
    level_by_value = (
        (0.1, "Low"),
        (1.0, "Medium"),
        (8.0, "High"),
        (100.0, "Very High"),
    )
    for known_value, level in level_by_value:
        if duplicates_value == known_value:
            return level
    return "Unknown"  # not one of the four recognised values

# Derive 'duplicates_level' by passing every 'duplicates' value through map_duplicates_to_level
data_renamed['duplicates_level'] = data_renamed['duplicates'].map(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    """Return the distortion-level label for a 'distortion' value.

    0 -> "Low", 1 -> "High"; any other value yields "Unknown".
    """
    if distortion_value == 0:
        return "Low"
    # Everything that is neither 0 nor 1 falls through to "Unknown"
    return "High" if distortion_value == 1 else "Unknown"

# Derive 'distortion_level' by passing every 'distortion' value through map_distortion_to_level
data_renamed['distortion_level'] = data_renamed['distortion'].map(map_distortion_to_level)

# Scenario descriptors plus the replicate number, without the metric columns
data_selected_columns = data_renamed.loc[
    :, ['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates']
]

# Final dataset: the same descriptor columns followed by the three metrics, in this order
data_final = data_renamed.loc[
    :,
    [
        'model', 'prior', 'duplicates_level', 'distortion_level',
        'replicates', 'precision', 'recall', 'f1score',
    ],
]

# Display the final frame
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,replicates,precision,recall,f1score
0,Both,Pitman,High,High,0,1.000000,0.745803,0.854396
1,Both,Pitman,Very High,High,0,0.997500,0.802817,0.889632
2,Both,Pitman,Low,Low,0,0.967742,0.952381,0.960000
3,Both,Pitman,Medium,Low,0,1.000000,0.919192,0.957895
4,Both,Pitman,Low,High,0,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...
1907,No Empirical,Bounded NBD,Medium,High,19,0.987047,0.782341,0.872852
1908,No Empirical,Bounded NBD,Low,High,19,1.000000,0.783333,0.878505
1909,No Empirical,Bounded NBD,High,Low,19,1.000000,0.937700,0.967848
1910,No Empirical,Bounded NBD,High,High,19,0.998973,0.777778,0.874607


In [32]:
# Group the rows by scenario: model, prior, duplicates level and distortion level
grouped_data = data_final.groupby(['model', 'prior', 'duplicates_level', 'distortion_level'])

# Per-group mean of each metric; columns become precision_mean, recall_mean, f1score_mean
mean_values = (
    grouped_data[['precision', 'recall', 'f1score']]
    .mean()
    .add_suffix('_mean')   # only the metric columns are suffixed; group keys are still the index here
    .reset_index()
)

# Per-group standard deviation of each metric; columns become precision_std, recall_std, f1score_std
std_dev_values = (
    grouped_data[['precision', 'recall', 'f1score']]
    .std()
    .add_suffix('_std')
    .reset_index()
)

# Display the first rows of both summaries (as a tuple)
mean_values.head(), std_dev_values.head()


Unnamed: 0,model,prior,duplicates_level,distortion_level,precision_mean,recall_mean,f1score_mean
0,Both,Bounded NBD,High,High,0.998371,0.751479,0.857191
1,Both,Bounded NBD,High,Low,0.999872,0.935863,0.966809
2,Both,Bounded NBD,Low,High,1.000000,0.750833,0.857600
3,Both,Bounded NBD,Low,Low,0.984426,0.952381,0.968135
4,Both,Bounded NBD,Medium,High,0.985199,0.760986,0.858368
...,...,...,...,...,...,...,...
91,,uniform,Low,Low,0.305915,0.952381,0.463033
92,,uniform,Medium,High,0.978620,0.839425,0.903667
93,,uniform,Medium,Low,0.999236,0.921515,0.958799
94,,uniform,Very High,High,0.999837,0.827264,0.905395


In [34]:
# Join the mean and standard-deviation summaries on the four grouping columns
# (default inner join), giving one row per scenario with both sets of statistics
combined_data = mean_values.merge(
    std_dev_values,
    on=['model', 'prior', 'duplicates_level', 'distortion_level'],
)

# Display the first rows of the combined frame
combined_data.head()


In [None]:
# Write the combined summary to clean_data.csv; the row index is written as the first column.
# NOTE(review): the model label "None" is among the strings recent pandas versions treat as
# missing by default in read_csv - confirm how this file is reloaded (e.g. keep_default_na=False).
combined_data.to_csv("clean_data.csv", index=True)

# use both and Pitman-Yor prior

In [12]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
both_Pit_pre_mean, both_Pit_pre_sd = [], []
both_Pit_recall_mean, both_Pit_recall_sd = [], []
both_Pit_f1score_mean, both_Pit_f1score_sd = [], []

# Restrict once to the round -1 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -1) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', both_Pit_pre_mean, both_Pit_pre_sd),
            ('m2', both_Pit_recall_mean, both_Pit_recall_sd),
            ('m3', both_Pit_f1score_mean, both_Pit_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# use both and Uniform prior

In [13]:
# `dist` and `dup` are reused from the earlier cell that defines them.
# One entry per (distortion, duplicates) pair, distortion outer / duplicates inner.
both_Unif_pre_mean, both_Unif_pre_sd = [], []
both_Unif_recall_mean, both_Unif_recall_sd = [], []
both_Unif_f1score_mean, both_Unif_f1score_sd = [], []

# Restrict once to the round -2 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -2) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', both_Unif_pre_mean, both_Unif_pre_sd),
            ('m2', both_Unif_recall_mean, both_Unif_recall_sd),
            ('m3', both_Unif_f1score_mean, both_Unif_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

In [14]:
# Show the list of mean precision values computed from the round -2 rows
both_Unif_pre_mean

[0.41759702,
 0.99978118,
 1.0,
 1.0,
 0.49330822999999996,
 0.9795172300000001,
 0.9990920500000001,
 1.0]

# use both and Bounded NBD prior

In [15]:
# `dist` and `dup` are reused from the earlier cell that defines them.
# One entry per (distortion, duplicates) pair, distortion outer / duplicates inner.
both_BNBD_pre_mean, both_BNBD_pre_sd = [], []
both_BNBD_recall_mean, both_BNBD_recall_sd = [], []
both_BNBD_f1score_mean, both_BNBD_f1score_sd = [], []

# Restrict once to the round -3 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -3) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', both_BNBD_pre_mean, both_BNBD_pre_sd),
            ('m2', both_BNBD_recall_mean, both_BNBD_recall_sd),
            ('m3', both_BNBD_f1score_mean, both_BNBD_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

In [16]:
# Create the DataFrame with each column corresponding to the first element of each tuple from the pairs
# NOTE(review): array_pair_1, array_pair_2 and array_pair_3 are not assigned in any cell
# shown in this notebook, so this cell raises NameError as written.
# Presumably each array_pair_k is a pair of equally long sequences - TODO confirm which
# ones were intended before re-running.
# Each comprehension keeps tup[0], i.e. the items of array_pair_k[0], cut by zip to the
# length of the shorter of array_pair_k[0] and array_pair_k[1].
df_table = pd.DataFrame({
    'Column1': [tup[0] for tup in zip(array_pair_1[0], array_pair_1[1])],
    'Column2': [tup[0] for tup in zip(array_pair_2[0], array_pair_2[1])],
    'Column3': [tup[0] for tup in zip(array_pair_3[0], array_pair_3[1])]
})

print(df_table)

NameError: name 'array_pair_1' is not defined

In [None]:

# Create the DataFrame with each column corresponding to the first element of each tuple from the pairs
# NOTE(review): array_pair_1..3 are not defined in any cell shown in this notebook, so this
# cell cannot run on a fresh kernel - TODO define the pairs or remove the cell.
# For each pair, only the items of array_pair_k[0] end up in the column (tup[0]); zip merely
# limits the length to the shorter of the two sequences.
df_table = pd.DataFrame({
    'Column1': [tup[0] for tup in zip(array_pair_1[0], array_pair_1[1])],
    'Column2': [tup[0] for tup in zip(array_pair_2[0], array_pair_2[1])],
    'Column3': [tup[0] for tup in zip(array_pair_3[0], array_pair_3[1])]
})

print(df_table)


# No Diri and Pitman-Yor prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_diri_Pit_pre_mean, no_diri_Pit_pre_sd = [], []
no_diri_Pit_recall_mean, no_diri_Pit_recall_sd = [], []
no_diri_Pit_f1score_mean, no_diri_Pit_f1score_sd = [], []

# Restrict once to the round -4 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -4) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_diri_Pit_pre_mean, no_diri_Pit_pre_sd),
            ('m2', no_diri_Pit_recall_mean, no_diri_Pit_recall_sd),
            ('m3', no_diri_Pit_f1score_mean, no_diri_Pit_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# No Diri and Uniform prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_diri_Unif_pre_mean, no_diri_Unif_pre_sd = [], []
no_diri_Unif_recall_mean, no_diri_Unif_recall_sd = [], []
no_diri_Unif_f1score_mean, no_diri_Unif_f1score_sd = [], []

# Restrict once to the round -5 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -5) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_diri_Unif_pre_mean, no_diri_Unif_pre_sd),
            ('m2', no_diri_Unif_recall_mean, no_diri_Unif_recall_sd),
            ('m3', no_diri_Unif_f1score_mean, no_diri_Unif_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# No Diri and Bounded NBD prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_diri_BNBD_pre_mean, no_diri_BNBD_pre_sd = [], []
no_diri_BNBD_recall_mean, no_diri_BNBD_recall_sd = [], []
no_diri_BNBD_f1score_mean, no_diri_BNBD_f1score_sd = [], []

# Restrict once to the round -6 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -6) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_diri_BNBD_pre_mean, no_diri_BNBD_pre_sd),
            ('m2', no_diri_BNBD_recall_mean, no_diri_BNBD_recall_sd),
            ('m3', no_diri_BNBD_f1score_mean, no_diri_BNBD_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# both NO and Pitman-Yor prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_Pit_pre_mean, no_Pit_pre_sd = [], []
no_Pit_recall_mean, no_Pit_recall_sd = [], []
no_Pit_f1score_mean, no_Pit_f1score_sd = [], []

# Restrict once to the round -7 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -7) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_Pit_pre_mean, no_Pit_pre_sd),
            ('m2', no_Pit_recall_mean, no_Pit_recall_sd),
            ('m3', no_Pit_f1score_mean, no_Pit_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# both NO and Uniform prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_Unif_pre_mean, no_Unif_pre_sd = [], []
no_Unif_recall_mean, no_Unif_recall_sd = [], []
no_Unif_f1score_mean, no_Unif_f1score_sd = [], []

# Restrict once to the round -8 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -8) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_Unif_pre_mean, no_Unif_pre_sd),
            ('m2', no_Unif_recall_mean, no_Unif_recall_sd),
            ('m3', no_Unif_f1score_mean, no_Unif_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# both NO and Bounded NBD prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_BNBD_pre_mean, no_BNBD_pre_sd = [], []
no_BNBD_recall_mean, no_BNBD_recall_sd = [], []
no_BNBD_f1score_mean, no_BNBD_f1score_sd = [], []

# Restrict once to the round -9 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -9) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_BNBD_pre_mean, no_BNBD_pre_sd),
            ('m2', no_BNBD_recall_mean, no_BNBD_recall_sd),
            ('m3', no_BNBD_f1score_mean, no_BNBD_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# No Empirical and Pitman-Yor prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_emp_Pit_pre_mean, no_emp_Pit_pre_sd = [], []
no_emp_Pit_recall_mean, no_emp_Pit_recall_sd = [], []
no_emp_Pit_f1score_mean, no_emp_Pit_f1score_sd = [], []

# Restrict once to the round -10 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -10) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_emp_Pit_pre_mean, no_emp_Pit_pre_sd),
            ('m2', no_emp_Pit_recall_mean, no_emp_Pit_recall_sd),
            ('m3', no_emp_Pit_f1score_mean, no_emp_Pit_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# No Empirical and Uniform prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_emp_Unif_pre_mean, no_emp_Unif_pre_sd = [], []
no_emp_Unif_recall_mean, no_emp_Unif_recall_sd = [], []
no_emp_Unif_f1score_mean, no_emp_Unif_f1score_sd = [], []

# Restrict once to the round -11 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -11) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_emp_Unif_pre_mean, no_emp_Unif_pre_sd),
            ('m2', no_emp_Unif_recall_mean, no_emp_Unif_recall_sd),
            ('m3', no_emp_Unif_f1score_mean, no_emp_Unif_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)

# No Empirical and Bounded NBD prior

In [None]:
dup = [0.1, 1, 8, 100]
dist = [0, 1]
# Each result list gets one entry per (distortion, duplicates) pair, distortion being
# the outer loop and duplicates the inner one. In duplicates.distortion notation:
# L.L, M.L, H.L, VH.L, L.H, M.H, H.H, VH.H
no_emp_BNBD_pre_mean, no_emp_BNBD_pre_sd = [], []
no_emp_BNBD_recall_mean, no_emp_BNBD_recall_sd = [], []
no_emp_BNBD_f1score_mean, no_emp_BNBD_f1score_sd = [], []

# Restrict once to the round -12 rows whose split is missing or equal to -2.0
round_rows = df[(df['round'] == -12) & (df['split'].isna() | (df['split'] == -2.0))]

for distortion in dist:
    for duplicates in dup:
        filtered_df = round_rows[
            (round_rows['duplicates'] == duplicates) &
            (round_rows['distortion'] == distortion)
        ]
        # m1 -> precision lists, m2 -> recall lists, m3 -> f1score lists
        for metric, mean_list, sd_list in (
            ('m1', no_emp_BNBD_pre_mean, no_emp_BNBD_pre_sd),
            ('m2', no_emp_BNBD_recall_mean, no_emp_BNBD_recall_sd),
            ('m3', no_emp_BNBD_f1score_mean, no_emp_BNBD_f1score_sd),
        ):
            values = filtered_df[metric].values
            mean_list.append(np.mean(values))
            sd_list.append(np.std(values))  # np.std default (ddof=0)