In [9]:
import pandas as pd
import matplotlib.pyplot as plt

data = 'data_10k_50k.csv'

data = pd.read_csv(data, index_col=0)

pd.set_option('display.max_rows', 10)

data = data.drop(180)
data = data.reset_index(drop = True)

# Display the first few rows of the dataframe to confirm the index is set correctly
data



Unnamed: 0,round,split,duplicates,distortion,precision,recall,f1score
0,-1,,0.1,0,0.980198,0.951923,0.965854
1,-1,,1.0,1,0.981529,0.712766,0.825831
2,-1,,1.0,0,0.994200,0.956299,0.974881
3,-1,,8.0,0,1.000000,0.947767,0.973183
4,-1,,100.0,0,1.000000,0.952495,0.975670
...,...,...,...,...,...,...,...
187,-12,-2.0,1.0,1,0.974423,0.722479,0.829748
188,-12,-2.0,0.1,1,0.950000,0.727273,0.823848
189,-12,-2.0,8.0,0,1.000000,0.951098,0.974936
190,-12,-2.0,1.0,0,0.996141,0.960019,0.977746


In [10]:
# Replace NaN values with 0
data_filled = data.fillna(0)

# Ensure all columns except 'replicates' (since it might contain non-numeric values originally) are numeric
data_filled = data_filled.apply(pd.to_numeric, errors='ignore')

# Display the data types to confirm changes
data_filled.dtypes, data_filled.head()

data_filled['iter'] = '10k'
data_filled.loc[data_filled['split'] == 2, 'iter'] = '50k'
data_filled


Unnamed: 0,round,split,duplicates,distortion,precision,recall,f1score,iter
0,-1,0.0,0.1,0,0.980198,0.951923,0.965854,10k
1,-1,0.0,1.0,1,0.981529,0.712766,0.825831,10k
2,-1,0.0,1.0,0,0.994200,0.956299,0.974881,10k
3,-1,0.0,8.0,0,1.000000,0.947767,0.973183,10k
4,-1,0.0,100.0,0,1.000000,0.952495,0.975670,10k
...,...,...,...,...,...,...,...,...
187,-12,-2.0,1.0,1,0.974423,0.722479,0.829748,10k
188,-12,-2.0,0.1,1,0.950000,0.727273,0.823848,10k
189,-12,-2.0,8.0,0,1.000000,0.951098,0.974936,10k
190,-12,-2.0,1.0,0,0.996141,0.960019,0.977746,10k


In [11]:


# Make 'round' and 'split' values positive
data_filled['round'] = data_filled['round'].abs()
data_filled['split'] = data_filled['split'].abs()


# Rename columns 'm1', 'm2', 'm3' to 'precision', 'recall', 'f1score'
data_renamed = data_filled.rename(columns={'m1': 'precision', 'm2': 'recall', 'm3': 'f1score'})

# Display the first few rows to confirm the column names have been updated
data_renamed


Unnamed: 0,round,split,duplicates,distortion,precision,recall,f1score,iter
0,1,0.0,0.1,0,0.980198,0.951923,0.965854,10k
1,1,0.0,1.0,1,0.981529,0.712766,0.825831,10k
2,1,0.0,1.0,0,0.994200,0.956299,0.974881,10k
3,1,0.0,8.0,0,1.000000,0.947767,0.973183,10k
4,1,0.0,100.0,0,1.000000,0.952495,0.975670,10k
...,...,...,...,...,...,...,...,...
187,12,2.0,1.0,1,0.974423,0.722479,0.829748,10k
188,12,2.0,0.1,1,0.950000,0.727273,0.823848,10k
189,12,2.0,8.0,0,1.000000,0.951098,0.974936,10k
190,12,2.0,1.0,0,0.996141,0.960019,0.977746,10k


In [12]:
# Define a function to map 'round' values to model names
def map_round_to_model(round_number):
    if 1 <= round_number <= 3:
        return "Both"
    elif 4 <= round_number <= 6:
        return "No Diri"
    elif 7 <= round_number <= 9:
        return "None"
    elif 10 <= round_number <= 12:
        return "No Empirical"
    else:
        return "Unknown"  # For any round numbers outside the specified ranges

# Apply the mapping function to the 'round' column to create the new 'model' column
data_renamed['model'] = data_renamed['round'].apply(map_round_to_model)



# Define a function to map 'round' values to prior categories
def map_round_to_prior(round_number):
    if round_number in [1, 4, 7, 10]:
        return "Pitman"
    elif round_number in [2, 5, 8, 11]:
        return "uniform"
    elif round_number in [3, 6, 9, 12]:
        return "Bounded NBD"
    else:
        return "Unknown"  # For any round numbers outside the specified or considered ranges

# Apply the mapping function to the 'round' column to create the new 'prior' column
data_renamed['prior'] = data_renamed['round'].apply(map_round_to_prior)

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    if duplicates_value == 0.1:
        return "Low"
    elif duplicates_value == 1.0:
        return "Medium"
    elif duplicates_value == 8.0:
        return "High"
    elif duplicates_value == 100.0:
        return "Very High"
    else:
        return "Unknown"  # For any duplicates values outside the specified ranges

# Apply the mapping function to the 'duplicates' column to create the new 'duplicates_level' column
data_renamed['duplicates_level'] = data_renamed['duplicates'].apply(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    if distortion_value == 0:
        return "Low"
    elif distortion_value == 1:
        return "High"
    else:
        return "Unknown"  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['distortion_level'] = data_renamed['distortion'].apply(map_distortion_to_level)

def map_iter_to_num(iter):
    if iter == '10k':
        return 10000
    elif iter == '50k':
        return 50000
    else:
        return 0  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['num_iter'] = data_renamed['iter'].apply(map_iter_to_num)

# Select only the specified columns
data_selected_columns = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level']]

# Select and reorder the dataset with only the specified columns
data_final = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter',
                        'precision', 'recall', 'f1score']]

# Display the first few rows to confirm the changes
data_final






KeyError: "['replicates'] not in index"

In [None]:
data_cleaned = data_final

# Finding duplicates and keeping them to show
duplicates_to_show = data_cleaned[data_cleaned.duplicated(subset=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'], keep='first')]

# Sorting the duplicates for better readability
duplicates_sorted = duplicates_to_show.sort_values(by=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

pd.set_option('display.max_rows', None)

duplicates_sorted


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
3525,No Empirical,Pitman,High,High,50000,0,1.0,0.756195,0.861174
3527,No Empirical,Pitman,High,Low,50000,0,1.0,0.942492,0.970395
3520,No Empirical,Pitman,Low,High,50000,0,1.0,0.75,0.857143
3524,No Empirical,Pitman,Low,Low,50000,0,0.983607,0.952381,0.967742
3521,No Empirical,Pitman,Medium,High,50000,0,0.99187,0.75154,0.85514
3523,No Empirical,Pitman,Medium,Low,50000,0,1.0,0.929293,0.963351
3522,No Empirical,Pitman,Very High,High,50000,0,0.997559,0.822267,0.901471
3526,No Empirical,Pitman,Very High,Low,50000,0,1.0,0.925049,0.961066


In [None]:
data_cleaned.drop_duplicates(inplace=True)
data_cleaned.reset_index(drop=True, inplace=True)
data_cleaned.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.drop_duplicates(inplace=True)


(3840, 9)

In [None]:
# Checking for missing combinations in the first 6 columns
# First, we create a DataFrame of all possible combinations of these columns

# Get unique values for each of the first 6 columns
model_values = data_cleaned['model'].unique()
prior_values = data_cleaned['prior'].unique()
duplicates_level_values = data_cleaned['duplicates_level'].unique()
distortion_level_values = data_cleaned['distortion_level'].unique()
num_iter_values = data_cleaned['num_iter'].unique()

# Use itertools.product to create all possible combinations
from itertools import product
all_combinations = list(product(model_values, prior_values, duplicates_level_values, distortion_level_values, num_iter_values))

# Convert to DataFrame
all_combinations_df = pd.DataFrame(all_combinations, columns=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

# Now, find missing combinations by checking which rows in all_combinations_df are not present in data_cleaned_no_duplicates
missing_combinations = pd.merge(all_combinations_df, data_cleaned, on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'], how='left', indicator=True).query('_merge == "left_only"').drop(columns=['precision', 'recall', 'f1score', '_merge'])

missing_combinations_count = missing_combinations.shape[0]

print(missing_combinations_count)
pd.set_option('display.max_rows', None)
missing_combinations


0


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates


In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
grouped_data.head()

Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
0,Both,Pitman,High,High,10000,0,1.000000,0.745803,0.854396
1,Both,Pitman,Very High,High,10000,0,0.997500,0.802817,0.889632
2,Both,Pitman,Low,Low,10000,0,0.967742,0.952381,0.960000
3,Both,Pitman,Medium,Low,10000,0,1.000000,0.919192,0.957895
4,Both,Pitman,Low,High,10000,0,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3715,No Empirical,Bounded NBD,Very High,Low,50000,5,1.000000,0.921762,0.959288
3716,No Empirical,Bounded NBD,Low,High,50000,5,1.000000,0.783333,0.878505
3717,No Empirical,Bounded NBD,High,High,50000,5,0.997888,0.755396,0.859873
3718,No Empirical,Bounded NBD,Low,Low,50000,5,0.967742,0.952381,0.960000


In [None]:
# Group by the specified columns and compute mean and standard deviation for precision, recall, and f1score
grouped_data = data_final.groupby(['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

# Compute mean
mean_values = grouped_data[['precision', 'recall', 'f1score']].mean().reset_index()

# Compute median
median_values = grouped_data[['precision', 'recall', 'f1score']].median().reset_index()

# Compute lowe bound
# Calculate the lower and upper bounds of the 95% credible interval
bounds = grouped_data['precision', 'recall', 'f1score'].quantile([0.025, 0.975]).unstack(level=-1)

# Now you have the bounds, we will need to flatten the MultiIndex for easy handling
# If you are using pandas version >= 1.0.0
bounds.columns = ['_'.join(col).strip() for col in bounds.columns.values]

# For pandas version < 1.0.0, the above may not work, use the following instead:
# bounds.columns = ['_'.join(col).rstrip('_') for col in bounds.columns.values]

# Now let's rename the columns to reflect what they represent
bounds.rename(columns={
    'precision_0.025': 'precision_lower_bound',
    'precision_0.975': 'precision_upper_bound',
    'recall_0.025': 'recall_lower_bound',
    'recall_0.975': 'recall_upper_bound',
    'f1score_0.025': 'f1score_lower_bound',
    'f1score_0.975': 'f1score_upper_bound'
}, inplace=True)

# Reset index to turn the groupby keys into columns
bounds.reset_index(inplace=True)

bounds.head()
# Coumpute upper bound


# Compute standard deviation
std_dev_values = grouped_data[['precision', 'recall', 'f1score']].std().reset_index()

# Rename columns for clarity
mean_values.rename(columns={
    'precision': 'precision_mean', 
    'recall': 'recall_mean', 
    'f1score': 'f1score_mean'}, inplace=True)

median_values.rename(columns={
'precision': 'precision_median', 
'recall': 'recall_median', 
'f1score': 'f1score_median'}, inplace=True)

std_dev_values.rename(columns={
    'precision': 'precision_std', 
    'recall': 'recall_std', 
    'f1score': 'f1score_std'}, inplace=True)

# Display the mean values
mean_values.head(), median_values.head(), std_dev_values.head()


(  model        prior duplicates_level distortion_level  num_iter  \
 0  Both  Bounded NBD             High             High     10000   
 1  Both  Bounded NBD             High             High     50000   
 2  Both  Bounded NBD             High              Low     10000   
 3  Both  Bounded NBD             High              Low     50000   
 4  Both  Bounded NBD              Low             High     10000   
 
    precision_mean  recall_mean  f1score_mean  
 0        0.998371     0.751479      0.857191  
 1        0.999559     0.708233      0.828795  
 2        0.999872     0.935863      0.966809  
 3        1.000000     0.940335      0.969249  
 4        1.000000     0.750833      0.857600  ,
   model        prior duplicates_level distortion_level  num_iter  \
 0  Both  Bounded NBD             High             High     10000   
 1  Both  Bounded NBD             High             High     50000   
 2  Both  Bounded NBD             High              Low     10000   
 3  Both  Bounded N

In [None]:
# Group by the specified columns
grouped_data = data_final.groupby(['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

# Define the metrics you want to calculate the bounds for
metrics = ['precision', 'recall', 'f1score']

# Initialize an empty DataFrame to store the bounds
bounds_df = pd.DataFrame()

# Calculate the lower and upper bounds for each metric
for metric in metrics:
    # Calculate the quantiles for the current metric
    bounds = grouped_data[metric].quantile([0.025, 0.975]).unstack(level=-1)
    
    # Rename columns to reflect the metric and bound type
    bounds.columns = [f'{metric}_lower_bound', f'{metric}_upper_bound']
    
    # If bounds_df is empty, initialize it with the current bounds
    if bounds_df.empty:
        bounds_df = bounds.reset_index()
    else:
        # Otherwise, merge the new bounds into the existing DataFrame
        bounds_df = bounds_df.merge(bounds.reset_index(), on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

# Now you have a DataFrame with lower and upper bounds for each metric
print(bounds_df.head())



  model        prior duplicates_level distortion_level  num_iter  \
0  Both  Bounded NBD             High             High     10000   
1  Both  Bounded NBD             High             High     50000   
2  Both  Bounded NBD             High              Low     10000   
3  Both  Bounded NBD             High              Low     50000   
4  Both  Bounded NBD              Low             High     10000   

   precision_lower_bound  precision_upper_bound  recall_lower_bound  \
0               0.995247                    1.0            0.696962   
1               0.997708                    1.0            0.689808   
2               0.998661                    1.0            0.933706   
3               1.000000                    1.0            0.936901   
4               1.000000                    1.0            0.733333   

   recall_upper_bound  f1score_lower_bound  f1score_upper_bound  
0            0.795564             0.821318             0.885764  
1            0.760652           

In [None]:
mean_values.shape

(192, 8)

In [None]:
median_values.shape

(192, 8)

In [None]:
# Merge mean and standard deviation dataframes on the groupby columns
combined_data_median = pd.merge(median_values, std_dev_values, 
                         on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])


pd.set_option('display.max_rows', 20)


# Display the combined dataframe
combined_data_median


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,precision_median,recall_median,f1score_median,precision_std,recall_std,f1score_std
0,Both,Bounded NBD,High,High,10000,0.997928,0.756994,0.860777,0.001466,0.029411,0.019271
1,Both,Bounded NBD,High,High,50000,1.000000,0.696643,0.820924,0.000905,0.025848,0.017251
2,Both,Bounded NBD,High,Low,10000,1.000000,0.935303,0.966570,0.000570,0.001298,0.000694
3,Both,Bounded NBD,High,Low,50000,1.000000,0.940495,0.969335,0.000000,0.002042,0.001085
4,Both,Bounded NBD,Low,High,10000,1.000000,0.750000,0.857143,0.000000,0.015742,0.010213
...,...,...,...,...,...,...,...,...,...,...,...
187,,uniform,Medium,Low,50000,1.000000,0.924242,0.960630,0.000799,0.004298,0.002397
188,,uniform,Very High,High,10000,1.000000,0.827297,0.905487,0.000566,0.004131,0.002543
189,,uniform,Very High,High,50000,1.000000,0.834675,0.909888,0.000000,0.002605,0.001545
190,,uniform,Very High,Low,10000,1.000000,0.921762,0.959288,0.000000,0.000917,0.000496


In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
combined_data.to_csv("clean_data.csv")

In [None]:
combined_data_median.to_csv("clean_data_median.csv")

In [None]:
# Assuming that 'combined_data_median' is your main DataFrame and 'bounds_df' is the DataFrame with the bounds
# Merge the two DataFrames on the common columns
merged_data = pd.merge(combined_data_median, bounds_df, 
                       on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'], 
                       how='left')

# Display the merged DataFrame
print(merged_data.head())

  model        prior duplicates_level distortion_level  num_iter  \
0  Both  Bounded NBD             High             High     10000   
1  Both  Bounded NBD             High             High     50000   
2  Both  Bounded NBD             High              Low     10000   
3  Both  Bounded NBD             High              Low     50000   
4  Both  Bounded NBD              Low             High     10000   

   precision_median  recall_median  f1score_median  precision_std  recall_std  \
0          0.997928       0.756994        0.860777       0.001466    0.029411   
1          1.000000       0.696643        0.820924       0.000905    0.025848   
2          1.000000       0.935303        0.966570       0.000570    0.001298   
3          1.000000       0.940495        0.969335       0.000000    0.002042   
4          1.000000       0.750000        0.857143       0.000000    0.015742   

   f1score_std  precision_lower_bound  precision_upper_bound  \
0     0.019271               0.995247   

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
merged_data

Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,precision_median,recall_median,f1score_median,precision_std,recall_std,f1score_std,precision_lower_bound,precision_upper_bound,recall_lower_bound,recall_upper_bound,f1score_lower_bound,f1score_upper_bound
0,Both,Bounded NBD,High,High,10000,0.997928,0.756994,0.860777,0.001466,0.029411,0.019271,0.995247,1.0,0.696962,0.795564,0.821318,0.885764
1,Both,Bounded NBD,High,High,50000,1.000000,0.696643,0.820924,0.000905,0.025848,0.017251,0.997708,1.0,0.689808,0.760652,0.816433,0.864056
2,Both,Bounded NBD,High,Low,10000,1.000000,0.935303,0.966570,0.000570,0.001298,0.000694,0.998661,1.0,0.933706,0.937740,0.965717,0.967869
3,Both,Bounded NBD,High,Low,50000,1.000000,0.940495,0.969335,0.000000,0.002042,0.001085,1.000000,1.0,0.936901,0.943750,0.967423,0.971061
4,Both,Bounded NBD,Low,High,10000,1.000000,0.750000,0.857143,0.000000,0.015742,0.010213,1.000000,1.0,0.733333,0.783333,0.846154,0.878505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,,uniform,Medium,Low,50000,1.000000,0.924242,0.960630,0.000799,0.004298,0.002397,0.997812,1.0,0.919192,0.931414,0.957366,0.964488
188,,uniform,Very High,High,10000,1.000000,0.827297,0.905487,0.000566,0.004131,0.002543,0.998334,1.0,0.821211,0.834054,0.901548,0.909519
189,,uniform,Very High,High,50000,1.000000,0.834675,0.909888,0.000000,0.002605,0.001545,1.000000,1.0,0.831942,0.840728,0.908262,0.913473
190,,uniform,Very High,Low,10000,1.000000,0.921762,0.959288,0.000000,0.000917,0.000496,1.000000,1.0,0.920726,0.923734,0.958727,0.960355


In [None]:
merged_data.to_csv("merged_data_median_bounds.csv")