In [107]:
import pandas as pd

pd.set_option('display.max_rows', 10)

# Create an empty DataFrame with the specified columns
df = pd.read_csv("data.csv", index_col=0)

# Show the empty DataFrame
data = df


In [108]:
# Replace NaN values with 0
data_filled = data.fillna(0)

# Ensure all columns except 'replicates' (since it might contain non-numeric values originally) are numeric
data_filled = data_filled.apply(pd.to_numeric, errors='ignore')

# Display the data types to confirm changes
data_filled.dtypes, data_filled.head()

data_filled


Unnamed: 0,round,split,iter,replicates,duplicates,distortion,m1,m2,m3
0,-1,0.0,10k,0,8.0,1,1.000000,0.745803,0.854396
1,-1,0.0,10k,0,100.0,1,0.997500,0.802817,0.889632
2,-1,0.0,10k,0,0.1,0,0.967742,0.952381,0.960000
3,-1,0.0,10k,0,1.0,0,1.000000,0.919192,0.957895
4,-1,0.0,10k,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,-12,0.0,50k,_20,100.0,0,1.000000,0.921762,0.959288
3844,-12,0.0,50k,_20,0.1,0,0.983607,0.952381,0.967742
3845,-12,0.0,50k,_20,0.1,1,1.000000,0.766667,0.867924
3846,-12,0.0,50k,_20,8.0,1,0.997895,0.757794,0.861427


In [109]:
# Remove "_" from the 'replicates' column and convert to numeric if possible
data_filled['replicates'] = data_filled['replicates'].astype(str).str.replace("_", "")
data_filled['replicates'] = pd.to_numeric(data_filled['replicates'], errors='coerce').fillna(0)

# Make 'round' and 'split' values positive
data_filled['round'] = data_filled['round'].abs()
data_filled['split'] = data_filled['split'].abs()


# Rename columns 'm1', 'm2', 'm3' to 'precision', 'recall', 'f1score'
data_renamed = data_filled.rename(columns={'m1': 'precision', 'm2': 'recall', 'm3': 'f1score'})

# Display the first few rows to confirm the column names have been updated
data_renamed


Unnamed: 0,round,split,iter,replicates,duplicates,distortion,precision,recall,f1score
0,1,0.0,10k,0,8.0,1,1.000000,0.745803,0.854396
1,1,0.0,10k,0,100.0,1,0.997500,0.802817,0.889632
2,1,0.0,10k,0,0.1,0,0.967742,0.952381,0.960000
3,1,0.0,10k,0,1.0,0,1.000000,0.919192,0.957895
4,1,0.0,10k,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,12,0.0,50k,20,100.0,0,1.000000,0.921762,0.959288
3844,12,0.0,50k,20,0.1,0,0.983607,0.952381,0.967742
3845,12,0.0,50k,20,0.1,1,1.000000,0.766667,0.867924
3846,12,0.0,50k,20,8.0,1,0.997895,0.757794,0.861427


In [110]:
# Define a function to map 'round' values to model names
def map_round_to_model(round_number):
    if 1 <= round_number <= 3:
        return "Both"
    elif 4 <= round_number <= 6:
        return "No Diri"
    elif 7 <= round_number <= 9:
        return "None"
    elif 10 <= round_number <= 12:
        return "No Empirical"
    else:
        return "Unknown"  # For any round numbers outside the specified ranges

# Apply the mapping function to the 'round' column to create the new 'model' column
data_renamed['model'] = data_renamed['round'].apply(map_round_to_model)



# Define a function to map 'round' values to prior categories
def map_round_to_prior(round_number):
    if round_number in [1, 4, 7, 10]:
        return "Pitman"
    elif round_number in [2, 5, 8, 11]:
        return "uniform"
    elif round_number in [3, 6, 9, 12]:
        return "Bounded NBD"
    else:
        return "Unknown"  # For any round numbers outside the specified or considered ranges

# Apply the mapping function to the 'round' column to create the new 'prior' column
data_renamed['prior'] = data_renamed['round'].apply(map_round_to_prior)

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    if duplicates_value == 0.1:
        return "Low"
    elif duplicates_value == 1.0:
        return "Medium"
    elif duplicates_value == 8.0:
        return "High"
    elif duplicates_value == 100.0:
        return "Very High"
    else:
        return "Unknown"  # For any duplicates values outside the specified ranges

# Apply the mapping function to the 'duplicates' column to create the new 'duplicates_level' column
data_renamed['duplicates_level'] = data_renamed['duplicates'].apply(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    if distortion_value == 0:
        return "Low"
    elif distortion_value == 1:
        return "High"
    else:
        return "Unknown"  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['distortion_level'] = data_renamed['distortion'].apply(map_distortion_to_level)

def map_iter_to_num(iter):
    if iter == '10k':
        return 10000
    elif iter == '50k':
        return 50000
    else:
        return 0  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['num_iter'] = data_renamed['iter'].apply(map_iter_to_num)

# Select only the specified columns
data_selected_columns = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates']]

# Select and reorder the dataset with only the specified columns
data_final = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter',
                           'replicates', 'precision', 'recall', 'f1score']]

# Display the first few rows to confirm the changes
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
0,Both,Pitman,High,High,10000,0,1.000000,0.745803,0.854396
1,Both,Pitman,Very High,High,10000,0,0.997500,0.802817,0.889632
2,Both,Pitman,Low,Low,10000,0,0.967742,0.952381,0.960000
3,Both,Pitman,Medium,Low,10000,0,1.000000,0.919192,0.957895
4,Both,Pitman,Low,High,10000,0,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,No Empirical,Bounded NBD,Very High,Low,50000,20,1.000000,0.921762,0.959288
3844,No Empirical,Bounded NBD,Low,Low,50000,20,0.983607,0.952381,0.967742
3845,No Empirical,Bounded NBD,Low,High,50000,20,1.000000,0.766667,0.867924
3846,No Empirical,Bounded NBD,High,High,50000,20,0.997895,0.757794,0.861427


In [111]:
data_cleaned = data_final

# Finding duplicates and keeping them to show
duplicates_to_show = data_cleaned[data_cleaned.duplicated(subset=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'], keep='first')]

# Sorting the duplicates for better readability
duplicates_sorted = duplicates_to_show.sort_values(by=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'])

pd.set_option('display.max_rows', None)

duplicates_sorted


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
3525,No Empirical,Pitman,High,High,50000,0,1.0,0.756195,0.861174
3527,No Empirical,Pitman,High,Low,50000,0,1.0,0.942492,0.970395
3520,No Empirical,Pitman,Low,High,50000,0,1.0,0.75,0.857143
3524,No Empirical,Pitman,Low,Low,50000,0,0.983607,0.952381,0.967742
3521,No Empirical,Pitman,Medium,High,50000,0,0.99187,0.75154,0.85514
3523,No Empirical,Pitman,Medium,Low,50000,0,1.0,0.929293,0.963351
3522,No Empirical,Pitman,Very High,High,50000,0,0.997559,0.822267,0.901471
3526,No Empirical,Pitman,Very High,Low,50000,0,1.0,0.925049,0.961066


In [112]:
data_cleaned.drop_duplicates(inplace=True)
data_cleaned.reset_index(drop=True, inplace=True)
data_cleaned.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.drop_duplicates(inplace=True)


(3840, 9)

In [113]:
# Checking for missing combinations in the first 6 columns
# First, we create a DataFrame of all possible combinations of these columns

# Get unique values for each of the first 6 columns
model_values = data_cleaned['model'].unique()
prior_values = data_cleaned['prior'].unique()
duplicates_level_values = data_cleaned['duplicates_level'].unique()
distortion_level_values = data_cleaned['distortion_level'].unique()
num_iter_values = data_cleaned['num_iter'].unique()
replicates_values = data_cleaned['replicates'].unique()

# Use itertools.product to create all possible combinations
from itertools import product
all_combinations = list(product(model_values, prior_values, duplicates_level_values, distortion_level_values, num_iter_values, replicates_values))

# Convert to DataFrame
all_combinations_df = pd.DataFrame(all_combinations, columns=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'])

# Now, find missing combinations by checking which rows in all_combinations_df are not present in data_cleaned_no_duplicates
missing_combinations = pd.merge(all_combinations_df, data_cleaned, on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'], how='left', indicator=True).query('_merge == "left_only"').drop(columns=['precision', 'recall', 'f1score', '_merge'])

missing_combinations_count = missing_combinations.shape[0]

print(missing_combinations_count)
pd.set_option('display.max_rows', None)
missing_combinations


0


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates


In [114]:
pd.set_option('display.max_rows', 10)

In [115]:
# Group by the specified columns and compute mean and standard deviation for precision, recall, and f1score
grouped_data = data_final.groupby(['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])

# Compute mean
mean_values = grouped_data[['precision', 'recall', 'f1score']].mean().reset_index()

# Compute standard deviation
std_dev_values = grouped_data[['precision', 'recall', 'f1score']].std().reset_index()

# Rename columns for clarity
mean_values.rename(columns={
    'precision': 'precision_mean', 
    'recall': 'recall_mean', 
    'f1score': 'f1score_mean'}, inplace=True)

std_dev_values.rename(columns={
    'precision': 'precision_std', 
    'recall': 'recall_std', 
    'f1score': 'f1score_std'}, inplace=True)

# Display the mean values
mean_values.head(), std_dev_values.head()


(  model        prior duplicates_level distortion_level  num_iter  \
 0  Both  Bounded NBD             High             High     10000   
 1  Both  Bounded NBD             High             High     50000   
 2  Both  Bounded NBD             High              Low     10000   
 3  Both  Bounded NBD             High              Low     50000   
 4  Both  Bounded NBD              Low             High     10000   
 
    precision_mean  recall_mean  f1score_mean  
 0        0.998371     0.751479      0.857191  
 1        0.999559     0.708233      0.828795  
 2        0.999872     0.935863      0.966809  
 3        1.000000     0.940335      0.969249  
 4        1.000000     0.750833      0.857600  ,
   model        prior duplicates_level distortion_level  num_iter  \
 0  Both  Bounded NBD             High             High     10000   
 1  Both  Bounded NBD             High             High     50000   
 2  Both  Bounded NBD             High              Low     10000   
 3  Both  Bounded N

In [116]:
mean_values.shape

(192, 8)

In [117]:
# Merge mean and standard deviation dataframes on the groupby columns
combined_data = pd.merge(mean_values, std_dev_values, 
                         on=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter'])


pd.set_option('display.max_rows', None)


# Display the combined dataframe
combined_data


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,precision_mean,recall_mean,f1score_mean,precision_std,recall_std,f1score_std
0,Both,Bounded NBD,High,High,10000,0.998371,0.751479,0.857191,0.001466,0.029411,0.019271
1,Both,Bounded NBD,High,High,50000,0.999559,0.708233,0.828795,0.000905,0.025848,0.017251
2,Both,Bounded NBD,High,Low,10000,0.999872,0.935863,0.966809,0.00057,0.001298,0.000694
3,Both,Bounded NBD,High,Low,50000,1.0,0.940335,0.969249,0.0,0.002042,0.001085
4,Both,Bounded NBD,Low,High,10000,1.0,0.750833,0.8576,0.0,0.015742,0.010213
5,Both,Bounded NBD,Low,High,50000,1.0,0.741667,0.851648,0.0,0.00855,0.005637
6,Both,Bounded NBD,Low,Low,10000,0.984426,0.952381,0.968135,0.003666,0.0,0.001759
7,Both,Bounded NBD,Low,Low,50000,0.980434,0.952381,0.966194,0.006511,0.0,0.003177
8,Both,Bounded NBD,Medium,High,10000,0.985199,0.760986,0.858368,0.006859,0.026884,0.015021
9,Both,Bounded NBD,Medium,High,50000,0.990093,0.738501,0.845984,0.00136,0.002848,0.00179


In [118]:
pd.set_option('display.max_rows', 10)

In [119]:
combined_data.to_csv("clean_data.csv")