In [109]:
import pandas as pd

pd.set_option('display.max_rows', 10)

# Create an empty DataFrame with the specified columns
df = pd.read_csv("data20.csv", index_col=0)

# Show the empty DataFrame
data = df


In [110]:
# Replace NaN values with 0
data_filled = data.fillna(0)

# Ensure all columns except 'replicates' (since it might contain non-numeric values originally) are numeric
data_filled = data_filled.apply(pd.to_numeric, errors='ignore')

# Display the data types to confirm changes
data_filled.dtypes, data_filled.head()

data_filled


Unnamed: 0,round,split,iter,replicates,duplicates,distortion,m1,m2,m3
0,-1,0.0,10k,0,8.0,1,1.000000,0.745803,0.854396
1,-1,0.0,10k,0,100.0,1,0.997500,0.802817,0.889632
2,-1,0.0,10k,0,0.1,0,0.967742,0.952381,0.960000
3,-1,0.0,10k,0,1.0,0,1.000000,0.919192,0.957895
4,-1,0.0,10k,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,-12,0.0,50k,_20,100.0,0,1.000000,0.921762,0.959288
3844,-12,0.0,50k,_20,0.1,0,0.983607,0.952381,0.967742
3845,-12,0.0,50k,_20,0.1,1,1.000000,0.766667,0.867924
3846,-12,0.0,50k,_20,8.0,1,0.997895,0.757794,0.861427


In [111]:
# Remove "_" from the 'replicates' column and convert to numeric if possible
data_filled['replicates'] = data_filled['replicates'].astype(str).str.replace("_", "")
data_filled['replicates'] = pd.to_numeric(data_filled['replicates'], errors='coerce').fillna(0)

# Make 'round' and 'split' values positive
data_filled['round'] = data_filled['round'].abs()
data_filled['split'] = data_filled['split'].abs()


# Rename columns 'm1', 'm2', 'm3' to 'precision', 'recall', 'f1score'
data_renamed = data_filled.rename(columns={'m1': 'precision', 'm2': 'recall', 'm3': 'f1score'})

# Display the first few rows to confirm the column names have been updated
data_renamed


Unnamed: 0,round,split,iter,replicates,duplicates,distortion,precision,recall,f1score
0,1,0.0,10k,0,8.0,1,1.000000,0.745803,0.854396
1,1,0.0,10k,0,100.0,1,0.997500,0.802817,0.889632
2,1,0.0,10k,0,0.1,0,0.967742,0.952381,0.960000
3,1,0.0,10k,0,1.0,0,1.000000,0.919192,0.957895
4,1,0.0,10k,0,0.1,1,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,12,0.0,50k,20,100.0,0,1.000000,0.921762,0.959288
3844,12,0.0,50k,20,0.1,0,0.983607,0.952381,0.967742
3845,12,0.0,50k,20,0.1,1,1.000000,0.766667,0.867924
3846,12,0.0,50k,20,8.0,1,0.997895,0.757794,0.861427


In [112]:
# Define a function to map 'round' values to model names
def map_round_to_model(round_number):
    if 1 <= round_number <= 3:
        return "Both"
    elif 4 <= round_number <= 6:
        return "No Diri"
    elif 7 <= round_number <= 9:
        return "None"
    elif 10 <= round_number <= 12:
        return "No Empirical"
    else:
        return "Unknown"  # For any round numbers outside the specified ranges

# Apply the mapping function to the 'round' column to create the new 'model' column
data_renamed['model'] = data_renamed['round'].apply(map_round_to_model)



# Define a function to map 'round' values to prior categories
def map_round_to_prior(round_number):
    if round_number in [1, 4, 7, 10]:
        return "Pitman"
    elif round_number in [2, 5, 8, 11]:
        return "uniform"
    elif round_number in [3, 6, 9, 12]:
        return "Bounded NBD"
    else:
        return "Unknown"  # For any round numbers outside the specified or considered ranges

# Apply the mapping function to the 'round' column to create the new 'prior' column
data_renamed['prior'] = data_renamed['round'].apply(map_round_to_prior)

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    if duplicates_value == 0.1:
        return "Low"
    elif duplicates_value == 1.0:
        return "Medium"
    elif duplicates_value == 8.0:
        return "High"
    elif duplicates_value == 100.0:
        return "Very High"
    else:
        return "Unknown"  # For any duplicates values outside the specified ranges

# Apply the mapping function to the 'duplicates' column to create the new 'duplicates_level' column
data_renamed['duplicates_level'] = data_renamed['duplicates'].apply(map_duplicates_to_level)

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    if distortion_value == 0:
        return "Low"
    elif distortion_value == 1:
        return "High"
    else:
        return "Unknown"  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['distortion_level'] = data_renamed['distortion'].apply(map_distortion_to_level)

def map_iter_to_num(iter):
    if iter == '10k':
        return 10000
    elif iter == '50k':
        return 50000
    else:
        return 0  # For any distortion values outside the specified values

# Apply the mapping function to the 'distortion' column to create the new 'distortion_level' column
data_renamed['num_iter'] = data_renamed['iter'].apply(map_iter_to_num)

# Select only the specified columns
data_selected_columns = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level', 'replicates']]

# Select and reorder the dataset with only the specified columns
data_final = data_renamed[['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter',
                           'replicates', 'precision', 'recall', 'f1score']]

# Display the first few rows to confirm the changes
data_final






Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
0,Both,Pitman,High,High,10000,0,1.000000,0.745803,0.854396
1,Both,Pitman,Very High,High,10000,0,0.997500,0.802817,0.889632
2,Both,Pitman,Low,Low,10000,0,0.967742,0.952381,0.960000
3,Both,Pitman,Medium,Low,10000,0,1.000000,0.919192,0.957895
4,Both,Pitman,Low,High,10000,0,1.000000,0.766667,0.867924
...,...,...,...,...,...,...,...,...,...
3843,No Empirical,Bounded NBD,Very High,Low,50000,20,1.000000,0.921762,0.959288
3844,No Empirical,Bounded NBD,Low,Low,50000,20,0.983607,0.952381,0.967742
3845,No Empirical,Bounded NBD,Low,High,50000,20,1.000000,0.766667,0.867924
3846,No Empirical,Bounded NBD,High,High,50000,20,0.997895,0.757794,0.861427


In [113]:
data_cleaned = data_final

# Finding duplicates and keeping them to show
duplicates_to_show = data_cleaned[data_cleaned.duplicated(subset=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'], keep='first')]

# Sorting the duplicates for better readability
duplicates_sorted = duplicates_to_show.sort_values(by=['model', 'prior', 'duplicates_level', 'distortion_level', 'num_iter', 'replicates'])

pd.set_option('display.max_rows', 10)

duplicates_sorted


Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
3525,No Empirical,Pitman,High,High,50000,0,1.0,0.756195,0.861174
3527,No Empirical,Pitman,High,Low,50000,0,1.0,0.942492,0.970395
3520,No Empirical,Pitman,Low,High,50000,0,1.0,0.75,0.857143
3524,No Empirical,Pitman,Low,Low,50000,0,0.983607,0.952381,0.967742
3521,No Empirical,Pitman,Medium,High,50000,0,0.99187,0.75154,0.85514
3523,No Empirical,Pitman,Medium,Low,50000,0,1.0,0.929293,0.963351
3522,No Empirical,Pitman,Very High,High,50000,0,0.997559,0.822267,0.901471
3526,No Empirical,Pitman,Very High,Low,50000,0,1.0,0.925049,0.961066


In [114]:
data_cleaned.drop_duplicates(inplace=True)
data_cleaned.reset_index(drop=True, inplace=True)
data_cleaned.to_csv('temp.csv')
data_cleaned.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.drop_duplicates(inplace=True)


(3840, 9)

In [115]:
data_cleaned_10k = data_cleaned[data_cleaned['num_iter'] == 10000]
data_cleaned_50k = data_cleaned[data_cleaned['num_iter'] == 50000]

In [116]:
data_cleaned_50k = data_cleaned_50k[((data_cleaned_50k["model"] == 'Both') & (data_cleaned_50k["distortion_level"] == "High")) |
                                    ((data_cleaned_50k["model"] == 'No Diri') & (data_cleaned_50k["distortion_level"] == "High") & (data_cleaned_50k["duplicates_level"] == "High")) | 
                                    ((data_cleaned_50k["model"] == 'No Empirical') & (data_cleaned_50k["distortion_level"] == "High") & (data_cleaned_50k["duplicates_level"] == "Medium")) |
                                    ((data_cleaned_50k["model"] == 'None') & (data_cleaned_50k["distortion_level"] == "High") & ((data_cleaned_50k["duplicates_level"] == "High") | (data_cleaned_50k["duplicates_level"] == "Medium")))]

data_cleaned_50k

Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
1922,Both,Pitman,Very High,High,50000,0,1.000000,0.721663,0.838333
1923,Both,Pitman,High,High,50000,0,1.000000,0.685851,0.813656
1926,Both,Pitman,Medium,High,50000,0,0.991667,0.733059,0.842975
1927,Both,Pitman,Low,High,50000,0,1.000000,0.750000,0.857143
1928,Both,Pitman,Very High,High,50000,2,1.000000,0.716969,0.835156
...,...,...,...,...,...,...,...,...,...
3803,No Empirical,Bounded NBD,Medium,High,50000,16,0.984456,0.780288,0.870561
3810,No Empirical,Bounded NBD,Medium,High,50000,17,0.994609,0.757700,0.860140
3816,No Empirical,Bounded NBD,Medium,High,50000,18,0.991758,0.741273,0.848414
3824,No Empirical,Bounded NBD,Medium,High,50000,19,0.981959,0.782341,0.870857


In [117]:
data_cleaned_10k = data_cleaned_10k[~(((data_cleaned_10k["model"] == 'Both') & (data_cleaned_10k["distortion_level"] == "High")) |
                                    ((data_cleaned_10k["model"] == 'No Diri') & (data_cleaned_10k["distortion_level"] == "High") & (data_cleaned_10k["duplicates_level"] == "High")) | 
                                    ((data_cleaned_10k["model"] == 'No Empirical') & (data_cleaned_10k["distortion_level"] == "High") & (data_cleaned_10k["duplicates_level"] == "Medium")) |
                                    ((data_cleaned_10k["model"] == 'None') & (data_cleaned_10k["distortion_level"] == "High") & ((data_cleaned_10k["duplicates_level"] == "High") | (data_cleaned_10k["duplicates_level"] == "Medium"))))]

data_cleaned_50k

Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
1922,Both,Pitman,Very High,High,50000,0,1.000000,0.721663,0.838333
1923,Both,Pitman,High,High,50000,0,1.000000,0.685851,0.813656
1926,Both,Pitman,Medium,High,50000,0,0.991667,0.733059,0.842975
1927,Both,Pitman,Low,High,50000,0,1.000000,0.750000,0.857143
1928,Both,Pitman,Very High,High,50000,2,1.000000,0.716969,0.835156
...,...,...,...,...,...,...,...,...,...
3803,No Empirical,Bounded NBD,Medium,High,50000,16,0.984456,0.780288,0.870561
3810,No Empirical,Bounded NBD,Medium,High,50000,17,0.994609,0.757700,0.860140
3816,No Empirical,Bounded NBD,Medium,High,50000,18,0.991758,0.741273,0.848414
3824,No Empirical,Bounded NBD,Medium,High,50000,19,0.981959,0.782341,0.870857


In [118]:
data_cleaned_20 = pd.concat([data_cleaned_10k, data_cleaned_50k])
data_cleaned_20.reset_index(drop=True, inplace=True)
data_cleaned_20.to_csv("clean_data20.csv")

Unnamed: 0,model,prior,duplicates_level,distortion_level,num_iter,replicates,precision,recall,f1score
2,Both,Pitman,Low,Low,10000,0,0.967742,0.952381,0.960000
3,Both,Pitman,Medium,Low,10000,0,1.000000,0.919192,0.957895
6,Both,Pitman,High,Low,10000,0,1.000000,0.940895,0.969547
7,Both,Pitman,Very High,Low,10000,0,1.000000,0.921762,0.959288
10,Both,Pitman,High,Low,10000,2,1.000000,0.935303,0.966570
...,...,...,...,...,...,...,...,...,...
3803,No Empirical,Bounded NBD,Medium,High,50000,16,0.984456,0.780288,0.870561
3810,No Empirical,Bounded NBD,Medium,High,50000,17,0.994609,0.757700,0.860140
3816,No Empirical,Bounded NBD,Medium,High,50000,18,0.991758,0.741273,0.848414
3824,No Empirical,Bounded NBD,Medium,High,50000,19,0.981959,0.782341,0.870857
