In [253]:
import pandas as pd

# Cap rendered tables at 10 rows so cell outputs stay short.
pd.set_option('display.max_rows', 10)

# Read the results table from disk; the first CSV column becomes the index.
# `df` and `data` are two names for the same DataFrame object (no copy is made).
data = df = pd.read_csv("data.csv", index_col=0)

# Render the loaded table as the cell output.
data


Unnamed: 0,model,cluster_size,num_iter,duplicates,distortion,m1,m2,m3,date
0,Both,2,10000,8.0,0,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
1,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_11_53_29_2024
2,Both,2,10000,0.1,0,0.982456,0.888889,0.933333,Wed_Mar__6_11_21_54_2024
3,Both,2,10000,8.0,0,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
4,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_12_56_53_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,1.0,1,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,1.0,1,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,1.0,1,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,1.0,1,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [254]:
def _numeric_where_possible(column):
    """Return `column` converted with `pd.to_numeric`, or unchanged if it cannot be parsed.

    Explicit replacement for `pd.to_numeric(..., errors='ignore')`, which is
    deprecated in pandas 2.2.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Replace missing model labels with "Neither". Only 'model' is filled so that a
# missing value in a numeric column is never overwritten with a string.
# NOTE(review): presumably the missing labels come from a "None" model name that
# read_csv parsed as NaN - confirm against data.csv.
# fillna returns a new frame, so the frame still bound to `df` keeps the raw values.
data = data.fillna({"model": "Neither"})

# Convert every column that parses as numeric; text columns are passed through as-is.
data_filled = data.apply(_numeric_where_possible)

# Render the filled table as the cell output.
data_filled


Unnamed: 0,model,cluster_size,num_iter,duplicates,distortion,m1,m2,m3,date
0,Both,2,10000,8.0,0,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
1,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_11_53_29_2024
2,Both,2,10000,0.1,0,0.982456,0.888889,0.933333,Wed_Mar__6_11_21_54_2024
3,Both,2,10000,8.0,0,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
4,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_12_56_53_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,1.0,1,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,1.0,1,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,1.0,1,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,1.0,1,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [255]:


# Relabel the generic metric columns: m1 -> precision, m2 -> recall, m3 -> f1score.
metric_names = {'m1': 'precision', 'm2': 'recall', 'm3': 'f1score'}
data_renamed = data_filled.rename(columns=metric_names)

# Render the renamed table as the cell output.
data_renamed


Unnamed: 0,model,cluster_size,num_iter,duplicates,distortion,precision,recall,f1score,date
0,Both,2,10000,8.0,0,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
1,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_11_53_29_2024
2,Both,2,10000,0.1,0,0.982456,0.888889,0.933333,Wed_Mar__6_11_21_54_2024
3,Both,2,10000,8.0,0,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
4,Both,2,10000,1.0,0,1.000000,0.535354,0.697368,Wed_Mar__6_12_56_53_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,1.0,1,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,1.0,1,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,1.0,1,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,1.0,1,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [256]:

# Define a function to map 'duplicates' values to duplicates level categories
def map_duplicates_to_level(duplicates_value):
    """Translate a numeric 'duplicates' setting into its level label.

    Returns "Low" for 0.1, "Medium" for 1.0, "High" for 8.0, "Very High"
    for 100.0, and "Unknown" for any other value.
    """
    labelled_settings = (
        (0.1, "Low"),
        (1.0, "Medium"),
        (8.0, "High"),
        (100.0, "Very High"),
    )
    for setting, label in labelled_settings:
        if duplicates_value == setting:
            return label
    # The value matched none of the listed settings.
    return "Unknown"

# Label every row by passing its 'duplicates' value through map_duplicates_to_level;
# the labels are stored in a new 'duplicates_level' column.
duplicates_labels = data_renamed['duplicates'].map(map_duplicates_to_level)
data_renamed['duplicates_level'] = duplicates_labels

# Define a function to map 'distortion' values to distortion level categories
def map_distortion_to_level(distortion_value):
    """Translate a numeric 'distortion' flag into its level label.

    Returns "Low" for 0, "High" for 1, and "Unknown" for any other value.
    """
    return (
        "Low" if distortion_value == 0
        else "High" if distortion_value == 1
        else "Unknown"
    )

# Label every row by passing its 'distortion' value through map_distortion_to_level;
# the labels are stored in a new 'distortion_level' column.
distortion_labels = data_renamed['distortion'].map(map_distortion_to_level)
data_renamed['distortion_level'] = distortion_labels


# Keep only these columns, in this order. The numeric 'duplicates' and
# 'distortion' columns are left out; their '_level' labels are kept instead.
final_columns = ['model', 'cluster_size', 'num_iter', 'duplicates_level', 'distortion_level',
                 'precision', 'recall', 'f1score', "date"]
data_final = data_renamed[final_columns]

# Render the selected table as the cell output.
data_final






Unnamed: 0,model,cluster_size,num_iter,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,2,10000,High,Low,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
1,Both,2,10000,Medium,Low,1.000000,0.535354,0.697368,Wed_Mar__6_11_53_29_2024
2,Both,2,10000,Low,Low,0.982456,0.888889,0.933333,Wed_Mar__6_11_21_54_2024
3,Both,2,10000,High,Low,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
4,Both,2,10000,Medium,Low,1.000000,0.535354,0.697368,Wed_Mar__6_12_56_53_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,Medium,High,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,Medium,High,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,Medium,High,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,Medium,High,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [257]:
# Limit rendered tables to 10 rows (the same value the first cell of this notebook sets).
pd.set_option("display.max_rows", 10)

In [258]:
# Count the rows for each distinct combination of the configuration columns,
# most frequent combination first.
config_columns = ['model', 'cluster_size', 'num_iter', 'duplicates_level', 'distortion_level']
data_final[config_columns].value_counts()

model    cluster_size  num_iter  duplicates_level  distortion_level
No Diri  2             50000     High              High                58
Both     10            10000     Very High         Low                 56
                                 Medium            Low                 56
                                 Low               Low                 56
                                 High              Low                 56
                                                                       ..
                                                   High                 1
         2             10000     Very High         High                 1
                                 Medium            High                 1
                                 Low               High                 1
                                 High              High                 1
Name: count, Length: 79, dtype: int64

In [259]:
# Columns that together identify one configuration.
subset_columns = ['model', 'cluster_size', 'num_iter', 'duplicates_level', 'distortion_level']

# True for every row whose configuration occurs more than once
# (keep=False flags all members of a repeated configuration, not just the later ones).
duplicates = data_final.duplicated(subset=subset_columns, keep=False)

# Complementary mask: True for rows whose configuration occurs exactly once.
unique_rows = ~duplicates

# Keep only the rows of repeated configurations, dropping the one-off ones.
data_final = data_final.loc[duplicates]



In [260]:
# Lift the cap on the number of rows pandas renders.
pd.set_option("display.max_rows", None)

# Keep at most the first 50 rows of each configuration. GroupBy.head does this
# directly and returns the original rows with their original index, so there is
# no need for groupby.apply with a lambda, which operates on the grouping columns
# (deprecated behaviour in pandas 2.2). Rows keep their existing relative order.
data_final = data_final.groupby(subset_columns).head(50)

# NOTE(review): only a cell's last expression is rendered, so this count table
# is computed but never shown.
data_final.value_counts(subset=['model', 'cluster_size', 'num_iter', 'duplicates_level', 'distortion_level'])

# (rows, columns) of the capped table; this is the cell output.
data_final.shape

(3600, 9)

In [261]:
# Go back to rendering at most 10 rows per table.
pd.set_option("display.max_rows", 10)

# Order the rows by the configuration columns, ascending.
data_final = data_final.sort_values(subset_columns)

# Render the sorted table as the cell output.
data_final

Unnamed: 0,model,cluster_size,num_iter,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,2,10000,High,Low,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
3,Both,2,10000,High,Low,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
5,Both,2,10000,High,Low,1.000000,0.337859,0.505075,Wed_Mar__6_12_56_55_2024
7,Both,2,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_14_05_23_2024
20,Both,2,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_13_31_31_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,Medium,High,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,Medium,High,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,Medium,High,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,Medium,High,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [262]:
import pandas as pd


# Configurations removed from the table, written as
# (model, duplicates_level, distortion_level). Each one is removed only for
# rows with num_iter == 10000, regardless of cluster_size.
# NOTE(review): the reason these four are excluded is not recorded here - confirm.
excluded_configs = [
    ("No Diri", 'High', 'High'),
    ("No Empirical", 'Medium', 'High'),
    ("Neither", 'Medium', 'High'),
    ("Neither", 'High', 'High'),
]

for model_name, dup_level, dist_level in excluded_configs:
    # Mask is False exactly on the rows of the configuration being removed.
    condition = ~(
        (data_final['model'] == model_name)
        & (data_final['num_iter'] == 10000)
        & (data_final['duplicates_level'] == dup_level)
        & (data_final['distortion_level'] == dist_level)
    )
    data_final = data_final[condition]

# Render the filtered table as the cell output.
data_final


Unnamed: 0,model,cluster_size,num_iter,duplicates_level,distortion_level,precision,recall,f1score,date
0,Both,2,10000,High,Low,1.000000,0.336262,0.503287,Wed_Mar__6_12_25_12_2024
3,Both,2,10000,High,Low,1.000000,0.338658,0.505967,Wed_Mar__6_11_22_00_2024
5,Both,2,10000,High,Low,1.000000,0.337859,0.505075,Wed_Mar__6_12_56_55_2024
7,Both,2,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_14_05_23_2024
20,Both,2,10000,High,Low,1.000000,0.337061,0.504182,Wed_Mar__6_13_31_31_2024
...,...,...,...,...,...,...,...,...,...
3822,No Empirical,10,50000,Medium,High,0.989610,0.782341,0.873853,Tue_Mar__5_16_44_22_2024
3823,No Empirical,10,50000,Medium,High,0.987047,0.782341,0.872852,Tue_Mar__5_19_14_25_2024
3824,No Empirical,10,50000,Medium,High,0.989218,0.753593,0.855478,Tue_Mar__5_17_33_24_2024
3825,No Empirical,10,50000,Medium,High,0.984252,0.770020,0.864055,Tue_Mar__5_20_03_56_2024


In [265]:
# Bind a second name to the cleaned table; this is an alias, not a copy.
# NOTE(review): this cell is numbered In [265] but sits above In [263], so the
# cells were last executed out of order - re-run top to bottom to confirm.
data_BNBD = data_final

In [263]:
# Write the cleaned table to disk. With to_csv's defaults the row index is
# written as the first, unnamed column.
data_BNBD.to_csv("clean_data_BNBD.csv")