In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the dataset path used in the
# main cell lives under /content/drive.
drive.mount("/content/drive/")

# **Functions to run before Main**

## Get All Brands

In [None]:
def get_brand_values(data_id):
    """Return the distinct 'Brand' feature values found across all products.

    Args:
        data_id: Iterable of product dicts. A product may carry a
            'featuresMap' dict, which in turn may contain a 'Brand' key.

    Returns:
        list: The unique brand values, in arbitrary (set) order. Products
        without a 'featuresMap' or without a 'Brand' entry are ignored.
    """
    # Single pass: a separate counting pass would exhaust an iterator/generator
    # input before the brands are actually collected.
    unique_brands = set()
    for product in data_id:
        features = product.get('featuresMap', {})
        if 'Brand' in features:
            unique_brands.add(features['Brand'])

    return list(unique_brands)

# Clean the data and normalize the units

# (Original)

In [None]:
import re
def clean_and_normalize_original(text):
    """Lower-case a product title and normalise inch / hertz notations.

    Every spelling of the inch unit (", ”, inch, inches, -inch, ...) becomes
    'inch' and every spelling of hertz (hz, hertz, -hz, ...) becomes 'hz',
    regardless of capitalisation. Any run of non-alphanumeric characters
    directly in front of a unit is removed so the unit attaches to its number
    (e.g. '240 [Hz' -> '240hz').

    Args:
        text: Raw title string.

    Returns:
        str: The normalised, stripped, lower-case title.
    """
    # The straight double quote is the most common inch marker in titles.
    # Lower-casing up front makes all later replacements case-insensitive
    # ('Hertz', 'INCHES', ... are handled by the same rules).
    text = text.replace('"', 'inch').lower()

    # Longer spellings are listed before the ones they contain.
    inch_variants = ('inches', '”', '-inch', ' inch')
    hz_variants = ('hertz', '-hz', ' hz')

    for variant in inch_variants:
        text = text.replace(variant, 'inch')

    for variant in hz_variants:
        text = text.replace(variant, 'hz')

    # Remove any non-alphanumeric token in front of units inch and hz
    text = re.sub(r'[^\w]+(?=\s*(inch|hz))', '', text)

    return text.strip()


# Quick manual check of the original normalisation on a sample title.
example_text = 'Newegg.com - Refurbished: Samsung 46" Class (45.9" Diag.) 1080p 240 [Hz LED HDTV UN46ES6580'

cleaned_text = clean_and_normalize_original(example_text)
print(cleaned_text)


# ("Novel")

In [None]:
import re
def clean_and_normalize_novel(text):
    """Lower-case a title and normalise inch, hertz, pound, watt and volt units.

    Steps:
      1. Map every spelling of a unit onto its canonical token
         ('inch', 'hz', 'lbs', 'w', 'v').
      2. Replace punctuation directly in front of a unit by a single space.
      3. Glue a number to the unit that follows it ('28 hz' -> '28hz').
      4. Drop a stray special character directly in front of a number+unit
         token and remove all parentheses / square brackets.

    Args:
        text: Raw title string.

    Returns:
        str: The normalised, stripped, lower-case title.
    """
    # The straight double quote is the most common inch marker in titles.
    text = text.replace('"', 'inch').lower()

    # Text is already lower-case, so only lower-case spellings are needed.
    for variant in ('-inch', 'inches', '”', ' inch'):
        text = text.replace(variant, 'inch')

    for variant in ('-hz', 'hertz'):
        text = text.replace(variant, 'hz')

    # Whole-word matching (no letter on either side) with the plural tried
    # first: plain substring replacement turned 'lbs' into 'lbss', 'watts'
    # into 'ws', 'volts' into 'vs' and corrupted words such as 'bulb'.
    text = re.sub(r'(?<![a-z])(?:pounds|lbs|lb)(?![a-z])', 'lbs', text)
    text = re.sub(r'(?<![a-z])watts?(?![a-z])', 'w', text)
    text = re.sub(r'(?<![a-z])volts?(?![a-z])', 'v', text)

    # The single-letter units only count as units when they stand alone;
    # otherwise every word starting with 'w' or 'v' would be treated as one.
    unit = r'(?:hz|inch|lbs|[wv]\b)'

    # Replace any non-alphanumeric run in front of a unit by one space
    text = re.sub(r'[^\w]+(?=\s*' + unit + r')', ' ', text)

    # Attach numbers to their adjacent units (e.g., "28 hz" -> "28hz")
    text = re.sub(r'(\d+(?:\.\d+)?)\s?(' + unit + r')', r'\1\2', text)

    # Remove one special character directly in front of a number+unit token
    # (decimal or integer), e.g. "(45.9inch" -> "45.9inch"
    text = re.sub(r'[^a-zA-Z0-9\s.](\d+(?:\.\d+)?' + unit + r')', r'\1', text)

    # Remove parentheses and square brackets
    text = re.sub(r'[\(\)\[\]]', '', text)

    return text.strip()


# Quick manual check of the novel normalisation on two concatenated sample titles.
example_text = 'Newegg.com - Refurbished: Samsung 46" Class (45.9" Diag.) 1080p 240 [Hz LED HDTV UN46ES6580 SunBriteTV Signature 46\" Class 46\" Diag. LCD TV 1080p HDTV 1080p White 4660HD - Best Buy'

cleaned_text = clean_and_normalize_novel(example_text)
print(cleaned_text)


# Extract model words from both Title and Features

# (Original)

In [None]:
import re

def extract_model_words_from_title_original(title):
    """Collect the model words of a title after the original normalisation.

    A model word is a token that mixes digits with non-digit characters
    (e.g. '46inch', 'un46es6580').

    Args:
        title: Raw product title.

    Returns:
        set[str]: The distinct model words found in the normalised title.
    """
    normalized_title = clean_and_normalize_original(title)

    # Group 1 wraps the whole token; the inner groups only express the
    # "digits then non-digits" / "non-digits then digits" alternatives.
    model_word_pattern = r'([a-zA-Z0-9]*(([0-9]+[^0-9, ]+)|([^0-9, ]+[0-9]+))[a-zA-Z0-9]*)'

    return {hit.group(1) for hit in re.finditer(model_word_pattern, normalized_title)}


def extract_model_words_from_kvp_original(features_map):
    """Collect the decimal numbers that occur in a product's feature values.

    All values of the feature map are joined with spaces; every decimal
    number (digits '.' digits), optionally followed by letters, contributes
    its numeric part only (e.g. '45.9inch' -> '45.9').

    Args:
        features_map: Mapping of feature name to feature value (strings).

    Returns:
        set[str]: The distinct decimal numbers, as strings.
    """
    joined_values = ' '.join(features_map.values())
    decimal_hits = re.finditer(r'(\d+\.\d+)[a-zA-Z]*\b', joined_values)
    return {hit.group(1) for hit in decimal_hits}


def extract_model_words_original_BOTH(data):
    """Build, per product, the union of title and key-value model words.

    Args:
        data: List of product dicts; 'title' and 'featuresMap' are read with
            empty defaults when missing.

    Returns:
        list[set[str]]: One set of model words per product, in input order.
    """
    def _words_for(product):
        # Title words first, then the decimal numbers from the feature values.
        from_title = extract_model_words_from_title_original(product.get('title', ''))
        from_features = extract_model_words_from_kvp_original(product.get('featuresMap', {}))
        return from_title.union(from_features)

    return [_words_for(product) for product in data]

# (Novel)

In [None]:
import re
from collections import Counter

def extract_model_words_from_title_novel(title):
    """Collect the model words of a title after the novel normalisation.

    A model word is a token that mixes digits with non-digit characters
    (e.g. '46inch', 'un46es6580').

    Args:
        title: Raw product title.

    Returns:
        set[str]: The distinct model words found in the normalised title.
    """
    normalized_title = clean_and_normalize_novel(title)

    # Group 1 wraps the whole token; the inner groups only express the
    # "digits then non-digits" / "non-digits then digits" alternatives.
    model_word_pattern = r'([a-zA-Z0-9]*(([0-9]+[^0-9, ]+)|([^0-9, ]+[0-9]+))[a-zA-Z0-9]*)'

    return {hit.group(1) for hit in re.finditer(model_word_pattern, normalized_title)}


def extract_model_words_from_kvp_novel(features_map):
    """Collect the decimal numbers that occur in a product's feature values.

    All values of the feature map are joined with spaces; every decimal
    number (digits '.' digits), optionally followed by letters, contributes
    its numeric part only (e.g. '45.9inch' -> '45.9').

    Args:
        features_map: Mapping of feature name to feature value (strings).

    Returns:
        set[str]: The distinct decimal numbers, as strings.
    """
    joined_values = ' '.join(features_map.values())
    decimal_hits = re.finditer(r'(\d+\.\d+)[a-zA-Z]*\b', joined_values)
    return {hit.group(1) for hit in decimal_hits}


################################################### Novelty:
def extract_model_words_from_brand(features_map):
    """Return the product's brand in lower case, or '' when no brand is listed.

    Args:
        features_map: Mapping of feature name to feature value.

    Returns:
        str: Lower-cased value stored under the 'Brand' key, or an empty string.
    """
    brand = features_map.get('Brand', '')
    return brand.lower()
###########################################


def extract_model_words_novel_BOTH(data):
    """Build, per product, the set of title model words plus the brand.

    The key-value model words are still extracted for every product, but they
    are currently NOT part of the returned sets; only the title words and the
    lower-cased brand (when present) are combined.

    Args:
        data: List of product dicts; 'title' and 'featuresMap' are read with
            empty defaults when missing.

    Returns:
        list[set[str]]: One set of model words per product, in input order.
    """
    words_per_product = []

    for product in data:
        from_title = extract_model_words_from_title_novel(product.get('title', ''))

        features = product.get('featuresMap', {})
        # Evaluated as before, although its result is not used in the union below.
        extract_model_words_from_kvp_novel(features)

        # The brand counts as one extra model word, but only when it is non-empty.
        brand = extract_model_words_from_brand(features)
        from_brand = {brand} if brand else set()

        words_per_product.append(from_title | from_brand)

    return words_per_product


# Binary Vectors (Matrix)

In [None]:
def get_binary_matrix(data, mw_per_product):
    """Build the product x model-word incidence matrix.

    Args:
        data: The products; only its length is used (number of matrix rows).
        mw_per_product: One set of model words per product, aligned with `data`.

    Returns:
        tuple: (binary_matrix, model_words_list) where `binary_matrix[i, j]`
        is 1 when product i contains `model_words_list[j]`, and
        `model_words_list` is the sorted list of all distinct model words.
    """
    # Step 1: Collect all unique model words across all products
    all_model_words = set()
    for mw_set in mw_per_product:
        all_model_words.update(mw_set)

    # Sorted so the column order is reproducible between runs
    model_words_list = sorted(all_model_words)

    # Word -> column lookup; avoids a linear list.index() scan per matrix entry
    column_of = {mw: column for column, mw in enumerate(model_words_list)}

    # Step 2: Initialize binary matrix
    num_products = len(data)
    num_model_words = len(model_words_list)
    print(f"Number of products: {num_products}")
    print(f"Number of model words: {num_model_words}")

    binary_matrix = np.zeros((num_products, num_model_words), dtype=int)

    # Step 3: Populate the binary matrix
    for i, mw_set in enumerate(mw_per_product):
        for mw in mw_set:
            binary_matrix[i, column_of[mw]] = 1

    return binary_matrix, model_words_list

# Binary matrix with extra brand influence

In [None]:
def get_binary_matrix_with_brand_influence(data, mw_per_product, brand_factor, data_id):
    """Build the incidence matrix with brand columns repeated `brand_factor` times.

    Every model word gets one column. A model word that equals a (lower-cased)
    brand value additionally gets `brand_factor - 1` duplicate columns named
    '<brand>_dup1', '<brand>_dup2', ...; a product containing the brand has a
    1 in the original column and in every duplicate.

    Args:
        data: The products; only its length is used (number of matrix rows).
        mw_per_product: One set of model words per product, aligned with `data`.
        brand_factor: Total number of columns per brand word (1 = no duplicates).
        data_id: Products passed to `get_brand_values` to obtain the brand list.

    Returns:
        tuple: (binary_matrix, extended_model_words_list).
    """
    # Step 1: Collect all unique model words across all products
    all_model_words = set()
    for mw_set in mw_per_product:
        all_model_words.update(mw_set)

    # Sorted so the column order is reproducible between runs
    model_words_list = sorted(all_model_words)

    # Step 2: Lower-cased brand values, as a set for constant-time membership tests
    brand_values = {brand.lower() for brand in get_brand_values(data_id)}

    # Step 3: Duplicate brand columns by the specified factor
    extended_model_words_list = []
    for mw in model_words_list:
        extended_model_words_list.append(mw)
        if mw in brand_values:
            extended_model_words_list.extend(f"{mw}_dup{i}" for i in range(1, brand_factor))

    # Name -> first column holding that name (same result list.index() gives),
    # built once instead of scanning the list for every matrix entry.
    column_of = {}
    for column, name in enumerate(extended_model_words_list):
        column_of.setdefault(name, column)

    # Step 4: Initialize binary matrix
    num_products = len(data)
    num_model_words = len(extended_model_words_list)
    print(f"Number of products: {num_products}")
    print(f"Number of model words (with brand influence): {num_model_words}")

    binary_matrix = np.zeros((num_products, num_model_words), dtype=int)

    # Step 5: Populate the binary matrix
    for i, mw_set in enumerate(mw_per_product):
        for mw in mw_set:
            binary_matrix[i, column_of[mw]] = 1

            # Populate duplicated brand columns
            if mw in brand_values:
                for j in range(1, brand_factor):
                    binary_matrix[i, column_of[f"{mw}_dup{j}"]] = 1

    return binary_matrix, extended_model_words_list

### Extra: Cut the columns (model words) which occur only once (Not used)

In [None]:
def filter_binary_matrix(binary_matrix):
    """Drop every column (model word) that occurs in at most one product.

    Args:
        binary_matrix: 2-D product x model-word incidence array.

    Returns:
        The matrix restricted to the columns whose sum is greater than 1.
        The shapes before and after filtering are printed.
    """
    # A model word seen in a single product can never link two products.
    occurrences = np.sum(binary_matrix, axis=0)
    keep_mask = occurrences > 1
    reduced = binary_matrix[:, keep_mask]

    print("Original binary matix size: ", binary_matrix.shape)
    print("Filtered binayr matrix size: ", reduced.shape)

    return reduced

# Signature Vectors (Matrix)

In [None]:
import numpy as np
import random

def generate_unique_hash_params(k, prime):
    """Draw k distinct (a, d) parameter pairs for the linear hash functions.

    The global NumPy RNG is re-seeded with the module-level `seed` on every
    call, so repeated calls with the same arguments give the same pairs.

    Args:
        k: Number of distinct pairs to produce.
        prime: Modulus of the hash functions; `a` is drawn from
            [0, prime - 2] and `d` from [1, prime - 2].

    Returns:
        list[tuple[int, int]]: The k unique (a, d) pairs (set iteration order).
    """
    np.random.seed(seed)  # For reproducibility

    exclusive_upper = prime - 1
    unique_pairs = set()

    # Keep drawing until k different pairs exist; repeated draws are absorbed by the set.
    while len(unique_pairs) < k:
        offset = np.random.randint(0, exclusive_upper)
        slope = np.random.randint(1, exclusive_upper)
        unique_pairs.add((offset, slope))

    return [*unique_pairs]


def hash_function(row, a, d, prime):
    """Evaluate the linear hash h(row) = (a + d * row) mod prime.

    Args:
        row: Row index being hashed.
        a: Additive parameter.
        d: Multiplicative parameter.
        prime: Modulus.

    Returns:
        The hash value in [0, prime - 1] for non-negative integer inputs.
    """
    linear_term = d * row + a
    return linear_term % prime

def minhash_signature_matrix(used_binary_matrix, k, prime):
    """Compute the k x num_products MinHash signature matrix.

    For every hash function (a, d) the signature of a product is the minimum
    of (a + d * row) % prime over all model-word indices `row` for which the
    product has a 1 in `used_binary_matrix`.

    Args:
        used_binary_matrix: 2-D array of shape (num_products, num_model_words)
            holding 0/1 entries.
        k: Number of hash functions (signature length).
        prime: Modulus of the hash functions.

    Returns:
        numpy.ndarray: Integer array of shape (k, num_products). A product
        without any model word keeps the sentinel value `prime` in every
        position (all real hash values are strictly smaller than `prime`).
    """
    num_products, num_model_words = used_binary_matrix.shape
    print(used_binary_matrix.shape)

    # Step 1: Generate k random hash functions
    permutations = generate_unique_hash_params(k, prime)

    # Step 2: Initialise with an integer sentinel larger than every hash value.
    # (Filling with np.inf and casting to int afterwards is undefined for
    # products that never receive a hash value.)
    signature_matrix = np.full((k, num_products), int(prime), dtype=int)

    # Step 3: Compute signature matrix
    for row in range(num_model_words):
        # Products that contain this model word; only they can be updated
        members = np.flatnonzero(used_binary_matrix[:, row] == 1)
        if members.size == 0:
            continue

        for perm_index, (a, d) in enumerate(permutations):
            hash_value = int(hash_function(row, a, d, prime))
            # Vectorised running minimum over all products holding this word
            signature_matrix[perm_index, members] = np.minimum(
                signature_matrix[perm_index, members], hash_value
            )

    return signature_matrix

# Check for hash collisions

In [None]:
import numpy as np
from collections import defaultdict

def count_collisions(signature_matrix):
    """Count repeated signature values within each hash function's row.

    For every row of the signature matrix, each occurrence of a value beyond
    its first one counts as a collision; the counts are summed over all rows.

    Args:
        signature_matrix: 2-D array, one row per hash function and one column
            per product.

    Returns:
        int: Total number of collisions.
    """
    collisions = 0

    for perm_row in signature_matrix:
        # Tally how often each hash value occurs among the products
        tally = defaultdict(int)
        for value in perm_row:
            tally[value] += 1

        # The first occurrence of a value is not a collision
        collisions += sum(times - 1 for times in tally.values() if times > 1)

    return collisions

# Count the collisions in the signature matrix
# num_collisions = count_collisions(signature_matrix)
# print(f"Total hash collisions: {num_collisions}")


# Another check for hash collisions

In [None]:
def analyze_collisions(signature_matrix):
    """Count, per product, how many signature entries repeat an earlier value.

    For each column the number of rows minus the number of distinct values is
    added up; the total is printed and returned.

    Args:
        signature_matrix: 2-D array, one row per hash function and one column
            per product.

    Returns:
        int: Total number of repeated values over all product columns.
    """
    num_hashes = signature_matrix.shape[0]
    num_collisions = 0
    for product_column in signature_matrix.T:
        distinct_values = np.unique(product_column)
        num_collisions += num_hashes - len(distinct_values)
    print(f"Total hash collisions: {num_collisions}")
    return num_collisions
# analyze_collisions(signature_matrix)

# **LSH**

In [None]:
import numpy as np
from collections import defaultdict

def jaccard_similarity(vec1, vec2):
    """Jaccard similarity of two binary vectors.

    Args:
        vec1: First 0/1 vector.
        vec2: Second 0/1 vector of the same length.

    Returns:
        |positions set in both| / |positions set in either|, or 0 when neither
        vector has any position set.
    """
    in_either = np.sum(np.logical_or(vec1, vec2))
    if in_either == 0:
        # Two empty vectors: defined as no similarity
        return 0

    in_both = np.sum(np.logical_and(vec1, vec2))
    return in_both / in_either


def lsh(b, r, given_signature_matrix):
    """Locality-sensitive hashing: find candidate duplicate pairs by banding.

    The signature matrix is cut into `b` bands of `r` rows each. Two products
    become a candidate pair when their signatures are identical in at least
    one band.

    Args:
        b: Number of bands.
        r: Number of rows per band.
        given_signature_matrix: 2-D array of shape (b * r, num_products).

    Returns:
        set[tuple[int, int]]: Candidate pairs as (smaller_index, larger_index).

    Raises:
        AssertionError: If the number of signature rows is not b * r.
    """
    num_hashes, num_products = given_signature_matrix.shape
    assert num_hashes == b * r, "Number of hash functions (k) should be equal to b * r"

    candidate_pairs = set()
    for band_index in range(b):
        # Rows of the signature matrix that make up this band
        band = given_signature_matrix[band_index * r:(band_index + 1) * r]
        hash_buckets = defaultdict(list)

        for product_index in range(num_products):
            # Key the bucket by the band values themselves, not by hash(...)
            # of them: distinct tuples can share a hash (e.g. hash(-1) ==
            # hash(-2)), which would create false candidate pairs.
            bucket_key = tuple(band[:, product_index].tolist())
            bucket = hash_buckets[bucket_key]

            # Products already in the bucket have smaller indices, so the
            # pair is emitted in sorted order.
            candidate_pairs.update((earlier, product_index) for earlier in bucket)

            bucket.append(product_index)
    return candidate_pairs

# True pairs (duplicates)

In [None]:
from itertools import combinations
from collections import defaultdict

def get_true_pairs_from_list(data_id):
    """List all true duplicate pairs, i.e. pairs of products sharing a modelID.

    Args:
        data_id: Sequence of product dicts, each with a 'modelID' key.

    Returns:
        list[tuple[int, int]]: Index pairs (i, j) with i < j, grouped by
        modelID in order of first appearance.
    """
    # modelID -> positions of the products carrying it
    positions_by_model = defaultdict(list)
    for position, product in enumerate(data_id):
        positions_by_model[product['modelID']].append(position)

    # Every 2-combination inside a group is one true duplicate pair
    return [
        pair
        for positions in positions_by_model.values()
        for pair in combinations(positions, 2)
    ]

# From Pairs to Unique Duplicates found (Not used)

In [None]:
def get_duplicates_from_pairs(pairs_of_duplicates):
    """Collect the distinct product indices that take part in any pair.

    Args:
        pairs_of_duplicates: Iterable of index pairs.

    Returns:
        tuple: (count, indices) where `indices` is the set of all product
        indices occurring in at least one pair and `count` is its size.
    """
    involved = {index for pair in pairs_of_duplicates for index in pair}
    return (len(involved), involved)

# number_true_unique_duplicates, true_unique_duplicates = get_duplicates_from_pairs(true_pairs)

# print(number_true_unique_duplicates)
# print(true_unique_duplicates)


# Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import numpy as np

def hierarchical_clustering(data, used_binary_matrix, b, r, given_signature_matrix, used_method):
    """Cluster LSH candidate products into groups of presumed duplicates.

    Pipeline: LSH candidate pairs -> keep pairs whose brands agree (or where a
    brand is missing) -> keep pairs coming from different shops -> Jaccard
    similarity between all surviving products -> agglomerative clustering cut
    at distance 1 - (1 / b) ** (1 / r).

    Args:
        data: List of product dicts; 'featuresMap' (with optional 'Brand') and
            optional 'shop' are read.
        used_binary_matrix: Product x model-word 0/1 array, row-aligned with `data`.
        b: Number of LSH bands.
        r: Number of rows per LSH band.
        given_signature_matrix: MinHash signature matrix with b * r rows.
        used_method: Linkage method passed to scipy's `linkage`
            (e.g. 'average', 'complete', 'single').

    Returns:
        list[list[int]]: Clusters (lists of product indices) containing more
        than one product; an empty list when no product survives the filters.

    Side effects:
        Prints the candidate pairs, the pair counts after each filter and the
        shape of the similarity matrix.
    """

    # Similarity threshold implied by the banding scheme
    threshold = (1 / b) ** (1 / r)

    # Step 1: Use LSH to get candidate pairs
    candidate_pairs = lsh(b, r, given_signature_matrix)
    print(candidate_pairs)
    print(len(candidate_pairs))

    # Step 2: Filter based on brand
    # A pair survives when both brands are equal or at least one brand is missing.
    filtered_pairs = []
    for p1, p2 in candidate_pairs:
        brand1 = data[p1]['featuresMap'].get('Brand', '').lower()
        brand2 = data[p2]['featuresMap'].get('Brand', '').lower()
        if brand1 == brand2 or brand1 == '' or brand2 == '':
            filtered_pairs.append((p1, p2))
    print(len(filtered_pairs))

    # Step 3: Filter based on shop
    # Only pairs from different shops are kept; two products without a 'shop'
    # both default to '' and are therefore dropped as "same shop".
    filtered_pairs_2 = []
    for prod1, prod2 in filtered_pairs:
        shop1 = data[prod1].get('shop', '').lower()
        shop2 = data[prod2].get('shop', '').lower()
        if shop1 != shop2:
            filtered_pairs_2.append((prod1, prod2))
    print(len(filtered_pairs_2))
    #number_comparisons_clusters = len(filtered_pairs_2)

    # Step 4: Compute similarity matrix for candidate products
    # Map between global product indices and compact 0..n-1 matrix positions.
    product_indices = set(prod for pair in filtered_pairs_2 for prod in pair)
    product_indices = sorted(product_indices)  # Ensure consistent ordering
    index_to_product = {i: product for i, product in enumerate(product_indices)}
    product_to_index = {product: i for i, product in enumerate(product_indices)}

    num_products = len(product_indices)
    similarity_matrix = np.zeros((num_products, num_products))

    # NOTE(review): similarity is computed for EVERY pair of surviving
    # products, not only for the pairs that passed the filters above, so two
    # products that were never a candidate pair can still end up in the same
    # cluster - confirm this is intended.
    for i in range(num_products):
        for j in range(i + 1, num_products):
            prod1 = index_to_product[i]
            prod2 = index_to_product[j]
            similarity = jaccard_similarity(used_binary_matrix[prod1, :], used_binary_matrix[prod2, :])
            #print(prod1, prod2)
            #print(similarity)
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity  # Symmetric matrix

    # Check if the similarity matrix is empty
    if similarity_matrix.size == 0:
        return []  # Return an empty result without any message

    # Step 5: Perform hierarchical clustering
    # Convert similarity matrix to a distance matrix
    distance_matrix = 1 - similarity_matrix
    #print(similarity_matrix)
    print(similarity_matrix.shape)
    #print(distance_matrix)
    # Ensure the diagonal of the distance matrix is zero
    np.fill_diagonal(distance_matrix, 0)
    condensed_distance_matrix = squareform(distance_matrix)  # Convert to condensed form
    linkage_matrix = linkage(condensed_distance_matrix, method=used_method) # 'average' 'complete' 'single'

    # Step 6: Form clusters based on threshold
    # The threshold is on the distance, so we use (1 - threshold) as the cutoff
    clusters = fcluster(linkage_matrix, t = 1 - threshold, criterion='distance')

    # Step 7: Group products into clusters
    # `clusters` holds one cluster id per entry of `product_indices`, in the same order.
    cluster_dict = {}
    for product_index, cluster_id in zip(product_indices, clusters):
        if cluster_id not in cluster_dict:
            cluster_dict[cluster_id] = []
        cluster_dict[cluster_id].append(product_index)

    # Only keep clusters with more than one product as valid duplicates
    valid_clusters = [cluster for cluster in cluster_dict.values() if len(cluster) > 1]

    return valid_clusters

# Get pairs from clusters

In [None]:
from itertools import combinations

def get_pairs_from_clusters(clusters):
    """Expand clusters into the list of all within-cluster pairs.

    Args:
        clusters: Iterable of clusters, each a sequence of product indices.

    Returns:
        list[tuple]: Every 2-combination of every cluster, cluster by cluster.
    """
    return [pair for members in clusters for pair in combinations(members, 2)]

# Post Cluster Filtering (Not used)

In [None]:
def extra_filter_cluster(b, r, given_pairs):
    """Post-filter predicted pairs on brand and shop.

    A pair is kept when (1) both products have the same lower-cased brand or
    at least one of them has no brand, and (2) the two products come from
    different shops. Products are looked up in the module-level `data` list.
    `b` and `r` are accepted but not used.

    Args:
        b: Unused.
        r: Unused.
        given_pairs: Iterable of (index, index) pairs into `data`.

    Returns:
        list[tuple]: The pairs that pass both filters. The number of pairs
        left after each filter is printed.
    """
    def _brand_of(product_index):
        # A missing 'Brand' key counts as the empty brand
        return data[product_index]['featuresMap'].get('Brand', '').lower()

    def _shop_of(product_index):
        # A missing 'shop' key counts as the empty shop
        return data[product_index].get('shop', '').lower()

    # Filter 1: different (known) brands can never be duplicates
    brand_compatible = []
    for left, right in given_pairs:
        left_brand, right_brand = _brand_of(left), _brand_of(right)
        if '' in (left_brand, right_brand) or left_brand == right_brand:
            brand_compatible.append((left, right))
    print(len(brand_compatible))

    # Filter 2: products from the same shop are not considered duplicates
    cross_shop = [
        (left, right)
        for left, right in brand_compatible
        if _shop_of(left) != _shop_of(right)
    ]
    print(len(cross_shop))

    return cross_shop

# Evaluate Model

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def bootstrap(data, data_id, n_bootstraps):
    """Create repeated random train/test splits of the product list.

    Each repetition draws 63% of the product indices without replacement
    (using the global NumPy RNG) as the training part; all remaining indices,
    in ascending order, form the test part. The same indices are applied to
    `data` and `data_id`, so the returned sets stay aligned.

    Args:
        data: Products without modelID.
        data_id: The same products, index-aligned, still carrying modelID.
        n_bootstraps: Number of splits to generate.

    Returns:
        tuple: (train_sets, test_sets, train_id_sets, test_id_sets,
        number_products_train, number_products_test), each a list with one
        entry per split.
    """
    train_sets = []
    test_sets = []
    train_id_sets = []
    test_id_sets = []
    number_products_test = []
    number_products_train = []
    n = len(data)

    # Number of unique samples per split (~63% of the data); identical for every split
    train_size = int(n * 0.63)

    for _ in range(n_bootstraps):
        # Sample 63% unique indices from the original data (no replacement)
        train_indices = np.random.choice(n, size=train_size, replace=False)

        # The remaining indices form the test set (the out-of-sample data).
        # A set gives constant-time membership tests; testing against the
        # array directly scans it for every index.
        train_index_set = set(train_indices.tolist())
        test_indices = [i for i in range(n) if i not in train_index_set]

        # Store the number of products in each part
        number_products_test.append(len(test_indices))
        number_products_train.append(len(train_indices))

        # Data sets without modelID
        train_sets.append([data[i] for i in train_indices])
        test_sets.append([data[i] for i in test_indices])

        # Matching data sets that still carry the modelID
        train_id_sets.append([data_id[i] for i in train_indices])
        test_id_sets.append([data_id[i] for i in test_indices])

    return train_sets, test_sets, train_id_sets, test_id_sets, number_products_train, number_products_test

def evaluate_performance(used_pred_pairs_lsh, used_pred_pairs_clusters, used_true_pairs, used_current_possible_pairs): # for test data
    """Score predicted duplicate pairs against the true duplicate pairs.

    Args:
        used_pred_pairs_lsh: Candidate pairs produced by LSH; only their number
            is used (as the number of comparisons).
        used_pred_pairs_clusters: Pairs predicted as duplicates by clustering.
        used_true_pairs: The real duplicate pairs, written in the same tuple
            form as the predicted pairs.
        used_current_possible_pairs: Total number of possible product pairs.

    Returns:
        tuple: (f1_measure, pair_quality, pair_completeness, f1_star) where
        f1_measure is the F1 of the cluster predictions, pair_quality is
        TP / number of LSH comparisons, pair_completeness is
        TP / number of true pairs and f1_star is their harmonic mean.
        Every ratio falls back to 0 when its denominator is 0.

    Side effects:
        Prints TP, FP, FN, TN and the number of LSH comparisons.
    """
    # Confusion counts for the cluster predictions
    TP = len(set(used_pred_pairs_clusters).intersection(used_true_pairs))
    print("true positives", TP)
    FP = len(used_pred_pairs_clusters) - TP # is non-duplicate, but predicted as duplicates
    print("false positives", FP)
    FN = len(used_true_pairs) - TP # it's duplicate, but is predicted as non-duplicate
    print("false negatives", FN,  "too many FN?")
    # Everything that is neither predicted nor true. Subtracting len(pred) and
    # len(true) from the total would remove the true positives twice.
    TN = used_current_possible_pairs - TP - FP - FN
    print("true negatives", TN)

    # Calculate F1-measure for Clusters
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    f1_measure = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    # Calculate Pair Quality and Pair Completeness, F1* for LSH
    number_comparisons = len(used_pred_pairs_lsh) # number of candidate pairs given by LSH
    print("number comparisons, should equal number test_candidate_pairs (lsh) ", number_comparisons)

    pair_quality = TP / number_comparisons if number_comparisons > 0 else 0
    total_duplicates = len(used_true_pairs) # number of real duplicates in the evaluated data
    pair_completeness = TP / total_duplicates if total_duplicates > 0 else 0
    f1_star = 2 * (pair_quality * pair_completeness) / (pair_quality + pair_completeness) if pair_quality + pair_completeness != 0 else 0

    return f1_measure, pair_quality, pair_completeness, f1_star

# Main run (Initialise)

In [None]:
import json
import pandas as pd
import numpy as np
import re
from collections import Counter
import random
from sympy import nextprime
from collections import defaultdict
from itertools import combinations
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import precision_recall_fscore_support

# Global seed: used here for the NumPy RNG and read again by
# generate_unique_hash_params, which re-seeds with it on every call.
seed = 987654
np.random.seed(seed)

#all_possible_pairs = (1624 * 1623) / 2

# Load the merged TV dataset from the mounted Google Drive.
data_path = '/content/drive/.../TVs-all-merged.json'
with open(data_path, 'r') as file:
    data_json = json.load(file)
#print(data_json)
# Number of top-level keys (one per model id, see the loop below)
print(len(data_json))

# To store the individual products
product_list = []

# The JSON maps each model id to the list of products sharing it;
# flatten that into a single list of products.
for model_id, products in data_json.items():
    for product in products:
        # Add each product to the product_list, preserving all the information
        product_list.append(product)

# Print the number of individual products
print("Total number of products:", len(product_list))
#for product in product_list:
 #   print(product)

data_id = product_list # Data still including modelID
#print(len(data_id))
#print(data_id)
#print(data_json)

# Select the first x products from your data
#data_id = data_id[:800] #!!!!!!!!!!

# Separate modelID from the rest: modelID is the ground truth and must only be
# used for evaluation, never as input to the duplicate detection.
training_data = []  # Products with modelID removed
evaluation_data = {}  # modelID -> list of full product entries

# Iterate through the dataset to process each product in the list
for entry in data_id:
    # Store the modelID for evaluation
    model_id = entry.get("modelID")
    if model_id not in evaluation_data:
        evaluation_data[model_id] = []
    evaluation_data[model_id].append(entry)  # Append entry for evaluation purposes

    # Prepare training data by excluding modelID.
    # The copy keeps the 'modelID' key in the original entry (and thus in data_id).
    training_entry = entry.copy()
    training_entry.pop("modelID", None)
    training_data.append(training_entry)

#print("Training Data:")
#print(training_data)
# Count the number of products in training_data
product_count = len(training_data)

print(f"Number of products in training_data: {product_count}")

data = training_data  # Data with modelID excluded

############################################################
# true_pairs = get_true_pairs_from_list(data_id)
# print("True Pairs:", true_pairs)
# print("Number True Pairs:", len(true_pairs))

n_bootstraps = 5

# Numbers of LSH bands (b) to sweep. Every train value divides 510 and every
# test value divides 900, the signature lengths used by the train and test
# runs, so r = k // b always splits the signature into b equal bands.
train_b_values = [1, 2, 3, 5, 6, 10, 15, 17, 30, 34, 51, 85, 102, 170, 255, 510]
number_b_train = len(train_b_values)

test_b_values = [1, 2, 3, 4, 5, 6, 9, 10, 12, 15, 18, 20, 25, 30, 36, 45, 50, 60, 75, 90, 100, 150, 180, 225, 300, 450, 900]
number_b_test = len(test_b_values)


def _empty_results_matrix(n_rows, n_cols):
    """Return an n_rows x n_cols nested list of fresh, independent metric dicts.

    Rows correspond to b values, columns to bootstraps. Each cell gets its own
    dict and its own lists, so filling one cell never affects another.
    """
    return [[{"f1_measures": [], "f1_stars": [], "pair_qualities": [], "pair_completenesses": [], "fraction_of_comparisons": []}
             for _ in range(n_cols)] for _ in range(n_rows)]


# One results matrix per linkage method for the train run, plus one for the test run.
single_train_results_matrix = _empty_results_matrix(number_b_train, n_bootstraps)
average_train_results_matrix = _empty_results_matrix(number_b_train, n_bootstraps)
complete_train_results_matrix = _empty_results_matrix(number_b_train, n_bootstraps)
test_results_matrix = _empty_results_matrix(number_b_test, n_bootstraps)

train_data_sets, test_data_sets, id_train_data_sets, id_test_data_sets, train_number_of_products, test_number_of_products = bootstrap(data, data_id, n_bootstraps)

# Report only the per-bootstrap product counts; printing the data sets themselves
# would dump every product record into the cell output.
print("train number of products per bootstrap:", train_number_of_products)
print("test number of products per bootstrap:", test_number_of_products)

# Main Run (Train Data)

In [None]:
############### DO FOR TRAIN DATA FIRST:   ################
###########################################################
# do model words, binary, signature, lsh: on train data, with train_k and train_b, train_r => train_threshold : SAME FOR ALL CLUSTERING OPTIONS - RUN ONLY ONCE FOR TRAIN
# do clustering 3 options: single (average, complete) => get average performance measures over bootstraps for option 1 - all for TRAIN (inc. fraction of comparisons)
# do for options 2, 3 - record all 3 results - don't overwrite
# select the 'method' based on the best performance: higher PQ, higher PC, higher F1*

# Linkage method -> results matrix that collects the metrics for that method.
# The insertion order (single, average, complete) fixes the order in which the
# clustering, pair-extraction and evaluation helpers are invoked below.
train_matrices_by_linkage = {
    'single': single_train_results_matrix,
    'average': average_train_results_matrix,
    'complete': complete_train_results_matrix,
}

for bootstrap_index, _ in enumerate(train_data_sets):
    # Everything that belongs to this bootstrap sample.
    current_train_data = train_data_sets[bootstrap_index]
    current_train_number_unique_products = train_number_of_products[bootstrap_index]
    current_id_train_data_sets = id_train_data_sets[bootstrap_index]  # records that still carry modelID

    train_true_pairs = get_true_pairs_from_list(current_id_train_data_sets)

    # n * (n - 1) / 2 unordered pairs among the products of this sample.
    current_possible_pairs = current_train_number_unique_products * (current_train_number_unique_products - 1) // 2

    # Model words -> binary matrix. Alternatives that were tried here:
    # extract_model_words_novel, extract_model_words_original_BOTH,
    # get_binary_matrix_with_brand_influence(..., 1, current_id_train_data_sets)
    # and filter_binary_matrix on the resulting matrix.
    train_model_words_data_novel_BOTH = extract_model_words_novel_BOTH(current_train_data)
    train_binary_matrix, train_model_words_list = get_binary_matrix(current_train_data, train_model_words_data_novel_BOTH)

    train_num_model_words = train_binary_matrix.shape[1]
    print("train num_model_words")
    print(train_num_model_words, len(train_model_words_list))

    # Signature length handed to minhash_signature_matrix; r = k_train // b below.
    k_train = 510

    prime_train = nextprime(3*train_num_model_words)
    print("prime train:")
    print(prime_train)

    # The signature matrix is shared by every b value and every linkage method.
    train_signature_matrix = minhash_signature_matrix(train_binary_matrix, k_train, prime_train)

    for b_index, b in enumerate(train_b_values):

        # Rows per band and the matching threshold (the threshold is only printed).
        r_train = k_train // b
        threshold = (1 / b) ** (1 / r_train)
        print(f"b: {b}, r: {r_train}, threshold: {threshold}")

        # LSH candidate pairs for this (b, r) setting.
        train_candidate_pairs = lsh(b, r_train, train_signature_matrix)
        print(f"Number train candidate pairs: {len(train_candidate_pairs)}")

        number_comparisons = len(train_candidate_pairs)
        fraction_of_comparisons = number_comparisons/current_possible_pairs
        print("Number of comparisons:", number_comparisons)

        # Step 1: cluster with each linkage method.
        clusters_by_linkage = {
            linkage_name: hierarchical_clustering(current_train_data, train_binary_matrix, b, r_train, train_signature_matrix, linkage_name)
            for linkage_name in train_matrices_by_linkage
        }

        # Step 2: turn each clustering into its set of predicted duplicate pairs.
        pairs_by_linkage = {
            linkage_name: get_pairs_from_clusters(found_clusters)
            for linkage_name, found_clusters in clusters_by_linkage.items()
        }

        # Step 3: score every linkage method before anything is stored.
        # evaluate_performance returns (f1, pair quality, pair completeness, f1*).
        scores_by_linkage = {
            linkage_name: evaluate_performance(train_candidate_pairs, predicted_pairs, train_true_pairs, current_possible_pairs)
            for linkage_name, predicted_pairs in pairs_by_linkage.items()
        }

        # Step 4: record the scores in the cell for this b value and bootstrap.
        for linkage_name, results_matrix in train_matrices_by_linkage.items():
            f1_measure, pair_quality, pair_completeness, f1_star = scores_by_linkage[linkage_name]
            cell = results_matrix[b_index][bootstrap_index]
            cell["f1_measures"].append(f1_measure)
            cell["f1_stars"].append(f1_star)
            cell["pair_qualities"].append(pair_quality)
            cell["pair_completenesses"].append(pair_completeness)
            # Scalar that replaces the placeholder list; identical for all linkages.
            cell["fraction_of_comparisons"] = fraction_of_comparisons

        #print("results matrix ", results_matrix)
        #print("RESULTS MATRIX SIZE, AFTER EACH B", len(results_matrix))

def _average_over_bootstraps(results_matrix, b_values, n_runs):
    """Average the stored metrics over all bootstraps, one summary dict per b value.

    results_matrix[b_index][run] is a dict holding metric lists plus a
    "fraction_of_comparisons" entry. For each metric, the per-bootstrap mean is
    taken first and those means are then averaged over the n_runs bootstraps.
    """
    summaries = []
    for b_index, b in enumerate(b_values):
        cells = [results_matrix[b_index][run] for run in range(n_runs)]
        summaries.append({
            "b": b,
            "avg_f1_measure": np.mean([np.mean(cell["f1_measures"]) for cell in cells]),
            "avg_f1_star": np.mean([np.mean(cell["f1_stars"]) for cell in cells]),
            "avg_pair_quality": np.mean([np.mean(cell["pair_qualities"]) for cell in cells]),
            "avg_pair_completeness": np.mean([np.mean(cell["pair_completenesses"]) for cell in cells]),
            "fraction_of_comparisons": np.mean([cell["fraction_of_comparisons"] for cell in cells]),
        })
    return summaries


# Per-b averages across bootstraps, one list per linkage method.
single_final_results = _average_over_bootstraps(single_train_results_matrix, train_b_values, n_bootstraps)
average_final_results = _average_over_bootstraps(average_train_results_matrix, train_b_values, n_bootstraps)
complete_final_results = _average_over_bootstraps(complete_train_results_matrix, train_b_values, n_bootstraps)

print("Single ", single_final_results)
print("Average ", average_final_results)
print("Complete ", complete_final_results)

# Up Main Main

# Comparison Train Graphs

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# One summary table per linkage method.
average_results_df = pd.DataFrame(average_final_results)
complete_results_df = pd.DataFrame(complete_final_results)
single_results_df = pd.DataFrame(single_final_results)

# (legend label, table) for each curve, in drawing order.
linkage_curves = (
    ('Average Linkage', average_results_df),
    ('Complete Linkage', complete_results_df),
    ('Single Linkage', single_results_df),
)

# (subplot position in the 2x2 grid, metric column, y-axis label, title),
# listed in the order the panels are drawn.
panels = (
    (4, 'avg_f1_measure', 'Average F1 Measure', 'Fraction of Comparisons vs F1 Measure'),
    (3, 'avg_f1_star', 'Average F1* Measure', 'Fraction of Comparisons vs F1* Measure'),
    (1, 'avg_pair_quality', 'Average Pair Quality', 'Fraction of Comparisons vs Pair Quality'),
    (2, 'avg_pair_completeness', 'Average Pair Completeness', 'Fraction of Comparisons vs Pair Completeness'),
)

plt.figure(figsize=(15, 10))

# Every panel plots its metric against the fraction of comparisons, one curve
# per linkage method. (A combined F1 / F1* panel was sketched earlier but is
# not drawn.)
for position, metric, y_label, panel_title in panels:
    plt.subplot(2, 2, position)
    for curve_label, frame in linkage_curves:
        plt.plot(frame['fraction_of_comparisons'], frame[metric], marker='o', label=curve_label)
    plt.xlabel('Fraction of Comparisons')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()

# Adjust layout and show
plt.tight_layout()
plt.show()


# Main Run (Test Data)
## Run on the test data using the linkage method that performed best on the train data (single vs. average vs. complete): 'single'

In [None]:
# Evaluate the chosen linkage method ('single') on every bootstrap's test sample,
# sweeping all b values in test_b_values.
for bootstrap_index, test_data in enumerate(test_data_sets):
    # Everything that belongs to this bootstrap's test sample.
    current_test_data = test_data_sets[bootstrap_index]
    current_test_number_unique_products = test_number_of_products[bootstrap_index]
    current_id_test_data_sets = id_test_data_sets[bootstrap_index]  # records that still carry modelID

    test_true_pairs = get_true_pairs_from_list(current_id_test_data_sets)
    print("Number test True Pairs:", len(test_true_pairs))

    print("current_test_number_unique_products:", current_test_number_unique_products)

    # n * (n - 1) / 2 unordered pairs among the products of this sample.
    current_possible_pairs = current_test_number_unique_products * (current_test_number_unique_products - 1) // 2
    print("current_possible_pairs:", current_possible_pairs)

    # Model words -> binary matrix. Alternatives that were tried here:
    # extract_model_words_original_BOTH,
    # get_binary_matrix_with_brand_influence(..., 2, current_id_test_data_sets)
    # and filter_binary_matrix on the resulting matrix.
    test_model_words_data_novel_BOTH = extract_model_words_novel_BOTH(current_test_data)
    test_binary_matrix, test_model_words_list = get_binary_matrix(current_test_data, test_model_words_data_novel_BOTH)

    test_num_model_words = test_binary_matrix.shape[1]
    print("test num_model_words")
    print(test_num_model_words, len(test_model_words_list))

    # Signature length handed to minhash_signature_matrix; r = k_test // b below.
    k_test = 900

    prime_test = nextprime(3*test_num_model_words)
    print("prime test:")
    print(prime_test)

    # The signature matrix is shared by every b value of this bootstrap.
    test_signature_matrix = minhash_signature_matrix(test_binary_matrix, k_test, prime_test)
    print("test Sginature matrix sizes: ", test_signature_matrix.shape)

    for b_index, b in enumerate(test_b_values):

        # Rows per band and the matching threshold. The threshold is only printed.
        # It must use r_test: the original referenced `r`, which this cell never
        # defines, so it raised NameError on a fresh kernel or silently picked up
        # a stale value left behind by another cell.
        r_test = k_test // b
        threshold = (1 / b) ** (1 / r_test)
        print(f"b: {b}, r: {r_test}, threshold: {threshold}")

        # LSH candidate pairs for this (b, r) setting.
        test_candidate_pairs = lsh(b, r_test, test_signature_matrix)
        print(f"Number test candidate pairs: {len(test_candidate_pairs)}")

        # Candidate pairs as a share of all possible pairs.
        number_comparisons = len(test_candidate_pairs)
        fraction_of_comparisons = number_comparisons/current_possible_pairs
        print("Number of comparisons:", number_comparisons)

        # Cluster with single linkage and derive the predicted duplicate pairs.
        clusters_single = hierarchical_clustering(current_test_data, test_binary_matrix, b, r_test, test_signature_matrix, 'single')
        single_cluster_pairs = get_pairs_from_clusters(clusters_single)

        # evaluate_performance returns (f1, pair quality, pair completeness, f1*).
        single_f1_measure, single_pair_quality, single_pair_completeness, single_f1_star = evaluate_performance(test_candidate_pairs, single_cluster_pairs, test_true_pairs, current_possible_pairs)

        # Record the scores in the cell for this b value and bootstrap.
        result_cell = test_results_matrix[b_index][bootstrap_index]
        result_cell["f1_measures"].append(single_f1_measure)
        result_cell["f1_stars"].append(single_f1_star)
        result_cell["pair_qualities"].append(single_pair_quality)
        result_cell["pair_completenesses"].append(single_pair_completeness)
        # Scalar that replaces the placeholder list created with the matrix.
        result_cell["fraction_of_comparisons"] = fraction_of_comparisons

# Average every metric over the bootstraps, producing one summary dict per b value.
# For each metric the per-bootstrap mean is taken first, then the mean of those means.
test_final_results = []
for b_index, b in enumerate(test_b_values):
    bootstrap_cells = [test_results_matrix[b_index][run] for run in range(n_bootstraps)]
    test_final_results.append({
        "b": b,
        "avg_f1_measure": np.mean([np.mean(cell["f1_measures"]) for cell in bootstrap_cells]),
        "avg_f1_star": np.mean([np.mean(cell["f1_stars"]) for cell in bootstrap_cells]),
        "avg_pair_quality": np.mean([np.mean(cell["pair_qualities"]) for cell in bootstrap_cells]),
        "avg_pair_completeness": np.mean([np.mean(cell["pair_completenesses"]) for cell in bootstrap_cells]),
        "fraction_of_comparisons": np.mean([cell["fraction_of_comparisons"] for cell in bootstrap_cells]),
    })


#print("test ", test_final_results)

# Up Main Test

# Graphs (Latest Test Results)

In [None]:
import matplotlib.pyplot as plt
# Tabulate the averaged test results so each metric is a plottable column.
results_df = pd.DataFrame(test_final_results)

# (subplot position in the 2x2 grid, metric column, y-axis label, title).
panels = (
    (1, 'avg_f1_measure', 'Average F1 Measure', 'Fraction of Comparisons vs F1 Measure'),
    (2, 'avg_f1_star', 'Average F1* Measure', 'Fraction of Comparisons vs F1* Measure'),
    (3, 'avg_pair_quality', 'Average Pair Quality', 'Fraction of Comparisons vs Pair Quality'),
    (4, 'avg_pair_completeness', 'Average Pair Completeness', 'Fraction of Comparisons vs Pair Completeness'),
)

plt.figure(figsize=(15, 10))

# Every panel plots one metric against the fraction of comparisons.
# (A combined F1 / F1* panel was sketched earlier but is not drawn.)
for position, metric, y_label, panel_title in panels:
    plt.subplot(2, 2, position)
    plt.plot(results_df['fraction_of_comparisons'], results_df[metric], marker='o')
    plt.xlabel('Fraction of Comparisons')
    plt.ylabel(y_label)
    plt.title(panel_title)

# Adjust layout and show
plt.tight_layout()
plt.show()