# Import libraries 

In [2]:
import os 
import pickle
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.kernel_approximation import RBFSampler


warnings.filterwarnings("ignore")

# Set the paths to the data folders

In [3]:
# Root of the local project checkout; every other folder is derived from it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_to_working_dir = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/'

# Folders directly below the working directory.
path_to_data_folder = f'{path_to_working_dir}data/'
models_directory = f'{path_to_working_dir}models/'

# Folders below the data folder.
path_to_sim_vector_folder = f'{path_to_data_folder}sim_dataframes/'
path_to_results_folder = f'{path_to_data_folder}results/'
path_to_feature_vectors_folder = f'{path_to_data_folder}feature_vectors/'
active_learning_path = f'{path_to_data_folder}active_learning/'
path_to_models_folders = f'{path_to_data_folder}models/'

# Initial Config

In [4]:
# Switches that control the experiment run.
initial_config = dict(
    # True if data duplication is enabled, False otherwise.
    data_duplicated=True,
    # Key of meta_data_rep_mapping (1-4) naming the representation generator:
    # 1: generate_data_representation_initial_features
    # 2: generate_data_representation_better_features
    # 3: generate_data_representations_combined_features
    # 4: generate_data_representations_mean_embeddings
    meta_data_representation=1,
    # True for instance-based training, False otherwise.
    training_instance_based=True,
)

# Option number -> name of the generator function for that representation.
meta_data_rep_mapping = dict(enumerate(
    (
        'generate_data_representation_initial_features',
        'generate_data_representation_better_features',
        'generate_data_representations_combined_features',
        'generate_data_representations_mean_embeddings',
    ),
    start=1,
))


# Utils

In [5]:
def calculate_distances(row,df, columns_to_consider):
    """Return the Euclidean distance from ``row`` to every row of ``df``.

    Only the columns listed in ``columns_to_consider`` take part in the
    distance. The result is a flat array with one entry per row of ``df``.
    """
    # euclidean_distances expects 2-D input, so the single row is wrapped in a list
    query_point = [row[columns_to_consider]]
    candidates = df[columns_to_consider]
    pairwise = euclidean_distances(query_point, candidates)
    return pairwise.flatten()
   

def calcualte_ratio(row, df, columns_to_consider, n_neighbors=5):
    """Return the share of ``row``'s nearest neighbours that carry its 'is_match' label.

    The distance from ``row`` to every row of ``df`` is computed with
    ``calculate_distances``. The closest entry is skipped and the next
    ``n_neighbors`` rows are compared against ``row['is_match']``.

    Parameters:
    - row: Row (with an 'is_match' entry) whose neighbourhood is inspected.
    - df: DataFrame holding the candidate neighbours and an 'is_match' column.
    - columns_to_consider: Columns used for the distance computation.
    - n_neighbors: Number of neighbours to inspect. Defaults to 5, the count
      that used to be hard-coded (the old comment wrongly said 7).

    Returns:
    - Fraction of the inspected neighbours whose 'is_match' equals that of ``row``.
    """
    distances = calculate_distances(row, df, columns_to_consider)

    # Position 0 of the sorted order is skipped on the assumption that it is
    # `row` itself (distance 0) - this presumes `row` is one of the rows of `df`.
    closest_indices = np.argsort(distances)[1:n_neighbors + 1]
    closest_rows = df.iloc[closest_indices]

    # Count the neighbours that share the label of `row`
    match_count = (closest_rows['is_match'] == row['is_match']).sum()

    return match_count / len(closest_rows)


def del_file(file_path):
    """Delete the file at ``file_path`` and print the outcome.

    Failures (for example a missing file) are printed instead of raised, so
    the function can be used as a best-effort cleanup step.

    Parameters:
    - file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
    except Exception as e:
        print(f"Error deleting {file_path}: {e}")
    else:
        # Report the path that was removed. The message used to reference an
        # undefined name (`file`), so the resulting NameError was caught above
        # and every successful deletion was reported as an error.
        print(f"File {file_path} deleted successfully.")
    


def delete_files_from_folder(folder_path,files_to_del='ALL'):
    """Delete one entry, or every entry, of ``folder_path`` via ``del_file``.

    Parameters:
    - folder_path: Folder that contains the entries to delete.
    - files_to_del: Name of a single entry inside ``folder_path``. The default
      'ALL' deletes everything returned by ``os.listdir(folder_path)``.
    """
    if files_to_del != 'ALL':
        # A single, explicitly named entry
        del_file(os.path.join(folder_path, files_to_del))
        return

    # Every entry currently listed in the folder
    for entry_name in os.listdir(folder_path):
        del_file(os.path.join(folder_path, entry_name))
            
        


# The attributes of the feature vectors

feature vectors to extract:
    
1. Number of columns that have at least one value 

2. "Modell_no_list_truncatebegin20" (Prozent der non-Null Werte,Min,Mean,Max, Median)

3. "MPN_Liste_TruncateBegin20"  (Prozent der non-Null Werte,Min,Mean,Max,Median)

4. "EAN_Liste_TruncateBegin20" (Prozent der non-Null Werte,Min,Mean,Max,Median)

5. "Produktname_dic3" (Prozent der non-Null Werte,Min,Mean,Max,Median)

6. "Modell_Liste_3g" (Prozent der non-Null Werte,Min,Mean,Max,Median)

7. "Digital_zoom_NumMaxProz30" (Prozent der non-Null Werte,Min,Mean,Max,Median)

8. "optischer_zoom_NumMaxProz30" (Prozent der non-Null Werte,Min,Mean,Max,Median)

9. "Breite_NumMaxProz30" (Prozent der non-Null Werte,Min,Mean,Max,Median)

10. "Höhe_NumMaxProz30" (Prozent der non-Null Werte,Min,Mean,Max,Median)

11. "Gewicht_NumMaxProz30" (Prozent der non-Null Werte,Min,Mean,Max,Median)

12. "Sensortyp_Jaccard3" (Prozent der non-Null Werte,Min,Mean,Max,Median)


# Generate feature vectors

In [6]:
import math

def get_similairty_value(list1, list2):
    """
    Score how close two equally long numeric lists are, element by element.

    Each pair contributes the magnitude of its difference, rescaled by the
    largest magnitude found. The score is the mean of ``1 - rescaled value``.

    Parameters:
    - list1: First list of numbers
    - list2: Second list of numbers

    Returns:
    - 1.0 when every pair is equal (this includes two empty lists), otherwise
      the mean described above

    Raises:
    - ValueError: If the lists differ in length
    """
    if len(list1) != len(list2):
        raise ValueError("Lists must have the same length")

    # Square root of the squared difference, i.e. the size of each pairwise gap
    gaps = [math.sqrt((left - right) ** 2) for left, right in zip(list1, list2)]

    # No non-zero gap means the lists are identical
    if not any(gap != 0 for gap in gaps):
        return 1.0

    largest_gap = max(gaps)
    closeness = [1 - (gap / largest_gap) for gap in gaps]
    return sum(closeness) / len(closeness)



In [None]:
def compute_mean_similarity(list1, list2):
    """
    Return one minus the mean absolute difference of two equally long lists.

    Parameters:
    - list1: First list of numbers
    - list2: Second list of numbers

    Returns:
    - ``1.0 - mean(|a - b|)`` over the element pairs; 1.0 for two empty lists,
      which is also what get_similairty_value returns for that case

    Raises:
    - ValueError: If the lists differ in length
    """
    # Check if the lists have the same length
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length")

    # Two empty lists have nothing that differs: return the maximum score
    # instead of dividing by zero below.
    if len(list1) == 0:
        return 1.0

    # Calculate absolute differences
    abs_diff = [abs(a - b) for a, b in zip(list1, list2)]

    # The differences are averaged as-is (no rescaling), so the result only
    # stays within [0, 1] when every difference does.
    return 1.0 - (sum(abs_diff) / len(list1))


# Example call of compute_mean_similarity (the call used to reference
# `compute_similarity`, a name that is not defined in this notebook).
# NOTE(review): values1 and values2 are not defined anywhere in the visible part
# of the notebook - define them before running this cell on a fresh kernel.
similarity_score = compute_mean_similarity(values1, values2)
print(f"Similarity Score: {similarity_score}")


In [None]:
def generate_data_representation_very_new_test():
    """
    Build a mean/std feature vector for every similarity-vector CSV file.

    For each '.csv' file in path_to_sim_vector_folder the columns
    'Produktname_dic3' and 'Modell_Liste_3g' are read and coerced to numbers
    (non-numeric entries become NaN). Each row of the result holds the file
    name, the two column means and the two column standard deviations, all
    rounded to two decimals, with NaN statistics replaced by 2.

    Returns:
    - (DataFrame with columns 'feature_1' ... 'feature_N', its 'feature_1'
      column, which holds the file names)
    """
    selected_columns = ['Produktname_dic3','Modell_Liste_3g']
    # Value written in place of NaN statistics
    nan_fill = 2

    rows = []
    for file_name in os.listdir(path_to_sim_vector_folder):
        if not file_name.endswith('.csv'):
            continue

        sim_values = pd.read_csv(os.path.join(path_to_sim_vector_folder, file_name))[selected_columns]
        # Non-numeric entries such as "/" become NaN
        sim_values = sim_values.apply(pd.to_numeric, errors='coerce')

        parts = [
            pd.Series([file_name], index=['file_name']),
            sim_values.mean().round(2),
            sim_values.std().round(2),
        ]
        rows.append(pd.concat(parts, axis=0).fillna(nan_fill).tolist())

    features = pd.DataFrame(rows)
    # Generic, 1-based column labels: feature_1, feature_2, ...
    features.columns = [f"feature_{position}" for position in range(1, features.shape[1] + 1)]

    return features, features['feature_1']
   






In [None]:
# Build the mean/std feature table: `d` is the full table, `s` its 'feature_1' column.
d , s = generate_data_representation_very_new_test()
d.head()

In [None]:
# Sort the feature table in descending order by feature_4, feature_5, feature_2 and
# feature_3 (in that priority) and save the result to 'testnew.csv'.
df_sorted_desc = d.sort_values(by=['feature_4','feature_5','feature_2','feature_3'], ascending=False)
df_sorted_desc.to_csv('testnew.csv')

In [None]:
# Regenerate the feature table; the 'feature_1' column is bound to `d_1` this time.
d,d_1= generate_data_representation_very_new_test()
d.head()

In [None]:
# Regenerate the feature table and show every column except the first ('feature_1').
d,t = generate_data_representation_very_new_test()
d.iloc[:, 1:]

In [None]:
def generate_data_representation_very_new():
    """
    Build quartile-based feature vectors for the similarity-vector CSV files.

    For every '.csv' file in path_to_sim_vector_folder the columns
    'Produktname_dic3' and 'Modell_Liste_3g' are read, coerced to numbers
    (non-numeric entries become NaN) and summarised by min, mean, max and the
    25th/50th/75th percentiles, rounded to two decimals. NaN statistics are
    replaced by 2. The table is written to 'feature_vectors_init_features.csv'
    in path_to_feature_vectors_folder.

    Returns:
    - (feature DataFrame, its 'compared_resources' column with the file names)
    """
    # NOTE(review): generate_data_representation_initial_features writes to this
    # same file name - confirm that overwriting each other's output is intended.
    output_file_name = 'feature_vectors_init_features.csv'

    # Remove the file left behind by an earlier run
    delete_files_from_folder(path_to_feature_vectors_folder, output_file_name)

    # All CSV files in the similarity-vector folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    # Value written in place of NaN statistics
    nan_replacement_value = 2

    # Columns the statistics are computed on
    to_consider_columns = ['Produktname_dic3','Modell_Liste_3g']

    feature_vectors_list = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_file))
        df = df[to_consider_columns]
        # Non-numeric entries such as "/" become NaN
        df = df.apply(pd.to_numeric, errors='coerce')

        # Number of columns that hold at least one non-NaN value
        non_nan_columns_count = np.sum(df.notnull().any())

        # The per-column NaN percentage was computed here before but never
        # used in the feature vector, so it is no longer calculated.
        feature_vector = pd.concat(
            [
                pd.Series([csv_file], index=['file_name']),
                pd.Series([non_nan_columns_count], index=['non_nan_columns_count']),
                df.min().round(2),
                df.mean().round(2),
                df.max().round(2),
                df.quantile(0.25).round(2),
                df.quantile(0.5).round(2),
                df.quantile(0.75).round(2),
            ],
            axis=0,
        ).fillna(nan_replacement_value)

        feature_vectors_list.append(feature_vector.tolist())

    # Labels in the order the statistics were concatenated (grouped by statistic)
    df = pd.DataFrame(feature_vectors_list, columns = ['compared_resources', 'Numer_of_columns_with_at_least_one_values',
                                                       'Produktname_dic3_min', 'Modell_Liste_min',
                                                       'Produktname_dic3_mean', 'Modell_Liste_mean',
                                                       'Produktname_dic3_max', 'Modell_Liste_max',
                                                       'Produktname_dic3_25th_percentile', 'Modell_Liste_25th_percentile',
                                                       'Produktname_median', 'Modell_Liste_median',
                                                       'Produktname_75th_percentile', 'Modell_Liste_75th_percentile'])

    # Final layout: all statistics of one attribute next to each other
    columns_to_order = ['compared_resources', 'Numer_of_columns_with_at_least_one_values',
                        'Produktname_dic3_min', 'Produktname_dic3_mean', 'Produktname_dic3_max',
                        'Produktname_dic3_25th_percentile', 'Produktname_median', 'Produktname_75th_percentile',
                        'Modell_Liste_min', 'Modell_Liste_mean', 'Modell_Liste_max',
                        'Modell_Liste_25th_percentile', 'Modell_Liste_median', 'Modell_Liste_75th_percentile']

    df = df[columns_to_order]
    df.to_csv(os.path.join(path_to_feature_vectors_folder, output_file_name))

    return df,df['compared_resources']


In [None]:
# Run the quartile-based generator (this also writes its CSV file) and show the table's shape.
df, df_1 = generate_data_representation_very_new()
df.shape

In [None]:
def generate_data_representation_initial_features():
    """
    Build the "initial" feature vectors for the similarity-vector CSV files.

    For every '.csv' file in path_to_sim_vector_folder ten similarity columns
    are read and coerced to numbers (non-numeric entries become NaN). Each
    feature vector holds the file name, the number of columns with at least
    one non-NaN value and, per column, the NaN percentage, min, mean, max and
    median (rounded to two decimals). NaN statistics are replaced by 2. The
    table is written to 'feature_vectors_init_features.csv' in
    path_to_feature_vectors_folder.

    Returns:
    - (feature DataFrame, its 'compared_resources' column with the file names)
    """
    output_file_name = 'feature_vectors_init_features.csv'

    # Remove the file left behind by an earlier run
    delete_files_from_folder(path_to_feature_vectors_folder, output_file_name)

    # Columns read from each similarity-vector file
    source_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                      'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                      'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']
    # Short attribute names used in the output labels, in source_columns order
    attributes = ['MPN_Liste', 'EAN_Liste', 'Produktname_dic3', 'Modell_Liste', 'Digital_zoom',
                  'optischer_zoom', 'Breite', 'Höhe', 'Gewicht', 'Sensortyp']
    # Label suffixes, in the order the statistics are concatenated below
    statistics = ['perc_nan', 'min', 'mean', 'max', 'median']
    leading_labels = ['compared_resources', 'Numer_of_columns_with_at_least_one_values']
    # Value written in place of NaN statistics
    nan_fill = 2

    rows = []
    for file_name in os.listdir(path_to_sim_vector_folder):
        if not file_name.endswith('.csv'):
            continue

        sim_values = pd.read_csv(os.path.join(path_to_sim_vector_folder, file_name))[source_columns]
        # Non-numeric entries such as "/" become NaN
        sim_values = sim_values.apply(pd.to_numeric, errors='coerce')

        parts = [
            pd.Series([file_name], index=['file_name']),
            # Number of columns that hold at least one non-NaN value
            pd.Series([np.sum(sim_values.notnull().any())], index=['non_nan_columns_count']),
            # Percentage of NaN values per column
            ((sim_values.isnull().sum() / len(sim_values)) * 100).round(2),
            sim_values.min().round(2),
            sim_values.mean().round(2),
            sim_values.max().round(2),
            sim_values.median().round(2),
        ]
        rows.append(pd.concat(parts, axis=0).fillna(nan_fill).tolist())

    # The vectors arrive grouped by statistic (all perc_nan, all min, ...)
    produced_labels = leading_labels + [f'{attribute}_{statistic}'
                                        for statistic in statistics for attribute in attributes]
    # The final layout groups them by attribute instead
    final_labels = leading_labels + [f'{attribute}_{statistic}'
                                     for attribute in attributes for statistic in statistics]

    features = pd.DataFrame(rows, columns=produced_labels)[final_labels]
    features.to_csv(os.path.join(path_to_feature_vectors_folder, output_file_name))

    return features, features['compared_resources']





In [None]:
def generate_data_representation_initial_few_features():
    """
    Build the "initial" feature vectors from three similarity columns only.

    For every '.csv' file in path_to_sim_vector_folder the columns
    'MPN_Liste_TruncateBegin20', 'EAN_Liste_TruncateBegin20' and
    'Produktname_dic3' are read and coerced to numbers (non-numeric entries
    become NaN). Each feature vector holds the file name, the number of
    columns with at least one non-NaN value and, per column, the NaN
    percentage, min, mean, max and median (rounded to two decimals). NaN
    statistics are replaced by 2. The table is written to
    'feature_vectors_init_features.csv' in path_to_feature_vectors_folder;
    no delete call precedes the write in this function.

    Returns:
    - (feature DataFrame, its 'compared_resources' column with the file names)
    """
    # Columns read from each similarity-vector file
    source_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3']
    # Short attribute names used in the output labels, in source_columns order
    attributes = ['MPN_Liste', 'EAN_Liste', 'Produktname_dic3']
    # Label suffixes, in the order the statistics are concatenated below
    statistics = ['perc_nan', 'min', 'mean', 'max', 'median']
    leading_labels = ['compared_resources', 'Numer_of_columns_with_at_least_one_values']
    # Value written in place of NaN statistics
    nan_fill = 2

    rows = []
    for file_name in os.listdir(path_to_sim_vector_folder):
        if not file_name.endswith('.csv'):
            continue

        sim_values = pd.read_csv(os.path.join(path_to_sim_vector_folder, file_name))[source_columns]
        # Non-numeric entries such as "/" become NaN
        sim_values = sim_values.apply(pd.to_numeric, errors='coerce')

        parts = [
            pd.Series([file_name], index=['file_name']),
            # Number of columns that hold at least one non-NaN value
            pd.Series([np.sum(sim_values.notnull().any())], index=['non_nan_columns_count']),
            # Percentage of NaN values per column
            ((sim_values.isnull().sum() / len(sim_values)) * 100).round(2),
            sim_values.min().round(2),
            sim_values.mean().round(2),
            sim_values.max().round(2),
            sim_values.median().round(2),
        ]
        rows.append(pd.concat(parts, axis=0).fillna(nan_fill).tolist())

    # The vectors arrive grouped by statistic (all perc_nan, all min, ...)
    produced_labels = leading_labels + [f'{attribute}_{statistic}'
                                        for statistic in statistics for attribute in attributes]
    # The final layout groups them by attribute instead
    final_labels = leading_labels + [f'{attribute}_{statistic}'
                                     for attribute in attributes for statistic in statistics]

    features = pd.DataFrame(rows, columns=produced_labels)[final_labels]
    features.to_csv(os.path.join(path_to_feature_vectors_folder,'feature_vectors_init_features.csv'))

    return features, features['compared_resources']


In [None]:
def generate_data_representation_better_features():
    """
    Build the "better" feature vectors for the similarity-vector CSV files.

    For every '.csv' file in path_to_sim_vector_folder ten similarity columns
    are read and coerced to numbers (non-numeric entries become NaN). Each
    feature vector holds the file name and, per column, the NaN percentage,
    standard deviation, median, skewness and kurtosis (rounded to two
    decimals). NaN statistics are replaced by 2. The table is written to
    'feature_vectors_better_features.csv' in path_to_feature_vectors_folder.

    Returns:
    - (feature DataFrame, its 'compared_resources' column with the file names)
    """
    output_file_name = 'feature_vectors_better_features.csv'

    # Remove the file left behind by an earlier run
    delete_files_from_folder(path_to_feature_vectors_folder, output_file_name)

    # All CSV files in the similarity-vector folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    # Value written in place of NaN statistics
    nan_replacement_value = 2

    # Columns read from each similarity-vector file
    to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                           'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                           'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']
    # Short attribute names used in the output labels, in to_consider_columns order
    attributes = ['MPN_Liste', 'EAN_Liste', 'Produktname_dic3', 'Modell_Liste', 'Digital_zoom',
                  'optischer_zoom', 'Breite', 'Höhe', 'Gewicht', 'Sensortyp']
    # Label suffixes, in the order the statistics are concatenated below
    statistics = ['perc_nan', 'std_deviation', 'median', 'skewness', 'kurtosis']

    feature_vectors_list = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_file))
        df = df[to_consider_columns]
        # Non-numeric entries such as "/" become NaN
        df = df.apply(pd.to_numeric, errors='coerce')

        feature_vector = pd.concat(
            [
                pd.Series([csv_file], index=['file_name']),
                # Percentage of NaN values per column
                ((df.isnull().sum() / len(df)) * 100).round(2),
                df.std().round(2),
                df.median().round(2),
                df.skew().round(2),
                df.kurt().round(2),
            ],
            axis=0,
        ).fillna(nan_replacement_value)

        feature_vectors_list.append(feature_vector.tolist())

    # The vectors arrive grouped by statistic (all perc_nan, all std_deviation, ...)
    produced_columns = ['compared_resources'] + [f'{attribute}_{statistic}'
                                                 for statistic in statistics for attribute in attributes]
    # The final layout groups them by attribute. Generating the labels keeps
    # every attribute's five statistics together: the hand-written list used to
    # name 'Modell_Liste_median' in the Digital_zoom group, which duplicated
    # that column and dropped 'Digital_zoom_median' from the output.
    columns_to_order = ['compared_resources'] + [f'{attribute}_{statistic}'
                                                 for attribute in attributes for statistic in statistics]

    df = pd.DataFrame(feature_vectors_list, columns=produced_columns)
    df = df[columns_to_order]
    df.to_csv(os.path.join(path_to_feature_vectors_folder, output_file_name))

    return df,df['compared_resources']




In [None]:
def generate_data_representations_combined_features():
    """Build one feature vector per sim-vector CSV from eight per-column statistics.

    For every ``.csv`` file in ``path_to_sim_vector_folder`` the ten similarity
    columns listed in ``to_consider_columns`` are coerced to numeric and summarised
    by: percentage of NaN, min, mean, max, median, standard deviation, skewness and
    kurtosis. The number of columns holding at least one value is added as well.
    The resulting table is written to ``feature_vectors_combined_features.csv`` in
    ``path_to_feature_vectors_folder``.

    Returns:
        tuple: ``(df, df['compared_resources'])`` - the feature table (one row per
        file, columns grouped per attribute) and the column of file names.
    """
    # Remove a previously generated feature file (presumably what the helper does
    # when given a folder and a file name - it is defined elsewhere in the notebook).
    delete_files_from_folder(path_to_feature_vectors_folder,'feature_vectors_combined_features.csv')

    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    # Create an empty list to store feature vectors
    feature_vectors_list = []

    # Value written in place of statistics that could not be computed (NaN)
    nan_replacement_value = 99999

    # Similarity columns the statistics are computed on
    to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                           'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                           'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

    # Short attribute names, in the same order as to_consider_columns
    attribute_names = ['MPN_Liste', 'EAN_Liste', 'Produktname_dic3', 'Modell_Liste', 'Digital_zoom',
                       'optischer_zoom', 'Breite', 'Höhe', 'Gewicht', 'Sensortyp']

    # Statistic suffixes, in the same order as the Series are concatenated below
    statistic_suffixes = ['perc_nan', 'min', 'mean', 'max', 'median',
                          'std_deviation', 'skewness', 'kurtosis']

    leading_columns = ['compared_resources', 'Numer_of_columns_with_at_least_one_values']

    # Order in which the values arrive: statistic by statistic, all attributes each time
    raw_columns = leading_columns + [f'{attribute}_{suffix}'
                                     for suffix in statistic_suffixes
                                     for attribute in attribute_names]

    # Order of the final table: attribute by attribute, all statistics each time.
    # Generating the names avoids the missing commas and misspelled names a
    # hand-written list of 82 literals is prone to.
    columns_to_order = leading_columns + [f'{attribute}_{suffix}'
                                          for attribute in attribute_names
                                          for suffix in statistic_suffixes]

    # Iterate through each CSV file and generate statistics
    for csv_file in csv_files:
        file_path = os.path.join(path_to_sim_vector_folder, csv_file)
        df = pd.read_csv(file_path)
        df = df[to_consider_columns]
        # Non-numeric cells (e.g. "/") become NaN
        df = df.apply(pd.to_numeric, errors= 'coerce')

        # The number of columns that have at least one non-NaN value
        non_nan_columns_count = np.sum(df.notnull().any())
        # The percentage of NaN values in each column
        nan_percentage = ((df.isnull().sum() / len(df)) * 100).round(2)
        column_min = df.min().round(2)
        column_mean = df.mean().round(2)
        column_max = df.max().round(2)
        column_median = df.median().round(2)
        column_std_deviation = df.std().round(2)
        column_skewness = df.skew().round(2)
        column_kurtosis = df.kurt().round(2)

        # Series for the name of the sim-vector file
        file_name_series = pd.Series([csv_file], index=['file_name'])

        # Series for non_nan_columns_count
        non_nan_columns_count_series = pd.Series([non_nan_columns_count], index=['non_nan_columns_count'])

        # Concatenate ALL statistics; the order must match statistic_suffixes
        feature_vector = pd.concat([file_name_series, non_nan_columns_count_series,
                                    nan_percentage, column_min, column_mean, column_max, column_median,
                                    column_std_deviation, column_skewness, column_kurtosis], axis=0)

        # Replace NaN statistics (e.g. for all-NaN columns) with the placeholder value
        feature_vector = feature_vector.fillna(nan_replacement_value)

        feature_vectors_list.append(feature_vector.tolist())

    # Convert the list of lists to a DataFrame and regroup the columns per attribute
    df = pd.DataFrame(feature_vectors_list, columns=raw_columns)
    df = df[columns_to_order]
    df.to_csv(os.path.join(path_to_feature_vectors_folder,'feature_vectors_combined_features.csv'))

    return df,df['compared_resources']
        
    

In [None]:
# Exploratory cell: run a PCA on every sim-vector file and show how much variance
# each principal component explains.

# PCA is otherwise only imported by a later cell; importing it here keeps this cell
# runnable after "Restart Kernel -> Run All".
from sklearn.decomposition import PCA

# Get a list of all CSV files in the folder 
csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]
    
# Kept from the original cell; not used by the loop below
feature_vectors_list = []

# Value written in place of missing (NaN) similarity values
nan_replacement_value = 2 

# Similarity columns the PCA is computed on
to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30', 
                       'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']


for csv_file in csv_files:
    file_path = os.path.join(path_to_sim_vector_folder, csv_file)
    df = pd.read_csv(file_path)
    df = df[to_consider_columns]
    # Non-numeric cells (e.g. "/") become NaN
    df = df.apply(pd.to_numeric, errors= 'coerce')
    # Replace NaN values with the placeholder value
    df = df.fillna(nan_replacement_value)
    
    # Standardize the data (important for PCA)
    scaler = StandardScaler()
    df_standardized = scaler.fit_transform(df)
    
    # Apply PCA
    pca = PCA()
    df_pca = pca.fit_transform(df_standardized)

    # PCA() keeps min(n_rows, n_columns) components, so a file with fewer than ten
    # rows yields fewer than ten components; name exactly the ones that exist.
    pc_names = [f'PC{i}' for i in range(1, df_pca.shape[1] + 1)]
    df_pca_result = pd.DataFrame(data=df_pca, columns=pc_names)

    # Display the DataFrame with principal components
    print(df_pca_result)

    # Explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_
    print("Explained Variance Ratio:", explained_variance_ratio)
    
    # Plot explained variance
    plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio)
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.show()
    
    





In [None]:
# Self-contained PCA demonstration on a tiny hand-made table.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Three toy features with five observations each
data = dict(
    Feature1=[1, 2, 3, 4, 5],
    Feature2=[5, 4, 3, 2, 1],
    Feature3=[2, 3, 1, 4, 5],
)
df = pd.DataFrame(data)

# PCA is scale-sensitive, so bring every feature to zero mean / unit variance first
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df)

# Fit the PCA and project the standardized data onto the principal components
pca = PCA()
df_pca = pca.fit_transform(df_standardized)

# Wrap the projected data in a labelled DataFrame and show it
df_pca_result = pd.DataFrame(df_pca, columns=[f'PC{number}' for number in (1, 2, 3)])
print(df_pca_result)

# Share of the total variance captured by each component
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Bar chart: one bar per principal component, numbered from 1
plt.bar(range(1, 1 + len(explained_variance_ratio)), explained_variance_ratio)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.show()


In [None]:
# Example usage of the two-sample MMD test on synthetic data.
# NOTE(review): MMD_2_Sample_Test is not defined in the visible part of the notebook;
# presumably it is defined or imported in another cell - confirm that cell runs first.

# Generate two synthetic datasets (seeded so the example is reproducible)
np.random.seed(42)
data_x = np.random.randn(100, 2)  # 100 two-dimensional standard-normal samples
data_y = np.random.randn(100, 2) + 2  # 100 two-dimensional samples shifted by +2

# Perform MMD test
mmd_test = MMD_2_Sample_Test(data_x, data_y)
result = mmd_test.perform_test()

# Display the test result (the result is indexed with the keys 'testStat' and 'pValue')
print("MMD Statistic:", result['testStat'])
print("P-value:", result['pValue'])


In [None]:
def generate_data_representations_mean_embeddings():
    """Represent every sim-vector CSV by the mean of its random Fourier features.

    For each ``.csv`` file in ``path_to_sim_vector_folder`` the ten similarity columns
    are coerced to numeric, NaN is replaced by a placeholder, the rows are mapped
    through an ``RBFSampler`` and the mapped rows are averaged column-wise. The
    averaged vector is the feature vector of that file. The table is written to
    ``feature_vectors_mean_embeddings_features.csv`` in
    ``path_to_feature_vectors_folder`` and its head is printed.

    Returns:
        tuple: ``(df, df['feature_1'])``. ``feature_1`` holds the file name; the
        remaining ``feature_2 .. feature_101`` columns hold the averaged features.
    """
    # Remove a previously generated feature file (presumably what the helper does
    # when given a folder and a file name - it is defined elsewhere in the notebook).
    delete_files_from_folder(path_to_feature_vectors_folder,'feature_vectors_mean_embeddings_features.csv')

    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    # Create an empty list to store feature vectors
    feature_vectors_list = []

    # Value written in place of missing (NaN) similarity values
    nan_replacement_value = 2

    # Number of random features produced by the RBFSampler (its default value,
    # spelled out so the column names below are derived from the same number)
    n_random_features = 100

    # Similarity columns the embedding is computed on
    to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                           'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                           'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

    for csv_file in csv_files:
        file_path = os.path.join(path_to_sim_vector_folder, csv_file)
        df = pd.read_csv(file_path)
        df = df[to_consider_columns]
        # Non-numeric cells (e.g. "/") become NaN
        df = df.apply(pd.to_numeric, errors= 'coerce')
        # Replace NaN values with the placeholder value
        df = df.fillna(nan_replacement_value)

        # Map every row to random Fourier features; the fixed random_state makes
        # the mapping identical for every file, so the means are comparable
        rbf_feature = RBFSampler(gamma=1, n_components=n_random_features, random_state=1)
        df_rbf = rbf_feature.fit_transform(df.values)

        # Column-wise mean of the mapped rows = mean embedding of this file
        mean_embedding = df_rbf.mean(axis=0).tolist()
        # The file name goes first, followed by the embedding values
        mean_embedding.insert(0, csv_file)
        feature_vectors_list.append(mean_embedding)

    # One name column plus n_random_features value columns. Passing the names to the
    # constructor also yields a well-formed (empty) table when no CSV file exists.
    column_names = [f"feature_{i+1}" for i in range(n_random_features + 1)]
    df = pd.DataFrame(feature_vectors_list, columns=column_names)
    df.to_csv(os.path.join(path_to_feature_vectors_folder,'feature_vectors_mean_embeddings_features.csv'))
    print(df.head())
    return df,df['feature_1']
    

In [None]:
def test():
    """Build weighted feature vectors (five statistics per column) for every sim-vector CSV.

    For each ``.csv`` file in ``path_to_sim_vector_folder`` the ten similarity columns
    are summarised by percentage of NaN, min, mean, max and median, plus the number
    of columns holding at least one value. Every numeric feature is then multiplied
    by a fixed weight and the table is written to ``feature_vectors.csv`` in
    ``path_to_feature_vectors_folder``.

    Returns:
        tuple: ``(df, df['compared_resources'])`` - the weighted feature table and
        the column of file names.
    """
    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    # Create an empty list to store feature vectors
    feature_vectors_list = []

    # Value written in place of statistics that could not be computed (NaN)
    nan_replacement_value = 10

    # Similarity columns the statistics are computed on
    to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                           'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                           'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

    # Short attribute names, in the same order as to_consider_columns
    attribute_names = ['MPN_Liste', 'EAN_Liste', 'Produktname_dic3', 'Modell_Liste', 'Digital_zoom',
                       'optischer_zoom', 'Breite', 'Höhe', 'Gewicht', 'Sensortyp']

    # Statistic suffixes, in the same order as the Series are concatenated below
    statistic_suffixes = ['perc_nan', 'min', 'mean', 'max', 'median']

    leading_columns = ['compared_resources', 'Numer_of_columns_with_at_least_one_values']

    # Order in which the values arrive: statistic by statistic, all attributes each time
    raw_columns = leading_columns + [f'{attribute}_{suffix}'
                                     for suffix in statistic_suffixes
                                     for attribute in attribute_names]

    # Order of the final table: attribute by attribute, all statistics each time
    columns_to_order = leading_columns + [f'{attribute}_{suffix}'
                                          for attribute in attribute_names
                                          for suffix in statistic_suffixes]

    # Iterate through each CSV file and generate statistics
    for csv_file in csv_files:
        file_path = os.path.join(path_to_sim_vector_folder, csv_file)
        df = pd.read_csv(file_path)
        df = df[to_consider_columns]
        # Non-numeric cells (e.g. "/") become NaN
        df = df.apply(pd.to_numeric, errors= 'coerce')

        # The number of columns that have at least one non-NaN value
        non_nan_columns_count = np.sum(df.notnull().any())
        # The percentage of NaN values in each column
        nan_percentage = ((df.isnull().sum() / len(df)) * 100).round(2)
        column_min = df.min().round(2)
        column_mean = df.mean().round(2)
        column_max = df.max().round(2)
        column_median = df.median().round(2)

        # Series for the name of the sim-vector file
        file_name_series = pd.Series([csv_file], index=['file_name'])

        # Series for non_nan_columns_count
        non_nan_columns_count_series = pd.Series([non_nan_columns_count], index=['non_nan_columns_count'])

        # Concatenate statistics into a feature vector; order must match statistic_suffixes
        feature_vector = pd.concat([file_name_series,non_nan_columns_count_series,nan_percentage,column_min, column_mean, column_max, column_median], axis=0)

        # Replace NaN statistics with the placeholder value
        feature_vector = feature_vector.fillna(nan_replacement_value)

        feature_vectors_list.append(feature_vector.tolist())

    # Convert the list of lists to a DataFrame and regroup the columns per attribute
    df = pd.DataFrame(feature_vectors_list, columns=raw_columns)
    df = df[columns_to_order]

    # One weight per column of columns_to_order. The leading 0 belongs to the
    # 'compared_resources' name column and is deliberately NOT applied below.
    weights = [0, 4,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   # MPN_Liste and EAN_Liste statistics
               9, 9, 9, 9, 9,                  # Produktname_dic3 statistics
               7, 7, 7, 7, 7,                  # Modell_Liste statistics
               3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   # remaining six attributes
               3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
               3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

    # Weight only the numeric features. Multiplying the string column by its weight
    # of 0 would turn every file name into an empty string ('abc' * 0 == ''), which
    # would make the returned names useless for the clustering step.
    feature_columns = columns_to_order[1:]
    weighted = df[feature_columns] * weights[1:]
    weighted.insert(0, 'compared_resources', df['compared_resources'])
    df = weighted

    df.to_csv(os.path.join(path_to_feature_vectors_folder, 'feature_vectors.csv'))

    return df,df['compared_resources']


In [None]:
def generate_meta_data_representations():
    """Run the feature generator selected by ``initial_config['meta_data_representation']``.

    Options: 1 = initial features, 2 = better features, 3 = combined features,
    4 = mean embeddings.

    Returns:
        Whatever the selected generator returns (the generators visible in this
        notebook return a ``(feature_table, names)`` tuple).

    Raises:
        ValueError: if the configured option is not 1, 2, 3 or 4. Previously the
        function fell through and returned ``None``, which only surfaced later as
        an unpacking error at the call site.
    """
    option = initial_config['meta_data_representation']

    if option == 1:
        return generate_data_representation_initial_features()
    if option == 2:
        return generate_data_representation_better_features()
    if option == 3:
        return generate_data_representations_combined_features()
    if option == 4:
        return generate_data_representations_mean_embeddings()

    raise ValueError(
        f"Unsupported meta_data_representation option: {option!r} (expected 1, 2, 3 or 4)"
    )
    

# Clustering

# Selecting the number of clusters

### 1) Silhouette Score 
The silhouette score is a metric used to evaluate the quality of a clustering produced by a clustering algorithm. 
It measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette score ranges from -1 to 1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.

In [None]:
# Build the mean-embedding feature table; `sources` is the column of file names
# returned as the second element of the tuple.
data , sources = generate_data_representations_mean_embeddings()

# Initialize the StandardScaler (it is only created here, not fitted)
scaler = StandardScaler()


In [None]:
def get_best_number_of_clusters_using_silhouette_score(data):
    """Pick the number of k-means clusters with the highest average silhouette score.

    K-means is fitted for k = 2 .. 39 (capped at ``n_samples - 1``, the largest k
    the silhouette score is defined for). The scores are plotted against k and the
    best k is printed.

    Args:
        data: numeric feature matrix, one row per sample (array-like or DataFrame
            without a name column).

    Returns:
        tuple: ``(best_k, best_score)``.

    Raises:
        ValueError: if ``data`` has fewer than 3 rows, because no valid k exists.
    """
    data_array = data

    # silhouette_score needs at least 2 and at most n_samples - 1 clusters
    list_k = list(range(2, min(40, len(data_array))))
    if not list_k:
        raise ValueError("At least 3 samples are required to compute silhouette scores")

    silh_scores = []
    for k in list_k:
        # Fixed seed (same one predict_clusters uses) so re-running gives the same answer
        kmeans = KMeans(n_clusters=k, random_state=42)
        clusters = kmeans.fit_predict(data_array)
        silhouette_avg = silhouette_score(data_array, clusters)
        silh_scores.append(silhouette_avg)

    # Map the position of the best score back to its k. list_k starts at 2, so the
    # position itself (or position + 1) is NOT the number of clusters.
    highest_number = max(silh_scores)
    index_of_highest_number = list_k[silh_scores.index(highest_number)]
    print("The best number of clusters is {} with an average silhouette score of {}".format(index_of_highest_number,highest_number))
    
    # Plot silh_scores against k 
    plt.figure(figsize=(6,10))
    plt.plot(list_k, silh_scores, '-o')
    plt.xlabel(r'Number of clusters *k*')
    plt.ylabel('Avg Silhouette Scores')
    
    return index_of_highest_number, highest_number


# NOTE(review): generate_data_representation_very_new_test is not defined in the
# visible part of the notebook - confirm it exists (and runs) in an earlier cell.
data , sources = generate_data_representation_very_new_test()

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data; iloc[:,1:] skips the first column, which the visible
# generator functions use for the file names
X_standardized = scaler.fit_transform(data.iloc[:,1:])
 
# Evaluate candidate cluster counts on the standardized features
get_best_number_of_clusters_using_silhouette_score(X_standardized)



    

In [None]:
# NOTE(review): generate_data_representation_very_new_test is not defined in the
# visible part of the notebook - confirm it exists (and runs) in an earlier cell.
data , sources = generate_data_representation_very_new_test()

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data; iloc[:,1:] skips the first (name) column
X_standardized = scaler.fit_transform(data.iloc[:,1:])
# Last expression of the cell: displays the standardized matrix
X_standardized

### 2) WCSS stands for Within-Cluster Sum of Squares
It is a metric used to evaluate the performance of a clustering algorithm, such as k-means. 
In the context of k-means clustering, WCSS represents the sum of squared distances between each 
data point in a cluster and the centroid of that cluster.


In [None]:
def _find_elbow_index(values):
    """Return the position of the elbow of a curve given by equally spaced ``values``.

    The elbow is taken as the point with the largest perpendicular distance from the
    straight line joining the first and the last point, after scaling both axes to
    [0, 1]. With fewer than three points, or a flat curve, position 0 is returned.
    """
    values = np.asarray(values, dtype=float)
    if len(values) < 3:
        return 0
    span = values.max() - values.min()
    if span == 0:
        return 0
    x = np.linspace(0.0, 1.0, len(values))
    y = (values - values.min()) / span
    dx = x[-1] - x[0]
    dy = y[-1] - y[0]
    distances = np.abs(dy * (x - x[0]) - dx * (y - y[0])) / np.hypot(dx, dy)
    return int(np.argmax(distances))


def get_best_number_of_clusters_using_wcss_score(data):
    """Pick the number of k-means clusters at the elbow of the WCSS curve.

    K-means is fitted for k = 1 .. 19 (capped at the number of rows) on all columns
    of ``data`` except the first one, and the inertia (WCSS) of every fit is
    recorded and plotted.

    WCSS shrinks as k grows, so taking the minimum would almost always select the
    largest k tried. The elbow of the curve is returned instead.

    Args:
        data (pd.DataFrame): feature table whose first column is skipped.

    Returns:
        tuple: ``(best_k, wcss_at_best_k)``.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    data_array = data.iloc[:, 1:].values

    # k-means cannot build more clusters than there are samples
    list_k = list(range(1, min(20, len(data_array) + 1)))
    if not list_k:
        raise ValueError("At least 1 sample is required to compute WCSS scores")

    wcss = []
    for k in list_k:
        # Fixed seed (same one predict_clusters uses) so re-running gives the same curve
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(data_array)
        wcss.append(kmeans.inertia_)
    
    # Locate the elbow and map its position back to the corresponding k
    elbow_position = _find_elbow_index(wcss)
    index_of_highest_number = list_k[elbow_position]
    highest_number = wcss[elbow_position]
    print("The best number of clusters is {} with an wcss score of {}".format(index_of_highest_number,highest_number))

    plt.figure(figsize=(8, 6))
    plt.plot(list_k, wcss, marker='o', linestyle='--')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.title('Elbow Method')
    plt.show()
    
    return index_of_highest_number,highest_number 

In [None]:
# NOTE(review): generate_data_representation_very_new_test is not defined in the
# visible part of the notebook - confirm it exists (and runs) in an earlier cell.
data , sources = generate_data_representation_very_new_test()
# Fit and transform the data
# NOTE(review): `scaler` is not created in this cell; it relies on an earlier cell
# having run. X_standardized is computed here but not used by the call below.
X_standardized = scaler.fit_transform(data.iloc[:,1:])
# The WCSS helper receives the unscaled table and skips its first column itself
get_best_number_of_clusters_using_wcss_score(data)

# Predict the clusters for each data point

In [None]:
def predict_clusters(df,num_of_clusters,feature_vectors_list):
    """Cluster the feature vectors with k-means and group the names per cluster.

    Args:
        df (pd.DataFrame): feature table; the first column is skipped and all other
            columns are used as features. A ``'Cluster'`` column holding the
            assigned labels is added to this frame (the caller's frame is modified).
        num_of_clusters (int): number of k-means clusters.
        feature_vectors_list: names belonging to the rows of ``df``, in row order
            (list or Series of strings).

    Returns:
        dict: cluster number (plain ``int``) -> list of names in that cluster.
    """
    # Number of clusters
    n_clusters = num_of_clusters

    # Skip the first column, and also a 'Cluster' column left behind by an earlier
    # call on the same frame - otherwise the old labels would be used as a feature.
    feature_frame = df.iloc[:, 1:].drop(columns=['Cluster'], errors='ignore')

    # Run KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(feature_frame.values)

    clusters = kmeans.labels_
    print(clusters)

    # Positional access: a Series with a non-default index would otherwise be
    # looked up by label instead of by row position.
    names = list(feature_vectors_list)

    # Organize names by cluster. Keys are converted to plain ints because
    # models_training builds the model file name from str() of a list of these
    # keys, and a list of NumPy integers can render differently there
    # (newer NumPy versions print scalars inside a list as e.g. 'np.int32(0)').
    cluster_strings = {}
    for i, cluster in enumerate(clusters):
        cluster_strings.setdefault(int(cluster), []).append(names[i])

    # Print cluster number and the names belonging to it
    for cluster, strings in cluster_strings.items():
        print(f"Cluster Number: {cluster}")
        print(f"Elements: {', '.join(strings)}")
        print()

    # Add the cluster assignments to the DataFrame
    df['Cluster'] = clusters

    # Count the occurrences of each cluster 
    cluster_counts = np.bincount(clusters)

    # Print the cluster counts 
    print("\nNumber of elements in each cluster:")
    for cluster_num, count in enumerate(cluster_counts):
        print(f"Cluster {cluster_num}: {count} elements")
        
    return cluster_strings

# Modelling

# 1. Create XGBoost models trained on the largest data source in each cluster and save them in the models folder

In [None]:
def generate_models(cluster_strings):
    """Train one XGBoost classifier per cluster on the cluster's largest sim-vector file.

    For every cluster the CSV file with the most rows is selected from
    ``path_to_sim_vector_folder``, cleaned, optionally reduced by the instance-based
    filter, and used to fit an ``XGBClassifier``. The model is pickled to
    ``path_to_models_folders`` as ``Cluster_<cluster>.pkl``.

    Args:
        cluster_strings (dict): cluster number -> list of sim-vector file names.

    Returns:
        dict: model file name (``'Cluster_<cluster>.pkl'``) -> name of the CSV file
        the model was trained on. Clusters without a readable, non-empty file are
        skipped and do not appear in the mapping.
    """
    # Remove already created models.
    # NOTE(review): other call sites pass a file name as second argument; presumably
    # the one-argument form clears the whole folder - confirm against the helper.
    delete_files_from_folder(path_to_models_folders)

    cluster_max_row_file = {}

    # Identifier / helper columns that must not be used as features
    columns_to_drop = ['record_compared_1','record_compared_2','Modell_no_Liste_TruncateBegin20','Unnamed: 0','recId','recId.1']

    for cluster,files in cluster_strings.items():
        # 1. Find the file with the most rows in this cluster
        max_rows = 0
        max_rows_file = ""

        for file in files:
            try:
                df = pd.read_csv(path_to_sim_vector_folder + file)
                num_rows = len(df)
                if num_rows > max_rows:
                    max_rows = num_rows
                    max_rows_file = file
            except Exception as e:
                # Best effort: an unreadable file is reported and ignored
                print(f"Error reading {file}: {e}")

        print("cluster")
        print(cluster)
        print("max_file")
        print(max_rows_file)

        # Without a usable file, read_csv below would be pointed at the folder itself
        if not max_rows_file:
            print(f"No readable, non-empty sim-vector file for cluster {cluster}; no model trained")
            continue

        # 2. Load the selected file as training data
        training_df = pd.read_csv(path_to_sim_vector_folder + max_rows_file)

        print("shape")
        print(training_df.shape)
        print("count values is:")
        print(training_df['is_match'].value_counts())

        training_df = training_df.drop(columns=columns_to_drop)

        # Replace the "/" placeholder with 9999 (applies to every column)
        training_df = training_df.replace('/', 9999)

        # Convert all columns to numerical data types; anything else becomes NaN
        training_df = training_df.apply(pd.to_numeric, errors='coerce')

        if initial_config['training_instance_based']:
            training_df['Produktname_dic3'] = training_df['Produktname_dic3'].round(2)
            training_df['Modell_Liste_3g'] = training_df['Modell_Liste_3g'].round(2)
            columns_to_select = [col for col in training_df.columns if col != 'is_match']
            # calcualte_ratio is defined elsewhere in the notebook; it is called once
            # per row with the whole frame and the feature column names
            training_df['closest_match_ratio'] = training_df.apply(
                lambda row: calcualte_ratio(row, training_df, columns_to_select), axis=1)
            # Keep only rows whose ratio exceeds 0.5, then discard the helper column
            training_df = training_df.loc[training_df['closest_match_ratio'] > 0.5]
            training_df = training_df.drop(columns=['closest_match_ratio'])

        # Assumes the last remaining column is the target (is_match) - TODO confirm
        # against the CSV layout
        X = training_df.iloc[:, :-1]
        y = training_df.iloc[:, -1]

        # Create and train the XGBoost classifier
        model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
        model.fit(X, y)

        # Save the model; the with-block closes the file even if pickling fails
        file_name = path_to_models_folders + "Cluster_" +str(cluster) + '.pkl'
        with open(file_name, "wb") as model_file:
            pickle.dump(model, model_file)

        cluster_max_row_file['Cluster_'+str(cluster)+'.pkl'] = max_rows_file

    return cluster_max_row_file

    

# Train each sim_vec_file by the clusters model and evaluate the performance

In [None]:
def models_training(cluster_strings):
    """Score every similarity-vector CSV with every saved cluster model.

    Despite its name, this function does not fit anything: it loads the
    pickled models found in ``path_to_models_folders``, predicts the last
    column (``is_match``) of each CSV in ``path_to_sim_vector_folder`` and
    records the accuracy of every (file, model) combination.

    Args:
        cluster_strings: mapping of cluster key -> collection of CSV file
            names that belong to that cluster.

    Side effects:
        * For the model of the file's own cluster, the scored frame (with the
          ``pred`` and class-probability columns) is written to
          ``active_learning_path``.
        * A summary CSV (file, model, accuracy) is written to
          ``path_to_results_folder``; its name encodes ``initial_config``.
    """
    # Load every pickled cluster model once, keyed by its file name.
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # models that were produced by this notebook.
    loaded_models = {}
    for model_file in os.listdir(path_to_models_folders):
        if not model_file.endswith('.pkl'):
            continue
        model_path = os.path.join(path_to_models_folders, model_file)
        with open(model_path, "rb") as model_handle:
            loaded_models[model_file] = pickle.load(model_handle)

    sim_vec_file_to_process = []
    trained_models = []
    performance = []

    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

    for file_name in csv_files:
        print("The file to process is {}".format(file_name))
        # Keys of all clusters that contain this file; rendered into the
        # model file name exactly as the models were named when saved.
        first_matching_key = [key for key, value in cluster_strings.items() if file_name in value]
        sim_cluster_name = ('Cluster_'+str(first_matching_key)+'.pkl').replace('[','').replace(']','')
        print("The cluster name is {}".format(sim_cluster_name))

        full_path = os.path.join(path_to_sim_vector_folder, file_name)
        sim_vec_file = pd.read_csv(full_path)
        sim_vec_file = sim_vec_file.drop(
            columns=['record_compared_1','record_compared_2','Modell_no_Liste_TruncateBegin20','Unnamed: 0','recId','recId.1']
        )
        # Replace the "/" placeholder with 9999 in every column
        sim_vec_file = sim_vec_file.replace('/', 9999)
        # Convert all columns to numerical data types
        sim_vec_file = sim_vec_file.apply(pd.to_numeric, errors='coerce')
        # The last column is the target (is_match); everything before it is a feature.
        X_test = sim_vec_file.iloc[:, :-1]
        y_test = sim_vec_file.iloc[:, -1]

        for model_name, model in loaded_models.items():
            predictions = model.predict(X_test)
            class_probs = model.predict_proba(X_test)
            # X_test was sliced before these columns are added, so they never
            # leak into the features of the next model.
            sim_vec_file['pred'] = predictions
            sim_vec_file[['probabilties_0', 'probabilties_1']] = class_probs
            accuracy = accuracy_score(y_test, predictions)

            # Name of the CSV the model was trained on (filled by the model-generation step).
            training_file = cluster_max_row_file.get(model_name)
            is_own_cluster_model = model_name == sim_cluster_name

            sim_vec_file_to_process.append(file_name)
            if is_own_cluster_model:
                trained_models.append("Original cluster model " + training_file)
            else:
                trained_models.append(training_file)
            performance.append(accuracy)

            # Only the predictions of the file's own cluster model are persisted.
            if is_own_cluster_model:
                sim_vec_file.to_csv(active_learning_path + file_name + model_name + ".csv")

    # Create a DataFrame
    data = {'sim_vec_file': sim_vec_file_to_process, 'model': trained_models, 'performance': performance}
    df = pd.DataFrame(data)
    print(df.head())

    # Encode the active configuration in the result file name.
    if initial_config['data_duplicated']:
        file_name = "with_duplicated_data_"
    else:
        file_name = "without_duplicated_data_"

    file_name = file_name + meta_data_rep_mapping[initial_config['meta_data_representation']] + "_"

    if initial_config['training_instance_based']:
        file_name = file_name + 'instance_based.csv'
    else:
        file_name = file_name + 'not_instance_based.csv'

    df.to_csv(os.path.join(path_to_results_folder, file_name))
    
    
    


In [None]:
def performance_evaluation():
    """Print the mean of the 'performance' column of every results CSV.

    Every ``.csv`` file in ``path_to_results_folder`` is read; its file name
    (which encodes the configuration) and mean performance are printed.
    """
    for entry in os.listdir(path_to_results_folder):
        if not entry.endswith('.csv'):
            continue
        scores = pd.read_csv(os.path.join(path_to_results_folder, entry))
        print("The configuration is: ")
        print(entry)
        print("The mean performance is: ")
        print(scores['performance'].mean())

#performance_evaluation()

In [None]:
# Pairwise similarity between the meta-data representations of all files.
meta_data_rep,compared_resources = generate_data_representation_very_new()

# specify the path of the data folder 
path_to_data_folder = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/data/'

# Specify the folder path containing the csv files 
path_to_sim_vector_folder =  path_to_data_folder + 'sim_dataframes/'

# Get a list of all CSV files in the folder 
csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

# Columns to extract from feature extraction 
to_consider_columns = ['Produktname_dic3','Modell_Liste_3g']


first_file = []
second_file = []
similiarity_value = []

for index_1, row_1 in meta_data_rep.iterrows():
    resource_1 = row_1['compared_resources']
    # The file name comes from the row itself; no name from another cell is needed.
    print(f"First dataset is {resource_1} ")

    for index_2, row_2 in meta_data_rep.iterrows():
        resource_2 = row_2['compared_resources']
        if resource_1 == resource_2:
            continue
        print(f"second dataset is {resource_2}")
        first_file.append(resource_1)
        second_file.append(resource_2)
        # Only the representation rows are compared; the underlying CSVs are
        # not needed here, so they are no longer read for every pair.
        # assumes 'compared_resources' is the first column of meta_data_rep — TODO confirm
        similiarity_value.append(
            get_similairty_value(row_1.iloc[1:].tolist(), row_2.iloc[1:].tolist())
        )


df = pd.DataFrame({'first_file': first_file, 'second_file': second_file,'similiar':similiarity_value})

df.head()


#get_similairty_value




In [None]:
# Show only the file pairs whose 'similiar' value exceeds 0.8.
df.loc[df['similiar']>0.8]

In [None]:
# For every first_file keep the single row with the highest 'similiar' value.
idxmax_values = df.groupby(['first_file'])['similiar'].idxmax()
result_updated = df.loc[idxmax_values]
result_updated

In [None]:
# Count the rows of result_updated per second_file and list the most frequent ones first.
n = result_updated.groupby(['second_file']).count().reset_index()
n.sort_values(by='similiar',ascending=False)

In [None]:
# first_file values of the result_updated rows whose second_file is the buzzillions/cammarkt file.
buzzillions_cammarkt_mask = result_updated['second_file'] == 'www.buzzillions.com_cammarkt.com.csv'
result_updated.loc[buzzillions_cammarkt_mask, 'first_file'].tolist()

In [None]:
# Rows of result_updated whose second_file is the garricks/cammarkt file.
result_updated.loc[(result_updated['second_file']=='www.garricks.com.au_cammarkt.com.csv')]

In [None]:
# Rows of result_updated whose second_file is the gosale/eglobalcentral file.
result_updated.loc[(result_updated['second_file']=='www.gosale.com_www.eglobalcentral.co.uk.csv')]

In [None]:
# Imported here so this cell does not depend on a later cell having been run first.
from scipy.stats import ks_2samp

# Pairwise two-sample KS tests between a hand-picked list of files.
sources = ['cammarkt.com_www.pcconnection.com.csv',
           'www.wexphotographic.com_www.pcconnection.com.csv',
           'www.eglobalcentral.co.uk_www.pcconnection.com.csv',
           'www.eglobalcentral.co.uk_cammarkt.com.csv',
           'www.wexphotographic.com_www.eglobalcentral.co.uk.csv',
           'buy.net_www.wexphotographic.com.csv']


# specify the path of the data folder 
path_to_data_folder = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/data/'

# Specify the folder path containing the csv files 
path_to_sim_vector_folder =  path_to_data_folder + 'sim_dataframes/'

# Get a list of all CSV files in the folder 
csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

# Columns to extract from feature extraction 
to_consider_columns = ['Produktname_dic3','Modell_Liste_3g']

# Significance level: 0.05 split across the two KS tests run per file pair.
alpha = 0.05/2

# Read and coerce every source once instead of once per pair.
# NOTE(review): NaNs produced by the coercion are passed to ks_2samp unchanged
# (no fillna here) — confirm how the installed SciPy version treats them.
source_frames = {}
for csv_file in sources:
    raw_frame = pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_file))
    source_frames[csv_file] = raw_frame[to_consider_columns].apply(pd.to_numeric, errors = 'coerce')


first_file = []
second_file = []
ks_statistc_produkt_name = []
ks_statistic_model_list = []
similiar = []

for csv_file_1 in sources:
    print(f"First dataset is {csv_file_1} ")
    file_df_1 = source_frames[csv_file_1]

    for csv_file_2 in sources:
        if csv_file_1 == csv_file_2:
            continue
        print(f"second dataset is {csv_file_2}")
        file_df_2 = source_frames[csv_file_2]
        first_file.append(csv_file_1)
        second_file.append(csv_file_2)

        ks_statistic_produktname, ks_p_value_produktname = ks_2samp(file_df_1['Produktname_dic3'], file_df_2['Produktname_dic3'])
        ks_statistic_modell_list, ks_p_value_modell_list = ks_2samp(file_df_1['Modell_Liste_3g'], file_df_2['Modell_Liste_3g'])

        ks_statistc_produkt_name.append(ks_statistic_produktname)
        ks_statistic_model_list.append(ks_statistic_modell_list)

        # A pair counts as similar (1) only if neither test falls below alpha.
        values = [ks_p_value_produktname, ks_p_value_modell_list]
        any_value_below_threshold = any(p_value < alpha for p_value in values)
        similiar.append(0 if any_value_below_threshold else 1)


df = pd.DataFrame({'first_file': first_file, 'second_file': second_file,'ks_statistc_produkt_name':ks_statistc_produkt_name, 'ks_statistic_model_list':ks_statistic_model_list ,'similiar':similiar})







In [None]:
# Rows of result_updated for two specific second_file values.
first_df = result_updated[result_updated['second_file'] == 'www.buzzillions.com_cammarkt.com.csv']
second_df = result_updated[result_updated['second_file'] == 'www.garricks.com.au_cammarkt.com.csv']

# first_file values that occur in both selections; only their count is reported.
shared_first_files = set(first_df['first_file']).intersection(second_df['first_file'])
intersection_values = pd.Series(list(shared_first_files))
print(len(intersection_values))



In [None]:
# Rows of result_updated for one specific first_file, highest 'similiar' first.
d = result_updated.loc[(result_updated['first_file']=='www.wexphotographic.com_www.pcconnection.com.csv')]

d.sort_values(by='similiar',ascending=False)


# www.eglobalcentral.co.uk_www.priceme.co.nz.csv ==> buy.net_www.priceme.co.nz.csv ==> www.gosale.com_www.priceme.co.nz.csv
# www.gosale.com_www.eglobalcentral.co.uk.csv 


#buy.net_cammarkt.com.csv ==> www.garricks.com.au_cammarkt.com.csv ==> www.garricks.com.au_www.pcconnection.com.csv
 

# www.camerafarm.com.au_www.priceme.co.nz.csv ==> buy.net_www.camerafarm.com.au.csv ==> www.wexphotographic.com_www.henrys.com.csv
# www.camerafarm.com.au_www.henrys.com.csv 

# buy.net_www.camerafarm.com.au.csv ==> www.wexphotographic.com_www.henrys.com.csv ==> www.camerafarm.com.au_www.henrys.com.csv
#  


# 




In [None]:
# Sentinel written in place of NaN after numeric coercion
nan_replacement_value = 2

# Columns to extract from feature extraction
to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                       'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

# Columns that are finally kept for the comparison
col = ['Produktname_dic3', 'Modell_Liste_3g']

# Same pipeline for both files: select the feature columns, coerce to numeric
# ("/" becomes NaN), replace NaN with the sentinel, keep only `col`.
df_lead, df_test = (
    pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_name))[to_consider_columns]
    .apply(pd.to_numeric, errors= 'coerce')
    .fillna(nan_replacement_value)[col]
    for csv_name in ('buy.net_cammarkt.com.csv', 'www.garricks.com.au_cammarkt.com.csv')
)








In [None]:
# Summary statistics of the two retained columns of the lead file.
df_lead.describe()

In [None]:
# Summary statistics of the two retained columns of the test file.
df_test.describe()

In [None]:
# Rebuild the meta-data representation and display it.
meta_data_rep,compared_resources = generate_data_representation_very_new()
meta_data_rep

In [None]:
def main():
    """Build the meta-data representation, cluster it and return the clusters.

    Returns:
        Whatever ``predict_clusters`` returns for the chosen number of clusters.
    """
    # Meta-data representation of every compared-resources file
    meta_data_rep, compared_resources = generate_data_representation_very_new()
    # Choose the number of clusters; the second returned value (average
    # silhouette score) is not used here.
    number_of_clusters, _ = get_best_number_of_clusters_using_wcss_score(meta_data_rep)
    # Model generation (generate_models) and prediction (models_training) are
    # currently disabled; only the cluster assignment is returned.
    return predict_clusters(meta_data_rep, number_of_clusters, compared_resources)


clusters = main()





In [None]:
from scipy.stats import ks_2samp
# specify the path of the data folder 
path_to_data_folder = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/data/'

# Specify the folder path containing the csv files 
path_to_sim_vector_folder =  path_to_data_folder + 'sim_dataframes/'

# Columns to extract from feature extraction 
to_consider_columns = ['Produktname_dic3','Modell_Liste_3g']

# Significance level: 0.05 split across the two KS tests run per file pair.
alpha = 0.05/2

cluster = []
first_file = []
second_file = []
ks_statistc_produkt_name = []
ks_statistic_model_list = []
similiar = []
cluster_count = []

# Pairwise KS tests between all files that share a cluster.
for key, sources in clusters.items():
    # Read and coerce each file of the cluster once instead of once per pair.
    # NOTE(review): NaNs produced by the coercion are passed to ks_2samp
    # unchanged — confirm how the installed SciPy version treats them.
    cluster_frames = {}
    for csv_file in sources:
        raw_frame = pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_file))
        cluster_frames[csv_file] = raw_frame[to_consider_columns].apply(pd.to_numeric, errors = 'coerce')

    for csv_file_1 in sources:
        file_df_1 = cluster_frames[csv_file_1]

        for csv_file_2 in sources:
            if csv_file_1 == csv_file_2:
                continue
            file_df_2 = cluster_frames[csv_file_2]

            cluster.append(key)
            first_file.append(csv_file_1)
            second_file.append(csv_file_2)
            cluster_count.append(len(sources))

            ks_statistic_produktname, ks_p_value_produktname = ks_2samp(file_df_1['Produktname_dic3'], file_df_2['Produktname_dic3'])
            ks_statistic_modell_list, ks_p_value_modell_list = ks_2samp(file_df_1['Modell_Liste_3g'], file_df_2['Modell_Liste_3g'])

            ks_statistc_produkt_name.append(ks_statistic_produktname)
            ks_statistic_model_list.append(ks_statistic_modell_list)

            # A pair counts as similar (1) only if neither test falls below alpha.
            values = [ks_p_value_produktname, ks_p_value_modell_list]
            any_value_below_threshold = any(p_value < alpha for p_value in values)
            similiar.append(0 if any_value_below_threshold else 1)


df = pd.DataFrame({'cluster':cluster,'first_file': first_file, 'second_file': second_file,'ks_statistc_produkt_name':ks_statistc_produkt_name, 'ks_statistic_model_list':ks_statistic_model_list ,'similiar':similiar,'cluster_count':cluster_count})







In [None]:
# Display the per-pair KS results.
df

In [None]:
# Per (cluster, first_file): number of files it is similar to, and that
# number divided by the cluster size.
df_updated = (
    df.groupby(['cluster', 'first_file','cluster_count'])['similiar']
    .sum()
    .reset_index()
    .assign(percentage=lambda counts: counts['similiar'] / counts['cluster_count'])
)
idxmax_values = df_updated.groupby(['cluster','first_file'])['percentage'].idxmax()
# Use the index to get the corresponding rows
result = df_updated.loc[idxmax_values]

# Row count of every selected file, read from its CSV.
lengths = [
    len(pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_name)))
    for csv_name in result['first_file']
]
result = result.assign(lengths=lengths)

result.head()



In [None]:
# For each cluster keep the row with the highest 'percentage'.
idxmax_values = result.groupby(['cluster'])['percentage'].idxmax()
result_updated = result.loc[idxmax_values]
result_updated

In [None]:
# Sentinel written in place of NaN after numeric coercion
nan_replacement_value = 2

# Columns to extract from feature extraction
to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30',
                       'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

# Columns that are finally kept for the comparison
col = ['Produktname_dic3', 'Modell_Liste_3g']

# Same pipeline for both files: select the feature columns, coerce to numeric
# ("/" becomes NaN), replace NaN with the sentinel, keep only `col`.
df_lead, df_test = (
    pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_name))[to_consider_columns]
    .apply(pd.to_numeric, errors= 'coerce')
    .fillna(nan_replacement_value)[col]
    for csv_name in ('buy.net_www.ilgs.net.csv', 'www.camerafarm.com.au_cammarkt.com.csv')
)

# Two-sample KS test per retained column; only the p-values are used below.
ks_statistic, ks_p_value_produktname = ks_2samp(df_lead['Produktname_dic3'], df_test['Produktname_dic3'])
ks_statistic, ks_p_value_modell_list = ks_2samp(df_lead['Modell_Liste_3g'], df_test['Modell_Liste_3g'])
print(ks_p_value_produktname)
print(ks_p_value_modell_list)
values = [ks_p_value_produktname,ks_p_value_modell_list]

# Significance level: 0.05 split across the two tests.
alpha = 0.05/2

# Reject as soon as one of the p-values is below the threshold.
any_value_below_threshold = any(p_value < alpha for p_value in values)
print("reject null hypothesis" if any_value_below_threshold else "do not reject null hypothesis")





 





In [None]:
def _load_name_model_pairs(file_name):
    """Load one similarity-vector CSV and split it into features and label.

    Only 'Produktname_dic3', 'Modell_Liste_3g' and 'is_match' are kept. Every
    "/" placeholder (in any of the three columns) is replaced by 2 before the
    columns are coerced to numeric.

    Returns:
        (X, y): the two feature columns and the 'is_match' column.
    """
    pairs = pd.read_csv(path_to_sim_vector_folder + file_name)
    pairs = pairs[['Produktname_dic3', 'Modell_Liste_3g', 'is_match']]
    pairs = pairs.replace('/', 2).apply(pd.to_numeric, errors='coerce')
    return pairs.iloc[:, :-1], pairs.iloc[:, -1]


def _accuracy_per_file(model, file_names):
    """Return the accuracy of `model` on each of the given CSV files, in order."""
    accuracies = []
    for file_name in file_names:
        X_test, y_test = _load_name_model_pairs(file_name)
        accuracies.append(accuracy_score(y_test, model.predict(X_test)))
    return accuracies


# --- Cluster zero -----------------------------------------------------------
# Train an XGBoost classifier on the training file of the cluster.
X, y = _load_name_model_pairs('buy.net_www.ilgs.net.csv')
model_zero_cluter = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
model_zero_cluter.fit(X, y)

# Files to evaluate: every second_file paired with the chosen first_file in df.
files_to_predict_by_cluster_zero = df.loc[df['first_file']== 'www.price-hunt.com_cammarkt.com.csv']
files_zero = files_to_predict_by_cluster_zero['second_file'].tolist()
cluster_zero = [0] * len(files_zero)
performance_zero = _accuracy_per_file(model_zero_cluter, files_zero)

df_cluster_zero = pd.DataFrame({
        'cluster': cluster_zero,
        'files': files_zero,
        'performance': performance_zero
     })


# --- Cluster one ------------------------------------------------------------
# NOTE(review): this model is fitted on the same file as the cluster-zero
# model — confirm whether cluster one should use its own training file.
X, y = _load_name_model_pairs('buy.net_www.ilgs.net.csv')
model_one_cluter = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
model_one_cluter.fit(X, y)

files_to_predict_by_cluster_one = df.loc[df['first_file']== 'www.camerafarm.com.au_cammarkt.com.csv']
files_one = files_to_predict_by_cluster_one['second_file'].tolist()
# Label the rows with cluster 1 (they were previously labelled 0 by a copy-paste slip).
cluster_one = [1] * len(files_one)
performance_one = _accuracy_per_file(model_one_cluter, files_one)

df_cluster_one = pd.DataFrame({
        'cluster': cluster_one,
        'files': files_one,
        'performance': performance_one
     })




    

 
    
    
    
    

In [None]:
# Median accuracy over the files evaluated with the cluster-one model.
df_cluster_one['performance'].median()

In [None]:
# specify the path of the data folder 
path_to_data_folder = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/data/'

# Specify the folder path containing the csv files 
path_to_sim_vector_folder =  path_to_data_folder + 'sim_dataframes/'

# Get a list of all CSV files in the folder 
csv_files = [file for file in os.listdir(path_to_sim_vector_folder) if file.endswith('.csv')]

# Columns to extract from feature extraction 
to_consider_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30', 
                       'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

# Significance level: 0.05 split across one KS test per compared column (10 columns).
alpha = 0.05/len(to_consider_columns)

# Read, coerce and NaN-fill every file once instead of once per pair.
# All frames are kept in memory for the duration of the cell.
feature_frames = {}
for csv_file in csv_files:
    raw_frame = pd.read_csv(os.path.join(path_to_sim_vector_folder, csv_file))
    feature_frames[csv_file] = (
        raw_frame[to_consider_columns]
        .apply(pd.to_numeric, errors = 'coerce')
        .fillna(2)
    )


first_file = []
second_file = []
ks_statistc_produkt_name = []
ks_statistic_model_list = []
similiar = []

for csv_file_1 in csv_files:
    print(f"First dataset is {csv_file_1} ")
    file_df_1 = feature_frames[csv_file_1]

    for csv_file_2 in csv_files:
        if csv_file_1 == csv_file_2:
            continue
        print(f"second dataset is {csv_file_2}")
        file_df_2 = feature_frames[csv_file_2]
        first_file.append(csv_file_1)
        second_file.append(csv_file_2)

        # One two-sample KS test per column: column -> (statistic, p-value).
        ks_results = {
            column: ks_2samp(file_df_1[column], file_df_2[column])
            for column in to_consider_columns
        }

        # Only the statistics of the two name/model columns are reported.
        ks_statistc_produkt_name.append(ks_results['Produktname_dic3'][0])
        ks_statistic_model_list.append(ks_results['Modell_Liste_3g'][0])

        # A pair counts as similar (1) only if no column's p-value falls below alpha.
        any_value_below_threshold = any(p_value < alpha for _, p_value in ks_results.values())
        similiar.append(0 if any_value_below_threshold else 1)


df = pd.DataFrame({'first_file': first_file, 'second_file': second_file,'ks_statistc_produkt_name':ks_statistc_produkt_name, 'ks_statistic_model_list':ks_statistic_model_list ,'similiar':similiar})




In [None]:
# Build the test variant of the representation and write the first returned object to test_new.csv.
d, d_1 = generate_data_representation_very_new_test()
d.to_csv('test_new.csv')

# TEST VERY NEW 

# Import libraries

In [None]:
import os
import random

import community
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import statsmodels.api as sm
from networkx.algorithms.community import asyn_lpa_communities
from networkx.algorithms.community.centrality import girvan_newman
from scipy.stats import ks_2samp
from sklearn.metrics import precision_score, recall_score, f1_score

# Set Folder Paths

In [None]:
# Root folder that holds the project data.
path_to_data_folder = '/Users/abdulnaser/Desktop/Masterarbeit/metadatatransferlearning-main/meta_tl/data/'

# Folder containing the similarity-vector CSV files.
path_to_sim_vector_folder = path_to_data_folder + 'sim_dataframes/'

# Example file pairs (kept as a note only; they must stay commented out,
# otherwise the cell fails with a SyntaxError):
#   www.ebay.com_www.pcconnection.com.csv    www.camerafarm.com.au_cammarkt.com.csv
#   www.ebay.com_www.pcconnection.com.csv    www.camerafarm.com.au_www.priceme.co.nz.csv

In [None]:
# Load the two similarity-vector files whose 'Produktname_dic3' distributions are compared.
temp_data_1, temp_data_2 = (
    pd.read_csv(path_to_sim_vector_folder + csv_name)
    for csv_name in ('www.ebay.com_www.pcconnection.com.csv',
                     'www.camerafarm.com.au_cammarkt.com.csv')
)

# Empirical CDF of the compared column, one per file.
ecdf_1, ecdf_2 = (sm.distributions.ECDF(frame['Produktname_dic3'])
                  for frame in (temp_data_1, temp_data_2))

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1, y_values_2 = (np.interp(x_values, ecdf.x, ecdf.y, left=0, right=1)
                          for ecdf in (ecdf_1, ecdf_2))

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves and mark the location of the largest gap.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Produktname_dic3_camerafarm_cammarkt')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()


In [None]:
# Load the two similarity-vector files whose 'Produktname_dic3' distributions are compared.
temp_data_1, temp_data_2 = (
    pd.read_csv(path_to_sim_vector_folder + csv_name)
    for csv_name in ('www.ebay.com_www.pcconnection.com.csv',
                     'www.eglobalcentral.co.uk_www.priceme.co.nz.csv')
)

# Empirical CDF of the compared column, one per file.
ecdf_1, ecdf_2 = (sm.distributions.ECDF(frame['Produktname_dic3'])
                  for frame in (temp_data_1, temp_data_2))

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1, y_values_2 = (np.interp(x_values, ecdf.x, ecdf.y, left=0, right=1)
                          for ecdf in (ecdf_1, ecdf_2))

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves and mark the location of the largest gap.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Produktname_dic3_eglobalcentral_priceme')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()



In [None]:
# Load the two similarity-vector files whose 'Produktname_dic3' distributions are compared.
temp_data_1, temp_data_2 = (
    pd.read_csv(path_to_sim_vector_folder + csv_name)
    for csv_name in ('www.ebay.com_www.pcconnection.com.csv',
                     'www.eglobalcentral.co.uk_www.priceme.co.nz.csv')
)

# Empirical CDF of the compared column, one per file.
ecdf_1, ecdf_2 = (sm.distributions.ECDF(frame['Produktname_dic3'])
                  for frame in (temp_data_1, temp_data_2))

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1, y_values_2 = (np.interp(x_values, ecdf.x, ecdf.y, left=0, right=1)
                          for ecdf in (ecdf_1, ecdf_2))

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves and mark the location of the largest gap.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Produktname_dic3_eglobalcentral_priceme')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()



In [None]:
# Compare the 'Produktname_dic3' distributions of two similarity-vector files.
temp_data_1 = pd.read_csv(path_to_sim_vector_folder + 'www.ebay.com_www.pcconnection.com.csv')
temp_data_2 = pd.read_csv(path_to_sim_vector_folder + 'www.garricks.com.au_www.ebay.com.csv')

# Empirical CDF of the compared column, one per file.
ecdf_1 = sm.distributions.ECDF(temp_data_1['Produktname_dic3'])
ecdf_2 = sm.distributions.ECDF(temp_data_2['Produktname_dic3'])

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1 = np.interp(x_values, ecdf_1.x, ecdf_1.y, left=0, right=1)
y_values_2 = np.interp(x_values, ecdf_2.x, ecdf_2.y, left=0, right=1)

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves; the legend labels name the files that were actually loaded above.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Produktname_dic3_garricks_ebay')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()



In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read and prepare (numeric conversion, sparse columns removed) the two files to compare.
temp_data_1 = prepare_dataframe_to_similarity_comparison(path_to_sim_vector_folder + 'www.ebay.com_www.pcconnection.com.csv')
temp_data_2 = prepare_dataframe_to_similarity_comparison(path_to_sim_vector_folder + 'www.garricks.com.au_www.ebay.com.csv')

# Drop rows with NaN values in the columns listed in `numeric_columns`.
# NOTE(review): `numeric_columns` is not defined in this cell; it must exist in the kernel.
temp_data_1 = temp_data_1.dropna(subset=numeric_columns)
temp_data_2 = temp_data_2.dropna(subset=numeric_columns)

# Empirical CDF of the compared column, one per file.
ecdf_1 = sm.distributions.ECDF(temp_data_1['Modell_Liste_3g'])
ecdf_2 = sm.distributions.ECDF(temp_data_2['Modell_Liste_3g'])

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1 = np.interp(x_values, ecdf_1.x, ecdf_1.y, left=0, right=1)
y_values_2 = np.interp(x_values, ecdf_2.x, ecdf_2.y, left=0, right=1)

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves; the legend labels name the files that were actually loaded above.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Modell_Liste_3g_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Modell_Liste_3g_garricks_ebay')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()


In [None]:
# Compare the 'Produktname_dic3' distributions of two similarity-vector files.
temp_data_1 = pd.read_csv(path_to_sim_vector_folder + 'www.garricks.com.au_www.ebay.com.csv')
temp_data_2 = pd.read_csv(path_to_sim_vector_folder + 'www.ebay.com_www.eglobalcentral.co.uk.csv')

# Empirical CDF of the compared column, one per file.
ecdf_1 = sm.distributions.ECDF(temp_data_1['Produktname_dic3'])
ecdf_2 = sm.distributions.ECDF(temp_data_2['Produktname_dic3'])

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1 = np.interp(x_values, ecdf_1.x, ecdf_1.y, left=0, right=1)
y_values_2 = np.interp(x_values, ecdf_2.x, ecdf_2.y, left=0, right=1)

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves; the legend labels name the files that were actually loaded above.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_garricks_ebay')
ax.plot(x_values, y_values_2, label='Produktname_dic3_ebay_eglobalcentral')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()



In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read and prepare (numeric conversion, sparse columns removed) the two files to compare.
temp_data_1 = prepare_dataframe_to_similarity_comparison(path_to_sim_vector_folder + 'www.garricks.com.au_www.ebay.com.csv')
temp_data_2 = prepare_dataframe_to_similarity_comparison(path_to_sim_vector_folder + 'www.ebay.com_www.eglobalcentral.co.uk.csv')

# Drop rows with NaN values in the columns listed in `numeric_columns`.
# NOTE(review): `numeric_columns` is not defined in this cell; it must exist in the kernel.
temp_data_1 = temp_data_1.dropna(subset=numeric_columns)
temp_data_2 = temp_data_2.dropna(subset=numeric_columns)

# Empirical CDF of the compared column, one per file.
ecdf_1 = sm.distributions.ECDF(temp_data_1['Modell_Liste_3g'])
ecdf_2 = sm.distributions.ECDF(temp_data_2['Modell_Liste_3g'])

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1 = np.interp(x_values, ecdf_1.x, ecdf_1.y, left=0, right=1)
y_values_2 = np.interp(x_values, ecdf_2.x, ecdf_2.y, left=0, right=1)

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves; the legend labels name the files that were actually loaded above.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Modell_Liste_3g_garricks_ebay')
ax.plot(x_values, y_values_2, label='Modell_Liste_3g_ebay_eglobalcentral')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()


In [None]:
# Compare the 'Produktname_dic3' distributions of two similarity-vector files.
temp_data_1 = pd.read_csv(path_to_sim_vector_folder + 'www.ebay.com_www.pcconnection.com.csv')
temp_data_2 = pd.read_csv(path_to_sim_vector_folder + 'www.ebay.com_www.eglobalcentral.co.uk.csv')

# Empirical CDF of the compared column, one per file.
ecdf_1 = sm.distributions.ECDF(temp_data_1['Produktname_dic3'])
ecdf_2 = sm.distributions.ECDF(temp_data_2['Produktname_dic3'])

# Evaluate both CDFs on the union of their support points (linear interpolation in between).
x_values = np.unique(np.concatenate([ecdf_1.x, ecdf_2.x]))
y_values_1 = np.interp(x_values, ecdf_1.x, ecdf_1.y, left=0, right=1)
y_values_2 = np.interp(x_values, ecdf_2.x, ecdf_2.y, left=0, right=1)

# Largest vertical gap between the two curves and the position where it occurs.
abs_diff = np.abs(y_values_1 - y_values_2)
max_abs_diff = abs_diff.max()
max_diff_index = abs_diff.argmax()
print(max_abs_diff)

# Draw both curves; the legend labels name the files that were actually loaded above.
fig, ax = plt.subplots(figsize=(10, 6))  # width, height in inches
ax.plot(x_values, y_values_1, label='Produktname_dic3_ebay_pcconnection')
ax.plot(x_values, y_values_2, label='Produktname_dic3_ebay_eglobalcentral')
ax.axvline(x=x_values[max_diff_index], color='red', linestyle='--', label='Max Difference')
ax.set_xlabel('Similarity Value')
ax.set_ylabel('Cumulative Probability')
ax.set_title('Interpolated Cumulative Distribution Function (CDF) with Max Difference Line')
ax.legend()
plt.show()



In [None]:
# Two-sample Kolmogorov-Smirnov test on the 'Produktname_dic3' columns of whatever
# `temp_data_1` / `temp_data_2` currently hold in the kernel; the result is shown as cell output.
ks_2samp(temp_data_1['Produktname_dic3'], temp_data_2['Produktname_dic3'])

In [None]:
# Remove the rows of temp_data_2 that have no 'Modell_Liste_3g' value.
# dropna(inplace=True) on a column selected from the frame acts on that Series only and
# does not reliably filter the DataFrame, so the frame itself is filtered and re-bound.
temp_data_2 = temp_data_2.dropna(subset=['Modell_Liste_3g'])


In [None]:
# Display the non-NaN entries of 'Modell_Liste_3g'; dropna() returns a new Series,
# so temp_data_2 itself is not changed by this cell.
temp_data_2['Modell_Liste_3g'].dropna()


In [None]:
# Attach the number of rows of each referenced CSV file to the community dataframe.
rows_per_file = {}  # file name -> row count, so every CSV is read only once


def _count_rows(file_name):
    """Return the number of data rows of a CSV stored in the similarity-vector folder."""
    if file_name not in rows_per_file:
        file_path = os.path.join(path_to_sim_vector_folder, file_name)
        rows_per_file[file_name] = pd.read_csv(file_path).shape[0]
    return rows_per_file[file_name]


first_files_rows_nums = [_count_rows(name) for name in similarity_df_second_community['first_file']]
second_files_rows_nums = [_count_rows(name) for name in similarity_df_second_community['second_file']]

# The lists are positionally aligned with the rows of the dataframe.
similarity_df_second_community['first_file_rows_num'] = first_files_rows_nums
similarity_df_second_community['second_file_rows_num'] = second_files_rows_nums

similarity_df_second_community.head()


    

In [None]:
# Assign every 'second_file' to exactly one 'first_file': among all first files it is
# paired with, the one with the largest row count wins (the earliest one on ties).
file_belong_to = {}
unique_first_file_values = similarity_df_second_community['first_file'].unique()
for file in unique_first_file_values:
    is_current_file = similarity_df_second_community['first_file'] == file
    temp_second_file_df = similarity_df_second_community.loc[is_current_file][['second_file', 'first_file_rows_num']]
    for second_file, candidate_rows in zip(temp_second_file_df['second_file'],
                                           temp_second_file_df['first_file_rows_num']):
        current_owner = file_belong_to.get(second_file)
        # Take over when the file is unassigned or the current owner has strictly fewer rows.
        if current_owner is None or candidate_rows > current_owner[1]:
            file_belong_to[second_file] = [file, candidate_rows]


# Unpack the assignment into three parallel lists (owner, owned file, owner's row count).
belong_to = list(file_belong_to.keys())
files = [owner for owner, _ in file_belong_to.values()]
rows_num = [owner_rows for _, owner_rows in file_belong_to.values()]


# One row per assignment, then the number of assigned files per owner, largest first.
cluster_files_df = pd.DataFrame({'first_file': files, 'belong_to': belong_to, 'rows_num': rows_num})
cluster_files_df_grouped = (
    cluster_files_df
    .groupby(['first_file', 'rows_num'])
    .count()
    .reset_index()
    .sort_values(by='belong_to', ascending=False)
)
cluster_files_df_grouped
        
        

In [None]:
# One row per (owner file, assigned file) pair built from the three parallel lists.
cluster_files_df = pd.DataFrame({'first_file': files, 'belong_to': belong_to, 'rows_num': rows_num})

# Count the assigned files per owner (and its row count) and list the largest groups first.
cluster_files_df_grouped = (
    cluster_files_df
    .groupby(['first_file', 'rows_num'])
    .count()
    .reset_index()
    .sort_values(by='belong_to', ascending=False)
)
cluster_files_df_grouped



In [None]:
# Feature columns followed by the 'is_match' label (not read by this cell itself).
columns_to_consider = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 
                       'Breite_NumMaxProz30','Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 
                       'Sensortyp_Jaccard3','is_match']

# F1-score of every evaluated file after the simulated labelling step.
# The name is kept because a later cell averages this list.
overall_accuracy = []

# A prediction counts as "strong" when the gap between the two class probabilities
# exceeds this value; all other predictions are treated as uncertain ("weak").
threshold = 0.94
counter = 1  # not read in this cell; kept because the cell has always defined it

for index, row in cluster_files_df_grouped.iterrows():
    to_process_file = row['first_file']
    print(f"To process training file is {to_process_file}")
    coressponding_to_process_files = cluster_files_df.loc[cluster_files_df['first_file'] == to_process_file]
    to_process_files_sorted = coressponding_to_process_files.sort_values(by='rows_num', ascending=False)['belong_to']

    # Train one model on the owner file of the current group.
    cluster_file = pd.read_csv(path_to_sim_vector_folder + to_process_file)
    cluster_df = prepare_dataframe(cluster_file)

    X = cluster_df.iloc[:, :-1]  # features (all columns except the last one)
    y = cluster_df.iloc[:, -1]   # target variable (is_match)
    model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
    model.fit(X, y)

    # Evaluate the model on every file assigned to the owner file.
    for file in to_process_files_sorted:
        file_data = pd.read_csv(path_to_sim_vector_folder + file)
        file_df = prepare_dataframe(file_data)
        X = file_df.iloc[:, :-1]  # features (all columns except the last one)
        y = file_df.iloc[:, -1]   # target variable (is_match)

        # Hard predictions and class probabilities.
        predictions = model.predict(X)
        class_probs = model.predict_proba(X)
        file_df['pred'] = predictions
        file_df[['probabilties_0', 'probabilties_1']] = class_probs

        # Absolute difference between the two class probabilities as a confidence measure.
        file_df['AbsDiff'] = abs(file_df['probabilties_0'] - file_df['probabilties_1'])
        file_df['signifikant_diff'] = (file_df['AbsDiff'] > threshold).astype(int)
        strong_preds = file_df.loc[file_df['signifikant_diff']==1].shape[0]
        weak_preds = file_df.loc[file_df['signifikant_diff']==0].shape[0]
        # Without any strong prediction the ratio is unbounded; report inf instead of
        # aborting the whole run with a ZeroDivisionError.
        weak_to_strong_ratio = weak_preds / strong_preds if strong_preds else float('inf')
        print(f"The ratio of weak to strong preds is {weak_to_strong_ratio}")

        # F1-score of the raw model predictions.
        F1 = f1_score(file_df['is_match'], file_df['pred'])
        print(f"The F1-score before active learning is {F1}")

        # Simulated active learning: every weak prediction is replaced by its true label.
        file_df.loc[file_df['signifikant_diff']==0, 'pred'] = file_df['is_match']

        F1 = f1_score(file_df['is_match'], file_df['pred'])
        print(f"The F1-score after active learning is {F1}")
        overall_accuracy.append(F1)
     
     
    
    
    

In [None]:
# Mean of the per-file scores collected in `overall_accuracy`
# (the training loop appends F1-scores to it, despite the name).
np.mean(overall_accuracy)

In [None]:
def compute_mean_similarity(values_list):
    """Return the weighted mean of ``1 - value`` over the entries of ``values_list``.

    Entries whose converted value equals 3 (i.e. an input of -2) are left out together
    with their weight.
    NOTE(review): -2 presumably marks a feature that could not be compared - confirm.

    values_list: sequence of numbers, one per weight (ten weights are defined).
    Returns the weighted average as computed by ``np.average``.
    """
    weights = [30, 30, 10, 10, 2, 2, 2, 2, 2, 2]

    similarities = [1 - value for value in values_list]

    # Positions that carry the skip marker.
    skipped_positions = {position for position, similarity in enumerate(similarities)
                         if similarity == 3}

    kept_similarities = []
    for position, similarity in enumerate(similarities):
        if position not in skipped_positions:
            kept_similarities.append(similarity)

    kept_weights = []
    for position, weight in enumerate(weights):
        if position not in skipped_positions:
            kept_weights.append(weight)

    return np.average(kept_similarities, weights=kept_weights)




In [None]:
def prepare_dataframe_to_similarity_comparison(file_path):
    """Load a similarity-vector CSV and return its usable numeric feature columns.

    file_path: path of the CSV file to read.
    Returns a DataFrame with the ten feature columns converted to numbers (anything
    non-numeric, e.g. "/", becomes NaN), minus every column whose share of NaN
    values is 70 % or more.
    """
    # Columns whose NaN share reaches this percentage are discarded.
    threshold_percentage = 70

    feature_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                       'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30', 
                       'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3']

    # Keep only the feature columns and coerce every non-numeric entry to NaN.
    numeric_features = pd.read_csv(file_path)[feature_columns].apply(pd.to_numeric, errors='coerce')

    # Share of NaN values per column decides which columns survive.
    nan_share = numeric_features.isna().mean()
    dense_enough = nan_share < threshold_percentage / 100

    return numeric_features.loc[:, dense_enough]
    
     

    

In [None]:
def prepare_dataframe(df):
    """Return the numeric feature/label frame used for training and prediction.

    df: DataFrame read from a similarity-vector CSV. It must contain the ten feature
        columns and 'is_match'; any further columns (e.g. 'record_compared_1') are
        ignored. The passed frame is not modified.
    Returns a new DataFrame with the eleven columns in a fixed order, where every "/"
    is replaced by 2 and all remaining non-numeric entries become NaN.
    Raises KeyError if one of the eleven required columns is missing.
    """
    selected_columns = ['MPN_Liste_TruncateBegin20','EAN_Liste_TruncateBegin20','Produktname_dic3',
                        'Modell_Liste_3g','Digital_zoom_NumMaxProz30','optischer_zoom_NumMaxProz30', 'Breite_NumMaxProz30', 
                        'Höhe_NumMaxProz30', 'Gewicht_NumMaxProz30', 'Sensortyp_Jaccard3', 'is_match']

    # Selecting the columns already discards everything else, so no separate drop is
    # needed; replace() without inplace returns a new frame and leaves `df` untouched.
    # "/" is mapped to 2 in every column.
    prepared = df[selected_columns].replace('/', 2)

    # Convert all columns to numerical data types; leftovers that are not numbers become NaN.
    return prepared.apply(pd.to_numeric, errors='coerce')

    