# PRE-PROCESSING

In [52]:
import os
import json
from collections import defaultdict


In [19]:
def collect_attributes(base_path):
    """Collect up to two example values per attribute for every source.

    ``base_path`` is scanned for sub-folders (one per source). Every ``.json``
    file directly inside a source folder is loaded as one record, and for each
    key of the record at most two example values are kept.

    Folders and files are visited in sorted name order so that the sampled
    example values are the same on every run and on every platform.

    Parameters
    ----------
    base_path : str
        Folder that contains one sub-folder per source.

    Returns
    -------
    defaultdict
        ``{source: {attribute: [value, ...]}}`` with at most two values per
        attribute. Files that cannot be decoded, or whose top-level JSON value
        is not an object, are reported on stdout and skipped.
    """
    sources_attributes = defaultdict(lambda: defaultdict(list))

    for source in sorted(os.listdir(base_path)):
        source_path = os.path.join(base_path, source)
        # Only folders are sources; stray files in base_path are ignored
        if not os.path.isdir(source_path):
            continue

        for file_name in sorted(os.listdir(source_path)):
            file_path = os.path.join(source_path, file_name)
            if not (os.path.isfile(file_path) and file_name.endswith('.json')):
                continue

            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    record = json.load(file)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print(f"Errore nel decodificare il file: {file_path}")
                continue

            # A record must be a JSON object; lists/scalars have no attributes
            if not isinstance(record, dict):
                print(f"Skipping file whose top-level JSON value is not an object: {file_path}")
                continue

            for key, value in record.items():
                examples = sources_attributes[source][key]
                # Keep only the first 2 example values of each attribute
                if len(examples) < 2:
                    examples.append(value)

    return sources_attributes

In [20]:
# Smoke test for collect_attributes: list the attribute names found in each source
base_path = './sources'
attributes_per_source = collect_attributes(base_path)

for source, attributes in attributes_per_source.items():
    print(f"Sorgente: {source}\nAttributi trovati:")
    for attribute in attributes:
        print("  - " + str(attribute))

Sorgente: ca.pcpartpicker.com
Attributi trovati:
  - <page title>
  - aspect ratio
  - builtin speakers
  - contrast ratio
  - display colors
  - dvi
  - hdmi
  - ips
  - led
  - manufacturer
  - part
  - recommended resolution
  - refresh rate
  - screen size
  - viewing angle
  - widescreen
  - brightness
  - displayport
  - model
  - response time
  - vga
  - component
  - dvid duallink
  - svideo
  - minidisplay port
Sorgente: catalog.com
Attributi trovati:
  - <page title>
  - aspect ratio
  - contrast ratio
  - display technology
  - max resolution
  - product line
  - refresh rate
  - max viewing angle horizontal
  - max viewing angle vertical
  - regulatory approval
  - synchronization range  horizontal
  - system
  - product features
  - color support
  - synchronization range  vertical
Sorgente: ce.yikus.com
Attributi trovati:
  - <page title>
  - brand
  - category
  - ean
  - feature
  - group
  - hardwareplatform
  - label
  - manufacturer
  - model
  - mpn
  - operatingsy

In [5]:
attributes_per_source

defaultdict(<function __main__.collect_attributes.<locals>.<lambda>()>,
            {'ca.pcpartpicker.com': defaultdict(list,
                         {'<page title>': ['Planar 997-7273-00 60Hz 32.0" Monitor (997-7273-00) - PCPartPicker Canada',
                           'Lenovo LT1952p 19.0" Monitor (2448MB6) - PCPartPicker Canada'],
                          'aspect ratio': ['16:9', '16:10'],
                          'builtin speakers': ['No', 'No'],
                          'contrast ratio': ['3000:1', '1000:1'],
                          'display colors': ['16700000', '16700000'],
                          'dvi': ['1', '1'],
                          'hdmi': ['1', '1'],
                          'ips': ['No', 'No'],
                          'led': ['No', 'Yes'],
                          'manufacturer': ['Planar', 'Lenovo'],
                          'part': ['997-7273-00', '2448MB6'],
                          'recommended resolution': ['1920 x 1080',
                         

In [6]:
len(attributes_per_source)

26

In [17]:
def save_attributes_to_json(attributes, output_file):
    """Serialize the attribute dictionary to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and nesting levels
    are indented by 4 spaces.
    """
    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(attributes, ensure_ascii=False, indent=4))


In [18]:
save_attributes_to_json(attributes_per_source, "attributes_per_source.json")

## Indexing phase for attribute matching with an LLM

In [21]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from groq import Groq

In [22]:
len(attributes_per_source)

26

In [23]:
# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no credential is stored in the notebook (a missing variable raises KeyError).
# NOTE(review): the key that used to be hard-coded here has been exposed and should be revoked.
chat = ChatGroq(temperature=0, groq_api_key=os.environ["GROQ_API_KEY"], model_name="llama3-8b-8192")

In [8]:
str(attributes_per_source["ca.pcpartpicker.com"]["<page title>"])

'[\'Planar 997-7273-00 60Hz 32.0" Monitor (997-7273-00) - PCPartPicker Canada\', \'Lenovo LT1952p 19.0" Monitor (2448MB6) - PCPartPicker Canada\']'

In [25]:
# Ask the LLM (model Llama3, served through Groq) for a description of up to
# 7 words for each field of each source. After every source the descriptions
# collected so far are checkpointed to a JSON file, so an interrupted run
# keeps the sources that were already completed.

# The prompt and the chain do not depend on the attribute: build them once
template = """You are an assistant who must help me to analyze the fields of a table concerning monitor characteristics.
            Based only on the values of the field, you have to give us a description in natural language of up to 7 words for this field.
            All I want to output is ONLY the description of the field.
            You have to return only the description without any messages.
            If field is empty, not return the description for this field
            Do what I said on this field with few values and the name of the field: Name: {name}, Values: {values}
            """
prompt = ChatPromptTemplate.from_messages([("human", template)])
chain = prompt | chat

response_dict = defaultdict(dict)
for source, attributes in attributes_per_source.items():
    print(source)
    for attribute, values in attributes.items():
        values_string = str(values)
        response = chain.invoke({"name": attribute, "values": values_string})
        response_dict[source][attribute] = response.content

    # Load the checkpoint written so far; start from scratch on the first run
    if os.path.exists("attribute_description_2.json"):
        with open("attribute_description_2.json", 'r', encoding='utf-8') as file:
            data = json.load(file)
    else:
        data = {}

    data[source] = response_dict[source]

    # Save the updated dictionary in the JSON file
    with open("attribute_description_2.json", 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)



ca.pcpartpicker.com
catalog.com
ce.yikus.com
www.best-deal-items.com
www.cleverboxes.com
www.ebay.com
www.getprice.com.au
www.hardware-planet.it
www.imldirect.it
www.itenergy.co.uk
www.jrlinton.co.uk
www.kingsfieldcomputers.co.uk
www.makingbuyingeasy.co.uk
www.mediashopuk.com


KeyboardInterrupt: 

In [33]:
# Load the attribute descriptions of the mediated (target) schema
with open("attributes_description_mediated_schema.json", mode='r', encoding='utf-8') as schema_file:
    mediated_schema = json.load(schema_file)

In [34]:
mediated_schema

{'mediated_schema': {'bluetooth': 'Bluetooth capability of the monitor',
  'brand': 'Brand of the monitor',
  'color': 'Color of the monitor',
  'color_feet': "Color of the monitor's feet",
  'contrast_ratio_dynamic': 'Dynamic contrast ratio',
  'contrast_ratio_static': 'Static contrast ratio',
  'digital_horizontal_refresh_rate_max': 'Max horizontal refresh rate',
  'digital_horizontal_refresh_rate_range': 'Range of horizontal refresh rate',
  'digital_vertical_refresh_rate_max': 'Max vertical refresh rate',
  'digital_vertical_refresh_rate_range': 'Range of vertical refresh rate',
  'displayport_quantity': 'Number of DisplayPort inputs',
  'dvi_port_quantity': 'Number of DVI ports',
  'dvid_port_quantity': 'Number of DVI-D ports',
  'dvii_port_quantity': 'Number of DVI-I ports',
  'ethernet_port_quantity': 'Number of Ethernet ports',
  'has_displayport': 'Has DisplayPort input',
  'has_dvi_port': 'Has DVI port',
  'has_dvid_port': 'Has DVI-D port',
  'has_ethernet_port': 'Has Etherne

### ATTRIBUTE EMBEDDINGS

In [27]:
import ast
from transformers import AutoTokenizer, AutoModel
import torch
import numpy

  from .autonotebook import tqdm as notebook_tqdm


In [53]:
# Load the per-source attribute descriptions
with open("attributes_description.json", mode='r', encoding='utf-8') as descriptions_file:
    table_attributes_description = json.load(descriptions_file)


In [54]:
# Load the tokenizer and the encoder of the embeddings model from the same checkpoint
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [55]:
def embed_sentence(sentence):
    """Return the vector that the embeddings model associates with ``sentence``.

    The input is tokenized (truncated to at most 128 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged
    over dimension 1. The squeezed result is returned as a NumPy array.
    """
    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

In [64]:
# Scratch cell: shows that slicing with [:7] keeps the first 7 characters of a string
string = "i love youuuuuuuuu"
len(string) > 2  # value is discarded: only the last expression of the cell is displayed
string = string[:7]
string

'i love '

In [65]:
def generate_jsonfile_with_embeddings(file_path, table_attributes_description):
    """Write one JSON document, including its embedding, for every attribute.

    Parameters
    ----------
    file_path : str
        Output folder; it is created if it does not exist.
    table_attributes_description : dict
        ``{table: {attribute: description}}``.

    Each document has the fields ``table``, ``nameAttribute``,
    ``descriptionAttribute`` and ``embeddingAttribute`` (the embedding of
    ``"<attribute> <description>"`` as a list of floats).

    File names are ``<table>_<attribute>.json`` with spaces, ``/`` and ``-``
    replaced by ``_`` and ``<``/``>`` removed. Attribute names longer than 20
    characters are cut to 15 characters; because different attributes can
    share the same prefix (e.g. ``max viewing angle horizontal`` and
    ``max viewing angle vertical``), a short hash of the full name is appended
    in that case, and whenever a name was already used in this call, so that
    no document silently overwrites another one.

    Note: files left in ``file_path`` by earlier runs are not removed.
    """
    import hashlib  # local import: only needed to disambiguate file names

    os.makedirs(file_path, exist_ok=True)
    used_file_names = set()

    for table, attributes in table_attributes_description.items():
        for attribute, description in attributes.items():
            # The text that gets embedded is the attribute name plus its description
            attribute_embedding = embed_sentence(f"{attribute} {description}")

            # For each attribute our related json file will have these fields
            jsonObject = {
                "table": table,
                "nameAttribute": attribute,
                "descriptionAttribute": description,
                "embeddingAttribute": attribute_embedding.tolist(),
            }

            # Build a file-system friendly name, shortening very long attribute names
            short_name = attribute[:15] if len(attribute) > 20 else attribute
            stem = f"{table}_{short_name}"
            for old, new in ((" ", "_"), ("/", "_"), ("<", ""), (">", ""), ("-", "_")):
                stem = stem.replace(old, new)

            # Truncation and sanitization are lossy: make the name unique again
            if short_name != attribute or stem in used_file_names:
                digest = hashlib.sha1(f"{table}\n{attribute}".encode("utf-8")).hexdigest()[:8]
                stem = f"{stem}_{digest}"
            used_file_names.add(stem)

            full_path = os.path.join(file_path, f"{stem}.json")

            # Save jsonObject to a json file
            with open(full_path, 'w', encoding='utf-8') as json_file:
                json.dump(jsonObject, json_file, ensure_ascii=False, indent=4)
            

In [66]:
# Generate the embedding documents for the attributes of the sources
file_path = "./attributes_documents"
generate_jsonfile_with_embeddings(file_path, table_attributes_description)

ca.pcpartpicker.com
catalog.com
ce.yikus.com
www.best-deal-items.com
www.cleverboxes.com
www.ebay.com
www.getprice.com.au
www.hardware-planet.it
www.imldirect.it
www.itenergy.co.uk
www.jrlinton.co.uk
www.kingsfieldcomputers.co.uk
www.makingbuyingeasy.co.uk
www.shopmania.com
www.mediashopuk.com
www.nexus-t.co.uk
www.pc-canada.com
www.officedepot.com
www.ohc24.ch
www.pcconnection.com
www.planet-computer.it
www.mrhightech.com
www.softwarecity.ca
www.vology.com
www.odsi.co.uk
www.xpcpro.com


In [67]:
# Generate the embedding documents for the attributes of the mediated schema
file_path = "./attributes_target_documents"
generate_jsonfile_with_embeddings(file_path, mediated_schema)

mediated_schema


### (3) RANK TOP j MEDIATED_SCHEMA ATTRIBUTES FOR EACH SOURCE ATTRIBUTE

In [68]:
import numpy as np
from numpy.linalg import norm

In [69]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield NaN and a runtime warning).
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [70]:
def rank_top_j_attributes(attribute_document_json, j, threshold_similarity, mediated_schema_dir):
    """Rank the mediated-schema attributes most similar to a source attribute.

    Parameters
    ----------
    attribute_document_json : dict
        Source attribute document; its ``embeddingAttribute`` field is used.
    j : int
        Maximum number of results to return (values below 0 are treated as 0).
    threshold_similarity : float
        Only targets whose cosine similarity is strictly greater than this
        value are kept.
    mediated_schema_dir : str
        Folder with the ``.json`` documents of the mediated_schema attributes.

    Returns
    -------
    list of tuple
        At most ``j`` tuples ``(name, description, similarity)`` sorted by
        decreasing similarity. Files are visited in sorted name order so that
        ties are always resolved the same way.
    """
    embedding_value = attribute_document_json["embeddingAttribute"]

    similarities = []
    for filename in sorted(os.listdir(mediated_schema_dir)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(mediated_schema_dir, filename)
        # The documents are written as UTF-8, so read them back as UTF-8
        with open(file_path, encoding='utf-8') as file:
            target_attribute_document = json.load(file)

        similarity = cosine_similarity(embedding_value, target_attribute_document["embeddingAttribute"])
        if similarity > threshold_similarity:
            similarities.append((
                similarity,
                target_attribute_document["nameAttribute"],
                target_attribute_document["descriptionAttribute"],
            ))

    similarities.sort(reverse=True, key=lambda x: x[0])
    # max() guards against a negative j, which would slice from the end
    top_j_similarities = similarities[:max(j, 0)]

    return [(name, description, similarity) for similarity, name, description in top_j_similarities]

In [72]:
# For every source attribute document, keep the (at most) 3 mediated-schema
# attributes whose similarity is greater than 0.5
attribute_top_k_similar = defaultdict(dict)

# Sorted so that the tables/attributes are always inserted in the same order
for filename in sorted(os.listdir("./attributes_documents")):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join("./attributes_documents", filename)
    # The documents are written as UTF-8, so read them back as UTF-8
    with open(file_path, encoding='utf-8') as file:
        attribute_document_json = json.load(file)
    table = attribute_document_json["table"]
    name = attribute_document_json["nameAttribute"]
    attribute_top_k_similar[table][name] = rank_top_j_attributes(attribute_document_json, 3, 0.5, "./attributes_target_documents")

In [73]:
attribute_top_k_similar

defaultdict(dict,
            {'ca.pcpartpicker.com': {'aspect ratio': [('supported_aspect_ratio',
                'Supported aspect ratio',
                0.6762522809882845),
               ('screen_size_vertical',
                'Vertical screen size',
                0.5791082953633123),
               ('monitor_dimension',
                'Dimensions of the monitor',
                0.5786722959573334)],
              'brightness': [('screen_brightness',
                'Screen brightness level',
                0.9643941088224501)],
              'builtin speakers': [('has_speakers',
                'Has built-in speakers',
                0.7551089339073914),
               ('speakers_quantity',
                'Number of speakers',
                0.6000077218596388)],
              'component': [],
              'contrast ratio': [('contrast_ratio_static',
                'Static contrast ratio',
                0.7703891972290394)],
              'displayport': [('has_displ

### (4) FINAL SCHEMA-MATCHING WITH LLM

In [71]:
from collections import defaultdict

In [104]:
# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no credential is stored in the notebook (a missing variable raises KeyError).
# NOTE(review): the key that used to be hard-coded here has been exposed and should be revoked.
chat = ChatGroq(temperature=0, groq_api_key=os.environ["GROQ_API_KEY"], model_name="llama3-8b-8192")

In [113]:
def re_match(mediated_schema_dir, attributes_documents_dir, j, threshold_similarity, max_tables=1):
    """Match every source attribute to a mediated_schema field with the LLM.

    For each source attribute document, ``rank_top_j_attributes`` selects the
    (at most) ``j`` most similar mediated_schema attributes; then, table by
    table, the LLM is asked to choose the best match among those candidates.
    After each table the results are checkpointed into
    ``final_schema_matching.json`` in the current working directory.

    Parameters
    ----------
    mediated_schema_dir : str
        Folder with the json documents of the mediated_schema attributes.
    attributes_documents_dir : str
        Folder with the json documents of the sources attributes.
    j : int
        Maximum number of candidates proposed to the LLM for each attribute.
    threshold_similarity : float
        Minimum (exclusive) similarity for a candidate to be proposed.
    max_tables : int or None, optional
        Maximum number of source tables to process. The default of 1 keeps
        the behaviour of the previously hard-coded ``break`` after the first
        table; pass ``None`` to process every table.

    Returns
    -------
    defaultdict
        ``{table: {attribute: answer}}`` where ``answer`` is the LLM response
        (expected to be a mediated_schema field name or 'no matching').
        Attributes without any candidate are set to 'no matching' directly,
        without calling the LLM.
    """
    # Step 1: candidate mediated_schema attributes for every source attribute
    attribute_top_k_similar = defaultdict(dict)
    for filename in sorted(os.listdir(attributes_documents_dir)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(attributes_documents_dir, filename)
        # The documents are written as UTF-8, so read them back as UTF-8
        with open(file_path, encoding='utf-8') as file:
            attribute_document_json = json.load(file)

        table = attribute_document_json["table"]
        name = attribute_document_json["nameAttribute"]
        attribute_top_k_similar[table][name] = rank_top_j_attributes(attribute_document_json, j, threshold_similarity, mediated_schema_dir)

    # Step 2: the prompt does not depend on the attribute, so build the chain once
    template = """You are an assistant helping with field matching. Below is a field and its potential matches in the target table, 
                        along with descriptions and similarity values.

                        Using this information, determine if the field matches any proposed fields. If a match is found, return the name    
                        of the matching field. If no match is found, return 'no matching'.

                        Output should be ONLY the name of the matching field, or 'no matching'.

                        **IMPORTANT**: The output must be ONLY the name of one target field in the Possible Matches list. Do not include any additional text or explanations.

                        Example:
                        Input: particular_brand
                        Output: brand

                        Example:
                        Input: component
                        Output: no matching

                        Do what I said on this field with this possible field matches: Field Name: {attribute}, Possible Matches: {most_similar_attributes}

                
            """
    prompt = ChatPromptTemplate.from_messages([("human", template)])
    chain = prompt | chat

    attribute_matching_result = defaultdict(dict)
    for processed_tables, (table, attributes_dict) in enumerate(attribute_top_k_similar.items()):
        if max_tables is not None and processed_tables >= max_tables:
            break

        print(table)
        for attribute, most_similar_attributes in attributes_dict.items():
            # With no candidate the only valid answer is 'no matching': skip the LLM call
            if not most_similar_attributes:
                attribute_matching_result[table][attribute] = 'no matching'
                continue

            most_similar_attributes_string = str(most_similar_attributes)
            response = chain.invoke({"attribute": attribute, "most_similar_attributes": most_similar_attributes_string})
            attribute_matching_result[table][attribute] = response.content

        # Load the checkpoint written so far; start from scratch on the first run
        if os.path.exists("final_schema_matching.json"):
            with open("final_schema_matching.json", 'r', encoding='utf-8') as file:
                data = json.load(file)
        else:
            data = {}

        data[table] = attribute_matching_result[table]

        # Save the updated dictionary in the JSON file
        with open("final_schema_matching.json", 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)

    return attribute_matching_result


In [114]:
# Run the final schema matching with the same settings used for the ranking step
mediated_schema_dir = "./attributes_target_documents"  # documents of the mediated_schema attributes
attributes_documents_dir = "./attributes_documents"  # documents of the sources attributes
j = 3  # maximum number of candidates proposed to the LLM for each attribute
threshold_similarity = 0.5  # candidates must have a similarity greater than this value
result = re_match(mediated_schema_dir, attributes_documents_dir, j, threshold_similarity)

ca.pcpartpicker.com


In [108]:
result

defaultdict(dict,
            {'ca.pcpartpicker.com': {'aspect ratio': 'aspect_ratio',
              'brightness': 'screen_brightness',
              'builtin speakers': 'speakers',
              'component': 'no matching',
              'contrast ratio': 'contrast_ratio_static',
              'displayport': 'displayport',
              'display colors': 'color',
              'dvi': 'monitor',
              'dvid duallink': 'no matching',
              'hdmi': 'hdmi',
              'ips': 'no matching',
              'led': 'no matching',
              'manufacturer': 'brand',
              'minidisplay port': 'usb_port',
              'model': 'model',
              '<page title>': 'monitor_width',
              'part': 'no matching',
              'recommended resolution': 'recommended resolution',
              'refresh rate': 'refresh',
              'response time': 'response_time',
              'screen size': 'screen_size',
              'svideo': 'no matching',
              '

### Evaluation

In [None]:
###
# true_positive = the matching attribute in the Excel file is the same as the one we obtained
# true_negative = all the attributes that are not in the Excel file and that we labelled 'no matching'
# false_positive = the attribute is not in the Excel file but we matched it
# false_negative = we labelled the attribute 'no matching' but it is present in the Excel file

In [None]:
''' 
Because of the token limit that Groq imposes on the llama3-8b-8192 model,
our strategy (inspired by ReMatch) cannot be run on a dataset this large.
With an API key granting full access to an LLM we could have completed this task.
Our system was nevertheless implemented on a smaller dataset about companies, located in the Companies_dataset directory.
'''

In [25]:
# Merge the descriptions generated for every source into attributes_description.json.
# The file is read and written once instead of once per source, and it is
# created when it does not exist yet. Nothing is touched when there is nothing to merge.
if response_dict:
    # Load the contents of the existing JSON file (if any)
    if os.path.exists("attributes_description.json"):
        with open("attributes_description.json", 'r', encoding='utf-8') as file:
            data = json.load(file)
    else:
        data = {}

    # Each source replaces its previous entry, as a per-key assignment would
    data.update(response_dict)

    # Save the updated dictionary in the JSON file
    with open("attributes_description.json", 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
