In [9]:
import os
import pandas as pd
import json

In [17]:
def read_tables_from_folder(folder):
    data = {}
    file_list = os.listdir(folder)
    
    for file in file_list:
        if file.endswith(".csv"):
            table_name = os.path.splitext(file)[0]
            file_path = os.path.join(folder, file)
            
            # Read the CSV file using pandas
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                print(f"Error: File '{file}' is empty.")
                continue
            
            # Extract attributes and 5 non-NaN values for each attribute
            attributes = {}
            for column in df.columns:
                # Skip NaN values and take the first 5 non-NaN values
                non_nan_values = df[column].dropna().head(3).tolist()
                attributes[column] = non_nan_values
            
            # Add to the main dictionary
            data[table_name] = attributes
    
    return data

In [18]:
# Example usage
folder_with_csv = "./sources"
result = read_tables_from_folder(folder_with_csv)

# Print the result to examine
for table, attributes in result.items():
    print(f"Table: {table}")
    for attribute, values in attributes.items():
        print(f"  Attribute: {attribute}")
        print(f"    First 5 values: {values}")

Table: avengers-hitHorizons
  Attribute: id
    First 5 values: ['d60823f20d864692b517a8ad6c1418ed', 'c7856532ca8b4dd286a9602c77221035', 'e8e04ae3e4594e3b836e62077155b19a']
  Attribute: name
    First 5 values: ['Enel Spa', 'Esselunga Spa', 'Superit Srl']
  Attribute: address
    First 5 values: ['VIALE REGINA MARGHERITA 137, ROMA, 00198, ROMA, ITALY', 'VIA GIAMBOLOGNA 1, PIOLTELLO, 20096, MILANO, ITALY', 'VIA VITTOR PISANI 20, MILANO, 20124, MILANO, ITALY']
  Attribute: nation
    First 5 values: ['IT00934061003', 'IT04916380159', 'IT11068950960']
  Attribute: hhid
    First 5 values: ['H-IT0031426662', 'H-IT0068388950', 'H-IT0289899385']
  Attribute: industry
    First 5 values: ['Finance, Insurance, and Real Estate', 'Retail Trade', 'Finance, Insurance, and Real Estate']
  Attribute: sic_code
    First 5 values: ['Holding company (6719)', 'Ret mail-order house (5961)', 'Holding company (6719)']
  Attribute: type
    First 5 values: ['Corporation', 'Corporation', 'Corporation']
  Att

In [19]:
result

{'avengers-hitHorizons': {'id': ['d60823f20d864692b517a8ad6c1418ed',
   'c7856532ca8b4dd286a9602c77221035',
   'e8e04ae3e4594e3b836e62077155b19a'],
  'name': ['Enel Spa', 'Esselunga Spa', 'Superit Srl'],
  'address': ['VIALE REGINA MARGHERITA 137, ROMA, 00198, ROMA, ITALY',
   'VIA GIAMBOLOGNA 1, PIOLTELLO, 20096, MILANO, ITALY',
   'VIA VITTOR PISANI 20, MILANO, 20124, MILANO, ITALY'],
  'nation': ['IT00934061003', 'IT04916380159', 'IT11068950960'],
  'hhid': ['H-IT0031426662', 'H-IT0068388950', 'H-IT0289899385'],
  'industry': ['Finance, Insurance, and Real Estate',
   'Retail Trade',
   'Finance, Insurance, and Real Estate'],
  'sic_code': ['Holding company (6719)',
   'Ret mail-order house (5961)',
   'Holding company (6719)'],
  'type': ['Corporation', 'Corporation', 'Corporation'],
  'est_of_ownership': [1962, 1957, 2019]},
 'DeBiGa-globaldata': {'name': ['US Ecology Inc',
   'Mercury Systems Inc',
   'Xiwang Foodstuffs Co Ltd'],
  'headquarters': ['United States of America',
   

In [20]:
def save_attributes_to_json(attributes, output_file):
    # Scrive il dizionario degli attributi in un file JSON
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(attributes, file, ensure_ascii=False, indent=4)


In [22]:
save_attributes_to_json(result, "attributes_per_source.json")

## Indexing Phase for attributes matching with LLM 

In [23]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from groq import Groq

In [28]:
chat = ChatGroq(temperature=0, groq_api_key="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", model_name="llama3-8b-8192")

In [27]:
str(result["DeBiGa-globaldata"])

"{'name': ['US Ecology Inc', 'Mercury Systems Inc', 'Xiwang Foodstuffs Co Ltd'], 'headquarters': ['United States of America', 'United States of America', 'China'], 'number_of_employees': ['3,600', '2,386', '1,892'], 'address': ['101 S Capitol Blvd Ste 1000, Boise, Idaho, 83702', '50 Minuteman Drive, Andover, Massachusetts, 01810', 'Xiwang Industrial Park, Zouping County, Binzhou, Shandong , 256209'], 'industry': ['Environmental and Waste Management Services', 'Aerospace and Defense', 'Consumer Packaged Goods'], 'website': ['www.usecology.com', 'www.mrcy.com', 'www.xwsp.cc'], 'market_cap': ['$2.7B', '$787.4M', '$2.7B'], 'telephone': ['1 208 3318400', '1 978 2561300', '86 543 4868888'], 'revenue': ['$988.0M', '$988.2M', '$985.2M']}"

In [29]:
response_dict = {}
for source, attributes in result.items():
    attributes_string = str(attributes)
    # Template 
    template = """You are an assistant who must help me to analyse the fields of a table.
        Here are the fields of the table each with 3 example values: {table}
        Based only on the values of each field, you have to give us a description in natural language of up to 7 words for each field. 
        All I want to output is a list of tuples Python like this:
        [(field title1,description1),
        (field title2,description2)]. 
        You have to return only the list of tuples without any messages.
        If one field is empty, not return the description for this field"""
    
    # Crea il prompt a partire dal template
    prompt = ChatPromptTemplate.from_messages([("human", template)])
    chain = prompt | chat
    response = chain.invoke({"table": attributes_string})
    response_dict[source] = response.content

In [30]:
response_dict

{'avengers-hitHorizons': "I'd be happy to help you with that. After analyzing the fields, here is the list of tuples with descriptions:\n\n[('id', 'Unique identifier for each record'), \n('name', 'Company name'), \n('address', 'Physical location of the company'), \n('nation', 'Country code'), \n('hhid', 'Unique identifier for household'), \n('industry', 'Type of business or sector'), \n('sic_code', 'Standard Industrial Classification code'), \n('type', 'Type of business entity'), \n('est_of_ownership', 'Year of establishment')]",
 'DeBiGa-globaldata': "I'd be happy to help you with that. Here is the list of tuples with descriptions for each field:\n\n[('name', 'Company names listed'), \n('headquarters', 'Countries of origin listed'), \n('number_of_employees', 'Employee counts listed'), \n('address', 'Company addresses listed'), \n('industry', 'Industry categories listed'), \n('website', 'Company websites listed'), \n('market_cap', 'Market capitalization values listed'), \n('telephone',

In [32]:
save_attributes_to_json(response_dict, "attributes_description.json")