In [1]:
!pip install cbsodata
import cbsodata
import pandas as pd
import torch
from transformers import pipeline, BartTokenizer
from sentence_transformers import SentenceTransformer, util

Collecting cbsodata
  Downloading cbsodata-1.3.5-py3-none-any.whl.metadata (7.8 kB)
Downloading cbsodata-1.3.5-py3-none-any.whl (12 kB)
Installing collected packages: cbsodata
Successfully installed cbsodata-1.3.5


In [2]:
# Run on the GPU when CUDA reports one available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Summarization pipeline built from the facebook/bart-large-cnn checkpoint,
# placed on the selected device
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=device,
)

# Sentence-embedding model, moved onto the same device as the summarizer
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embedding_model = embedding_model.to(device)

# Tokenizer of the summarization checkpoint; summarize_long_description uses it
# to count tokens and to cut long texts into chunks
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Function to summarize a long description by chunking it
def summarize_long_description(description, max_length=1024, chunk_size=500,
                               summary_max_length=200, summary_min_length=50):
    """Summarize a description, splitting it into chunks when it is too long.

    Args:
        description: Text to summarize. ``None``, empty and whitespace-only
            values return ``""`` without touching the tokenizer or the model.
        max_length: Largest token count (as returned by ``tokenizer.encode``)
            that is summarized in a single call; longer texts are chunked.
        chunk_size: Number of tokens per chunk for texts above ``max_length``.
        summary_max_length: ``max_length`` forwarded to the summarizer.
        summary_min_length: ``min_length`` forwarded to the summarizer. A text
            (or chunk) with no more tokens than this is returned unchanged.

    Returns:
        The summary as a string. For chunked input, the per-chunk results are
        joined with single spaces.

    Raises:
        ValueError: If chunking is required and ``chunk_size`` is not positive.
    """
    # Nothing to summarize: avoid asking the model to generate text from nothing.
    if not description or not description.strip():
        return ""

    def summarize_or_keep(text, token_count):
        # The summarizer is called with min_length=summary_min_length, i.e. it is
        # asked for at least that many tokens. A text that is already that short
        # cannot be condensed, so it is kept verbatim instead of being padded
        # out by the model.
        if token_count <= summary_min_length:
            return text.strip()
        result = summarizer(
            text,
            max_length=summary_max_length,
            min_length=summary_min_length,
            do_sample=False,
        )
        return result[0]['summary_text']

    tokens = tokenizer.encode(description, truncation=False)
    if len(tokens) <= max_length:
        # If it's small enough, just summarize directly
        return summarize_or_keep(description, len(tokens))

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Otherwise, split the token sequence into fixed-size chunks
    summaries = []
    for start in range(0, len(tokens), chunk_size):
        chunk = tokens[start:start + chunk_size]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        piece = summarize_or_keep(chunk_text, len(chunk))
        # A chunk can decode to nothing (special tokens are skipped); dropping
        # it avoids stray separators in the joined result.
        if piece:
            summaries.append(piece)

    # Combine all chunk summaries into one
    return " ".join(summaries)

# Function to extract only the English table descriptions
def extract_english_descriptions(tables):
    """Collect title, short description and identifier of the English tables.

    Args:
        tables: Iterable of dict-like table records. The keys read are
            ``Language``, ``Title``, ``ShortDescription`` and ``Identifier``.

    Returns:
        A list of dicts with the keys ``table_title``, ``table_description``
        and ``Identifier``, one per record whose ``Language`` ends with
        ``'en'``. A missing or ``None`` title becomes ``'Unknown Table Title'``;
        a missing or ``None`` short description becomes ``''``.
    """
    descriptions = []
    for table in tables:
        # A key that is present but set to None must not crash .endswith/.strip
        language = table.get('Language') or ''
        # Process only English tables
        if not language.endswith('en'):
            continue

        table_title = table.get('Title', 'Unknown Table Title')
        if table_title is None:
            table_title = 'Unknown Table Title'
        table_description = table.get('ShortDescription') or ''

        descriptions.append({
            "table_title": table_title.strip(),
            "table_description": table_description.strip(),
            "Identifier": table.get('Identifier')
        })
    return descriptions

# Helper: pull the usable column titles out of one table's DataProperties metadata
def _extract_column_titles(metadata):
    """Return the stripped, non-empty string values of ``metadata['Title']``.

    Args:
        metadata: DataFrame built from the ``DataProperties`` metadata of a table.

    Returns:
        List of titles in row order. Values that are not strings (``None`` or
        NaN for rows without a title) and blank titles are left out. An empty
        list is returned when there is no ``Title`` column at all.
    """
    if 'Title' not in metadata.columns:
        return []
    stripped = (title.strip() for title in metadata['Title'] if isinstance(title, str))
    return [title for title in stripped if title]


# Function to process and summarize table and column descriptions separately
def process_table_and_column_descriptions(descriptions):
    """Embed each table's summarized description and its column titles.

    Args:
        descriptions: Iterable of dicts with the keys ``Identifier``,
            ``table_title`` and ``table_description``.

    Returns:
        A tuple ``(table_embeddings, column_embeddings, table_to_column_mapping)``:
        - ``table_embeddings``: list of ``{"table_id", "embedding"}`` dicts, the
          embedding being that of ``"<title>: <summarized description>"``.
        - ``column_embeddings``: list of ``{"table_id", "column_embeddings"}``
          dicts, only for tables that have at least one usable column title.
        - ``table_to_column_mapping``: dict from table id to its column titles,
          in the same order as the rows of ``column_embeddings`` for that table.

    Any exception raised while handling a table is caught and reported with
    ``print``, and processing continues with the next table.
    """
    table_embeddings = []
    column_embeddings = []
    table_to_column_mapping = {}

    for desc in descriptions:
        table_id = desc['Identifier']
        table_title = desc['table_title']
        table_description = desc['table_description']
        try:
            # Summarize table description
            summarized_table_desc = summarize_long_description(table_description)
            combined_table_text = f"{table_title}: {summarized_table_desc}"  # Title + Summary

            # Retrieve column metadata before storing anything, so a table whose
            # metadata cannot be fetched contributes no embeddings at all
            metadata = pd.DataFrame(cbsodata.get_meta(table_id, name='DataProperties'))

            # Embed table description (title + description)
            table_embedding = embedding_model.encode(combined_table_text, convert_to_tensor=True)
            table_embeddings.append({"table_id": table_id, "embedding": table_embedding})

            # Embed column titles; rows without a usable title are skipped
            column_titles = _extract_column_titles(metadata)
            if column_titles:
                column_embedding = embedding_model.encode(column_titles, convert_to_tensor=True)
                column_embeddings.append({"table_id": table_id, "column_embeddings": column_embedding})
                table_to_column_mapping[table_id] = column_titles

        except Exception as e:
            # Deliberately best-effort: one failing table must not abort the run.
            # If the failure happens while embedding the columns, the table
            # embedding appended above is kept.
            print(f"Skipping table {table_id} due to error: {e}")

    return table_embeddings, column_embeddings, table_to_column_mapping

# Lift pandas' display limits so long text, all rows and all columns are shown
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# DATA EXPLORATION: fetch the table list and keep only the English entries
tables = cbsodata.get_table_list()
descriptions = extract_english_descriptions(tables)

# Summarize the table descriptions and embed table texts and column titles
processing_result = process_table_and_column_descriptions(descriptions)
table_embeddings, column_embeddings, table_to_column_mapping = processing_result

# Example output: only the first three entries of each result, for brevity
print("Example Table Embeddings:")
first_table_embeddings = table_embeddings[:3]
for table in first_table_embeddings:
    print(f"Table ID: {table['table_id']}")
    print(f"Table Embedding: {table['embedding']}")

print("\nExample Column Embeddings:")
first_column_embeddings = column_embeddings[:3]
for column in first_column_embeddings:
    print(f"Table ID: {column['table_id']}")
    print(f"Column Embeddings: {column['column_embeddings']}")

print("\nExample Table to Column Mapping:")
first_mappings = list(table_to_column_mapping.items())[:3]
for table_id, columns in first_mappings:
    print(f"Table ID: {table_id}")
    print(f"Columns: {columns}")


In [3]:
# Function to extract only the English tables summary
def extract_english_summaries(tables):
    """Collect title, summary and identifier of the English tables.

    Args:
        tables: Iterable of dict-like table records. The keys read are
            ``Language``, ``Title``, ``Summary`` and ``Identifier``.

    Returns:
        A list of dicts with the keys ``table_title``, ``table_summary`` and
        ``Identifier``, one per record whose ``Language`` ends with ``'en'``.
        Missing or ``None`` titles and summaries become ``''``.
    """
    summaries = []
    for table in tables:
        # A key that is present but set to None must not crash .endswith/.strip
        language = table.get('Language') or ''
        # Process only English tables
        if not language.endswith('en'):
            continue

        # Summary of the table + its identifier
        table_title = table.get('Title') or ''
        table_summary = table.get('Summary') or ''
        summaries.append({
            "table_title": table_title.strip(),
            "table_summary": table_summary.strip(),
            "Identifier": table.get('Identifier')
        })
    return summaries

def generate_summary_embeddings(summaries):
    """Embed the ``"<title>: <summary>"`` text of every table record.

    Args:
        summaries: Iterable of dicts with the keys ``Identifier``,
            ``table_title`` and ``table_summary``.

    Returns:
        List of ``{"table_id", "embedding"}`` dicts, in input order. A record
        whose embedding raises is reported with ``print`` and left out.
    """
    embedded = []
    for entry in summaries:
        identifier = entry['Identifier']
        title = entry['table_title']
        summary_text = entry['table_summary']
        try:
            # Title + Summary form the text that gets embedded
            text_to_embed = f"{title}: {summary_text}"
            vector = embedding_model.encode(text_to_embed, convert_to_tensor=True)
        except Exception as e:
            print(f"Skipping table {identifier} due to error: {e}")
            continue
        embedded.append({"table_id": identifier, "embedding": vector})

    return embedded

# Set display options for more text
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# `embedding_model` is created in the model-setup cell, together with the
# `device` this cell would need anyway, so it is reused here instead of being
# loaded and moved to the device a second time.

# Get a list of all the tables (fetched here as well so this cell does not
# depend on the exploration cell having been run)
tables = cbsodata.get_table_list()

# Extract English table summaries
summaries = extract_english_summaries(tables)

# Generate summaries embeddings
summaries_embeddings = generate_summary_embeddings(summaries)


In [4]:
# Colab-specific import, kept next to its only use: mount Google Drive into the
# runtime's filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Save to Google Drive
# Each object is serialized with torch.save into the "My Drive" folder of the mount.
# `summaries_embeddings` is assigned from generate_summary_embeddings(); the other
# three names are assigned from process_table_and_column_descriptions(), so the
# cell making that call must have run in this session or these lines raise NameError.
# NOTE(review): the embeddings are presumably saved on the device they were computed
# on; loading them on a CPU-only machine may need torch.load(..., map_location="cpu")
# - confirm.
torch.save(summaries_embeddings, '/content/drive/My Drive/summaries_embeddings.pt')
torch.save(table_embeddings, '/content/drive/My Drive/table_embeddings.pt')
torch.save(column_embeddings, '/content/drive/My Drive/column_embeddings.pt')
torch.save(table_to_column_mapping, '/content/drive/My Drive/table_to_column_mapping.pt')
print("Embeddings and mappings saved to Google Drive!")


Mounted at /content/drive
Embeddings and mappings saved to Google Drive!
