In [2]:
# Function to read queries from a text file
def read_queries_from_file(file_path):
    with open(file_path, 'r') as file:
        queries = file.readlines()
    return [query.strip() for query in queries]  # Remove any extra spaces/newlines

# Load queries from the text files
acl_queries = read_queries_from_file('acl_queries.txt')
databricks_queries = read_queries_from_file('databricks_queries.txt')

# Print the first few queries to verify
print("ACL Queries:")
print(acl_queries[:5])

print("Databricks Queries:")
print(databricks_queries[:5])


ACL Queries:
['SET DATE "YYYY-MM-DD"', 'ASSIGN V_IM_START_TIME = DATETIME(CTODT(DATETIME(),"YYYY-MM-DD HH:MM:SS"))', '', '', 'ACCESSDATA64 CONNECTOR NAME "SQL Server" USER "%V_CCM_USER%" PASSWORD 1 TO "%V_INT%MASTER2.FIL" CHARMAX 50 MEMOMAX 100']
Databricks Queries:
['SET DATE "YYYY-MM-DD"', 'ASSIGN V_IM_START_TIME = DATETIME(CTODT(DATETIME(),"YYYY-MM-DD HH:MM:SS"))', '', '', 'COMMENT--OBJ0001--Databricks connection through exceptions as follow']


In [3]:
import re

# Function to preprocess queries (remove special characters, convert to lowercase)
def preprocess_query(query):
    query = query.lower()  # Convert to lowercase
    query = re.sub(r'[^a-zA-Z0-9\s]', '', query)  # Remove special characters
    return query

# Preprocess both ACL and Databricks queries
acl_queries_cleaned = [preprocess_query(query) for query in acl_queries]
databricks_queries_cleaned = [preprocess_query(query) for query in databricks_queries]

# Print the cleaned queries to verify
print("Cleaned ACL Queries:")
print(acl_queries_cleaned[:5])

print("Cleaned Databricks Queries:")
print(databricks_queries_cleaned[:5])


Cleaned ACL Queries:
['set date yyyymmdd', 'assign vimstarttime  datetimectodtdatetimeyyyymmdd hhmmss', '', '', 'accessdata64 connector name sql server user vccmuser password 1 to vintmaster2fil charmax 50 memomax 100']
Cleaned Databricks Queries:
['set date yyyymmdd', 'assign vimstarttime  datetimectodtdatetimeyyyymmdd hhmmss', '', '', 'commentobj0001databricks connection through exceptions as follow']


In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate BERT embeddings for a query
def generate_embedding(query):
    inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # Mean pooling for sentence embedding

# Generate embeddings for both ACL and Databricks queries
acl_embeddings = [generate_embedding(query) for query in acl_queries_cleaned]
databricks_embeddings = [generate_embedding(query) for query in databricks_queries_cleaned]

# Print embeddings for verification
print("Generated ACL Embeddings:")
print(acl_embeddings[:1])  # Print first embedding to verify

print("Generated Databricks Embeddings:")
print(databricks_embeddings[:1])  # Print first embedding to verify


Generated ACL Embeddings:
[array([-4.44574468e-02,  3.21494997e-01,  5.56582510e-01, -3.82406525e-02,
        6.21535182e-01,  1.35313630e-01,  3.10807496e-01,  2.64955044e-01,
        2.25264400e-01, -2.18643829e-01, -6.57071918e-02, -1.22886457e-01,
        1.92013532e-01,  5.49592137e-01, -2.93860197e-01,  3.69930267e-01,
       -1.59301609e-01,  3.31122205e-02, -3.62049878e-01,  1.36520416e-01,
        2.55369186e-01, -2.52296865e-01,  1.06925711e-01, -2.63041735e-01,
       -1.47457179e-02,  4.05996740e-02,  2.61020929e-01,  3.07770193e-01,
       -1.59958795e-01, -1.55036554e-01, -1.11600515e-02, -5.17390311e-01,
       -6.94468543e-02, -2.34705284e-01, -3.45626175e-01,  2.99639195e-01,
       -3.58992040e-01,  2.14597613e-01, -1.22384623e-01,  1.08887456e-01,
        1.31415650e-01, -3.96625966e-01, -1.91993132e-01,  3.15431178e-01,
       -5.66011429e-01, -2.74609953e-01, -2.73577690e-01,  2.90551968e-02,
        2.15916708e-01, -3.82215142e-01, -1.37623832e-01,  4.16664556e-02

In [14]:
import chromadb

# Initialize ChromDB client
client = chromadb.Client()

# Check if the collection exists, otherwise fetch it
try:
    collection = client.get_collection("query_mappings")
    print("Collection found.")
except chromadb.errors.CollectionNotFoundError:
    print("Collection not found. Creating a new collection.")
    collection = client.create_collection("query_mappings")

# Add matched query pairs to the collection
for idx, match_idx in enumerate(best_matches):
    acl_query = acl_queries[idx]
    databricks_query = databricks_queries[match_idx]
    collection.add(
        documents=[acl_query],
        metadatas=[{"databricks_query": databricks_query}],
        ids=[f"acl_to_databricks_{idx}"]
    )

print("Successfully inserted new embeddings into the collection.")


Add of existing embedding ID: acl_to_databricks_0
Insert of existing embedding ID: acl_to_databricks_0


Collection found.


Add of existing embedding ID: acl_to_databricks_1
Insert of existing embedding ID: acl_to_databricks_1
Add of existing embedding ID: acl_to_databricks_2
Insert of existing embedding ID: acl_to_databricks_2
Add of existing embedding ID: acl_to_databricks_3
Insert of existing embedding ID: acl_to_databricks_3
Add of existing embedding ID: acl_to_databricks_4
Insert of existing embedding ID: acl_to_databricks_4
Add of existing embedding ID: acl_to_databricks_5
Insert of existing embedding ID: acl_to_databricks_5
Add of existing embedding ID: acl_to_databricks_6
Insert of existing embedding ID: acl_to_databricks_6
Add of existing embedding ID: acl_to_databricks_7
Insert of existing embedding ID: acl_to_databricks_7
Add of existing embedding ID: acl_to_databricks_8
Insert of existing embedding ID: acl_to_databricks_8
Add of existing embedding ID: acl_to_databricks_9
Insert of existing embedding ID: acl_to_databricks_9
Add of existing embedding ID: acl_to_databricks_10
Insert of existing emb

Successfully inserted new embeddings into the collection.


In [22]:
import ollama

# Function to read the ACL query from a text file
def read_acl_query_from_file(file_path):
    with open(file_path, 'r') as file:
        return file.read().strip()  # Read the query and remove any leading/trailing whitespace

# Define the file path for your ACL query text file
acl_query_file_path = "/home/aravinds/directory_env/acl_queries.txt"

# Read the ACL query from the file
query_to_convert = read_acl_query_from_file(acl_query_file_path)

# Query ChromDB to get relevant mappings based on the ACL query
results = collection.query(query_texts=[query_to_convert], n_results=5)

# Extract documents and metadatas
documents = results['documents'][0]  # List of ACL queries
metadatas = results['metadatas'][0]  # List of Databricks queries

# Prepare context from the stored mappings
context = "\n".join([f"ACL: {doc}\nDatabricks SQL: {meta['databricks_query']}" for doc, meta in zip(documents, metadatas)])

# Clear and direct prompt for Ollama model
prompt = f"""
You are an expert in SQL conversions. Your task is to convert ACL queries into equivalent Databricks SQL queries. 

Here are some examples of previous conversions:
{context}

Now, convert the following ACL query to Databricks SQL:

ACL Query:
{query_to_convert}

Only provide the Databricks SQL query as output. Do not include any explanations or internal thoughts.
"""

# Use Ollama model to generate a Databricks SQL query
response = ollama.chat(model="deepseek-r1", messages=[{"role": "user", "content": prompt}],options={"temperature": 0.1})

# Extract and print the generated Databricks SQL query
generated_query = response['message']['content']
print("Generated Databricks SQL Query:\n", generated_query)


Generated Databricks SQL Query:
 <think>
Alright, I need to convert an ACL query into a Databricks SQL query. Let me start by understanding both the source and target formats.

First, looking at the provided examples helps. The user has shown several conversions where they've translated ACL syntax into equivalent Databricks SQL statements. This gives me a clear idea of how certain constructs are mapped.

The given ACL query is quite extensive. It includes setting dates, assigning variables, defining access data, and then performing a complex join operation with multiple tables. There's also a WHERE clause that filters based on specific date ranges and customer types.

I notice that in the examples, the main differences between ACL and Databricks SQL are:

1. **Set Date Statements**: In ACL, `SET DATE` is used to set system dates, whereas in Databricks, you can use functions like `DATETIME()` or `CTODT()`. The example shows how these are translated.

2. **Variable Assignments**: The `AS

In [None]:
/home/aravinds/directory_env/acl_queries.txt


Complex dataset generated with 500+ examples!
