In [17]:
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv
import json

import google.generativeai as genai
import os
import time
import sqlite3
import logging
import mysql.connector
from collections import Counter
import math

In [18]:
# Set up the three LLM backends. Credentials are read from environment
# variables, which load_dotenv() populates from a local .env file when present.

load_dotenv()

# GPT: OpenAI client against the default OpenAI endpoint.
api_key_gpt = os.getenv('CHATGPT_API_KEY')
gpt_client = OpenAI(
    api_key=api_key_gpt,
)

# Gemini
genai.configure(api_key=os.environ["GOOGLE_API_KEY_FLASH"])
modelGemini = genai.GenerativeModel('gemini-1.5-flash')

# Llama: OpenAI-compatible client pointed at the llama-api.com endpoint.
api_key_llama = os.getenv("LLAMA_API_KEY")
llama_client = OpenAI(
    api_key=api_key_llama,
    base_url="https://api.llama-api.com"
)

# Previously `client` was assigned twice (GPT first, then Llama), so the GPT
# client was discarded as soon as this cell finished. Each backend now has its
# own name; `client` is kept as an alias of the Llama client, which is what it
# ended up bound to before, so existing code keeps working.
client = llama_client

In [19]:
_gpt_client_cache = None


def _get_gpt_client():
    """Return an OpenAI client for the GPT backend, creating it on first use.

    The module-level `client` is bound to the llama-api.com endpoint by the
    time this function can be called, so GPT requests need their own client
    that targets the default OpenAI endpoint with CHATGPT_API_KEY.
    """
    global _gpt_client_cache
    if _gpt_client_cache is None:
        _gpt_client_cache = OpenAI(api_key=os.getenv('CHATGPT_API_KEY'))
    return _gpt_client_cache


def printResponse(prompt, llm):
    """Send `prompt` to the selected LLM backend and return the reply text.

    Args:
        prompt: The user prompt, sent as a single user message.
        llm: Backend selector: 'gpt', 'gemini' or 'llama'.

    Returns:
        The text content of the model's reply.

    Raises:
        ValueError: If `llm` is not one of the supported backend names
            (previously this case silently returned None).
    """
    if llm == 'gpt':
        chat_completion = _get_gpt_client().chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-3.5-turbo-0125",
        )
        return chat_completion.choices[0].message.content
    if llm == 'gemini':
        # Fixed pause before every Gemini request.
        # NOTE(review): presumably to stay under an API rate limit - confirm.
        time.sleep(3)
        response = modelGemini.generate_content(prompt)
        return response.text
    if llm == 'llama':
        # `client` is the module-level client configured for llama-api.com.
        response = client.chat.completions.create(
            model="llama-13b-chat",
            messages=[
                {"role": "system", "content": "Assistant is a large language model trained by OpenAI."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
    raise ValueError(f"Unsupported llm {llm!r}; expected 'gpt', 'gemini' or 'llama'")

In [20]:

def execute_query(database_path, query):
    """Run `query` against the SQLite database at `database_path`.

    Args:
        database_path: Path of the SQLite database file.
        query: SQL text to execute.

    Returns:
        The list of rows from `fetchall()`, or None when SQLite raises an
        OperationalError. In that case the error is printed and appended to
        './incorrectGeminiLog.txt'.
    """
    conn = None
    try:
        conn = sqlite3.connect(database_path)
        cursor = conn.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    except sqlite3.OperationalError as e:
        print(f"An error occurred: {e}")
        with open('./incorrectGeminiLog.txt', 'a') as f:
            f.write(f"\nError that occured:\n{e}\n\n")
        return None
    finally:
        # Close on every path; previously a failing query leaked the connection.
        if conn is not None:
            conn.close()

def _quote_identifier(name):
    """Return `name` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def get_first_row_with_columns(database_path, table_name):
    """Fetch the column names and the first row of a table.

    Args:
        database_path: Path of the SQLite database file.
        table_name: Name of the table to inspect.

    Returns:
        A `(columns, first_row)` tuple. `columns` is the list of column names
        reported by `PRAGMA table_info`; `first_row` is a tuple, or None when
        the table is empty. Returns `(None, None)` when SQLite raises an
        OperationalError (for example, the table does not exist); the error is
        printed and appended to './incorrectGeminiLog.txt'.
    """
    # Identifiers cannot be bound as SQL parameters, so quote the name instead.
    # Without quoting, tables named after keywords (e.g. "order") or containing
    # spaces made both statements fail.
    quoted_table = _quote_identifier(table_name)
    conn = None
    try:
        conn = sqlite3.connect(database_path)
        cursor = conn.cursor()

        # Column name is the second field of each table_info row.
        cursor.execute(f"PRAGMA table_info({quoted_table})")
        columns = [info[1] for info in cursor.fetchall()]

        cursor.execute(f"SELECT * FROM {quoted_table} LIMIT 1")
        first_row = cursor.fetchone()

        return columns, first_row
    except sqlite3.OperationalError as e:
        print(f"An error occurred: {e}")
        with open('./incorrectGeminiLog.txt', 'a') as f:
            f.write(f"\nError that occurred:\n{e}\n\n")
        return None, None
    finally:
        # Close on every path; previously a failing statement leaked the connection.
        if conn is not None:
            conn.close()

In [21]:

def getDbSchemaMapping(dbFolderPath):
    """Build a per-database mapping of schema JSON and sample table rows.

    Every sub-directory of `dbFolderPath` is treated as one database. The
    directory is scanned for '*.json' files (if several parse successfully,
    the last one listed wins) and, when the JSON has a 'tables' key, each
    listed table is looked up in '<folder>/<folder>.sqlite' via
    get_first_row_with_columns.

    Args:
        dbFolderPath: Directory containing one sub-directory per database.

    Returns:
        Dict mapping folder name to
        `{"schema": <parsed JSON>, "table_info": [(table_name, columns, first_row), ...]}`.
        Folders without a usable JSON file are omitted; folders whose JSON has
        no 'tables' key get an empty `table_info` list.
    """
    count = 0
    schema_array = {}
    for folder in os.listdir(dbFolderPath):
        count += 1
        folder_path = os.path.join(dbFolderPath, folder)
        # Entries returned by listdir always "exist"; what matters is that
        # they are directories. Stray files used to crash the inner listdir.
        if not os.path.isdir(folder_path):
            continue

        json_data = None
        for file_name in os.listdir(folder_path):
            if not file_name.endswith('.json'):
                continue
            json_file_path = os.path.join(folder_path, file_name)
            with open(json_file_path, 'r', encoding='utf-8') as file:
                try:
                    json_data = json.load(file)
                except json.JSONDecodeError as e:
                    # Keep whatever an earlier JSON file produced, if anything.
                    print(f"Error decoding JSON in file {json_file_path}: {e}")

        if not json_data:
            print(f"JSON file not found for folder: {folder_path}")
            continue

        table_info = []
        if 'tables' in json_data:
            db_file_path = os.path.join(folder_path, f"{folder}.sqlite")
            for table in json_data['tables']:
                table_name = table['name']
                columns, first_row = get_first_row_with_columns(db_file_path, table_name)
                table_info.append((table_name, columns, first_row))
        else:
            print(f"'tables' key not found in JSON data for folder: {folder_path}")

        schema_array[folder] = {
            "schema": json_data,
            "table_info": table_info
        }
    # Number of directory entries visited (including skipped non-directories).
    print(count)
    return schema_array

In [22]:
def get_data(tableJsonPath, cleanDataPath):
    """Group the examples of a JSON-lines dataset by database.

    Args:
        tableJsonPath: JSON file holding a list of objects with 'db_id' and
            'table_names_original' entries.
        cleanDataPath: File with one JSON object per line, each providing
            'db_id', 'query', 'question' and 'query_toks'.

    Returns:
        Dict mapping db_id to
        `{'query': [...], 'question': [...], 'query_toks': [...], 'tables': [...]}`
        where the four lists are index-aligned. Each 'tables' entry is the
        list of lower-cased table names of that database that occur among the
        example's query tokens, de-duplicated in order of first appearance.
    """
    with open(tableJsonPath, "r", encoding="utf-8") as f:
        tables_data = json.load(f)

    # Index table names by database once, instead of rescanning the whole
    # table list for every example line.
    tables_by_db = {}
    for item in tables_data:
        known = tables_by_db.setdefault(item['db_id'], set())
        known.update(name.lower() for name in item.get("table_names_original", ()))

    database = {}

    with open(cleanDataPath, 'r', encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            db_id = data.get('db_id')
            query_toks = data.get('query_toks')

            known_tables = tables_by_db.get(db_id, ())
            # dict.fromkeys keeps first-seen order while dropping duplicates.
            table_list = list(dict.fromkeys(
                tok.lower()
                for tok in (query_toks or [])
                if tok.lower() in known_tables
            ))

            entry = database.setdefault(
                db_id, {'query': [], 'question': [], 'query_toks': [], 'tables': []}
            )
            entry['query'].append(data.get('query'))
            entry['question'].append(data.get('question'))
            entry['query_toks'].append(query_toks)
            entry['tables'].append(table_list)

    return database

In [None]:
# Root of the local Spider dataset checkout; every input path below hangs off it.
SPIDER_ROOT = 'F:/OneDrive/Desktop/Study/NLP_ResearchProject/Project/spider'

# Per-database schema JSON plus a sample row for each table.
schema_array = getDbSchemaMapping(f'{SPIDER_ROOT}/database')
# Training examples grouped by database id.
databases = get_data(
    f'{SPIDER_ROOT}/tables.json',
    f'{SPIDER_ROOT}/train_spider_main_data.json',
)

In [24]:
hard_dataset = []
easy_dataset = []

# A database is "hard" when it has six or more tables, otherwise "easy".
for db_name, db_entry in schema_array.items():
    bucket = hard_dataset if len(db_entry['table_info']) >= 6 else easy_dataset
    bucket.append(db_name)



In [25]:
# Initialise the cross-prompt variables and empty both log files.
# DANGER: re-running this cell resets the counters and wipes the logs.
totalQueries = 0
correctAns = 0
notCorrectAns = 0

for file_path in (
    'F:/OneDrive/Desktop/Study/NLP_ResearchProject/Project/incorrectGeminiLogTableExtraction.txt',
    'F:/OneDrive/Desktop/Study/NLP_ResearchProject/Project/geminiLogTableExtraction.txt',
):
    # Opening in 'w' mode without writing truncates (or creates) the file.
    with open(file_path, 'w') as file:
        pass

In [26]:
def _read_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Per-database, per-table inputs consumed by the table-extraction cells below.
database_table_questions = _read_json('database_table_extract_questions.json')
database_table_description = _read_json('database_table_extract.json')

In [27]:
# Lower-case every table name used as a key, database by database. The
# replacement mapping is built first and then applied in place, so the outer
# dict keeps its identity and its set of folders.
database_table_description.update({
    folder: {table.lower(): table_description for table, table_description in tables.items()}
    for folder, tables in database_table_description.items()
})

In [None]:
# List the databases that have more than five described tables.
for folder, tables in database_table_questions.items():
    table_count = len(database_table_description[folder])
    if table_count > 5:
        print(f"Folder is {folder}")
        print(f"Number of Tables are {table_count}")

In [None]:
# Unsupervised table extraction: rank each database's tables by embedding
# similarity to the question and keep the top of the ranking.
# (Original note: chinook_1 college_1)
model_used = 'gemini'
threshold = 0.5

from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# The first five ranked tables are always selected; later ones are subject to
# the score-ratio stopping rule in _select_tables.
ALWAYS_KEPT_TABLES = 5


def _lookup_description(tables_descriptions, table_name):
    """Return the description for `table_name`, or None when there is none.

    Description keys may have been lower-cased while the question mapping
    keeps its original casing, so an exact match is tried first and the
    lower-cased name second. Looking up only the original casing silently
    dropped every table whose name contains upper-case letters.
    """
    if table_name in tables_descriptions:
        return tables_descriptions[table_name]
    return tables_descriptions.get(table_name.lower())


def _score_ratio(prev_score, score):
    """Ratio between two consecutive ranked scores; infinite when `score` is 0."""
    return prev_score / score if score else math.inf


def _select_tables(table_similarities):
    """Pick table names from a {table: score} mapping.

    Tables are ranked by descending score. The first ALWAYS_KEPT_TABLES are
    always kept. The ratio previous_score / score measured at rank
    ALWAYS_KEPT_TABLES is the baseline; each later table is kept only while
    its ratio does not exceed the ratio of the step before it, and selection
    stops at the first table that violates this.
    """
    ranked = sorted(table_similarities.items(), key=lambda item: item[1], reverse=True)
    chosen = []
    prev_score = None
    prev_ratio = None
    for rank, (name, score) in enumerate(ranked, start=1):
        if rank >= ALWAYS_KEPT_TABLES:
            ratio = _score_ratio(prev_score, score)
            if rank > ALWAYS_KEPT_TABLES and ratio > prev_ratio:
                break
            prev_ratio = ratio
        chosen.append(name)
        prev_score = score
    return chosen


db_table_extraction_list = {}
total_num_correct = 0
total_num_incorrect = 0
database_selected_tables = {}

for folder, tables in database_table_questions.items():
    tables_descriptions = database_table_description.get(folder)
    folder_data = databases.get(folder)
    # Folders without descriptions or without any question previously ended
    # the whole run with a KeyError / ZeroDivisionError.
    if tables_descriptions is None or folder_data is None or not folder_data['question']:
        print(f"Skipping {folder}: no table descriptions or no questions available")
        continue

    print(f"Folder: {folder}")
    num_questions = len(folder_data['question'])
    num_correct = 0
    num_incorrect = 0
    tables_and_questions = {}

    # Table descriptions and sample questions do not depend on the question
    # being scored, so embed them once per database rather than once per
    # (question, table) pair.
    table_embeddings = {}
    for table_name, questions in tables.items():
        table_description = _lookup_description(tables_descriptions, table_name)
        if table_description is None:
            continue  # No description for this table: it cannot be scored.
        table_embeddings[table_name.lower()] = (
            embedder.encode(table_description, convert_to_tensor=True),
            [embedder.encode(question2, convert_to_tensor=True) for question2 in questions],
        )

    for i, question in enumerate(folder_data['question']):
        question_embedding = embedder.encode(question, convert_to_tensor=True)
        table_similarities = {}

        for table_key, (descriptor_embedding, sample_embeddings) in table_embeddings.items():
            description_similarity = util.pytorch_cos_sim(question_embedding, descriptor_embedding).item()
            # Best match among the table's sample questions, floored at 0.
            best_question_similarity = 0
            for sample_embedding in sample_embeddings:
                similarity = util.pytorch_cos_sim(question_embedding, sample_embedding)
                best_question_similarity = max(best_question_similarity, similarity.item())
            table_similarities[table_key] = best_question_similarity + description_similarity
        print(f"Question: {question}")
        print(f"Table Similarities: {table_similarities}")

        selected_tables = _select_tables(table_similarities)
        print(f"Selected Tables: {selected_tables}")
        tables_and_questions[i] = selected_tables

        table_list = folder_data['tables'][i]
        query = folder_data['query'][i]
        print(f"Table List: {table_list}")
        print(f"Query: {query}")

        # A question counts as correct when every table used by the reference
        # query is among the selected tables (extra selections are allowed).
        if all(table in selected_tables for table in table_list):
            print("Correct")
            num_correct += 1
            total_num_correct += 1
        else:
            print("Incorrect")
            num_incorrect += 1
            total_num_incorrect += 1
        print("\n")
    database_selected_tables[folder] = tables_and_questions

    percent = (num_correct / num_questions) * 100
    print(f"{folder} has this much accuracy {percent}")
    db_table_extraction_list[folder] = {
        "Accuracy": percent,
        "Correct": num_correct,
        "Incorrect": num_incorrect
    }
    print("\n")

total_answered = total_num_correct + total_num_incorrect
total_percent = (total_num_correct / total_answered) * 100 if total_answered else 0

print(f"Total Percent Accuracy: {total_percent}")

    
    
        
# Select the tables with similarity scores above the average score / 75% quartile
        

In [None]:
# Sanity check: how many questions were loaded for the icfp_1 database.
icfp_questions = databases['icfp_1']['question']
print(len(icfp_questions))

In [None]:
folder_tables = {}
# Report the table-extraction accuracy of every evaluated database.
for folder, accuracy in db_table_extraction_list.items():
    print(f"Folder: {folder}\nAccuracy: {accuracy['Accuracy']}")
    

In [32]:


# Persist the per-question table selections of every database.
selected_tables_json = json.dumps(database_selected_tables, indent=4)
with open('selected_tables.json', 'w') as json_file:
    json_file.write(selected_tables_json)

In [None]:
# Table-level precision / recall, per database and overall. Databases listed
# in easy_dataset are left out.
true_positives = 0
false_positives = 0
false_negatives = 0
overall_metric_values = {}

for folder, val in database_selected_tables.items():
    if folder in easy_dataset:
        continue

    database_true_positives = database_false_positives = database_false_negatives = 0
    for i, tables in val.items():
        predicted = set(tables)
        gold = set(databases[folder]['tables'][i])

        database_true_positives += len(predicted & gold)    # selected and used by the query
        database_false_positives += len(predicted - gold)   # selected but not used
        database_false_negatives += len(gold - predicted)   # used but not selected

    gold_total = database_true_positives + database_false_negatives
    predicted_total = database_true_positives + database_false_positives
    # Both ratios fall back to 0 when their denominator is empty.
    database_recall = database_true_positives / gold_total if gold_total > 0 else 0
    database_precision = database_true_positives / predicted_total if predicted_total > 0 else 0

    overall_metric_values[folder] = {
        "recall": database_recall,
        "precision": database_precision,
        "accuracy": db_table_extraction_list[folder]["Accuracy"],
    }

    true_positives += database_true_positives
    false_negatives += database_false_negatives
    false_positives += database_false_positives


# Overall recall and precision across the databases that were not skipped.
gold_total = true_positives + false_negatives
predicted_total = true_positives + false_positives
recall = true_positives / gold_total if gold_total > 0 else 0
precision = true_positives / predicted_total if predicted_total > 0 else 0

print(f"Recall: {recall * 100}")
print(f"Precision: {precision * 100}")

In [34]:
# Save the per-database recall / precision / accuracy of this strategy.
strategy_metrics_json = json.dumps(overall_metric_values, indent=4)
with open("./strategy_3.json", "w") as f:
    f.write(strategy_metrics_json)

In [None]:

# Print the stored metrics of each database as percentages.
for folder, values in overall_metric_values.items():
    print(f"Folder: {folder}")
    raw_precision = values['precision']
    if isinstance(raw_precision, list):
        # A list of precision values is reported as its mean percentage.
        precision = sum(value * 100 for value in raw_precision) / len(raw_precision)
    else:
        precision = raw_precision * 100
    print(f"Precision: {precision}%")
    print(f"Recall: {values['recall'] * 100}%")
    print(f"Accuracy: {values['accuracy']}%")
    print("\n")


In [36]:
# Write the same per-database metrics to the final results file.
final_metrics_json = json.dumps(overall_metric_values, indent=4)
with open("final_metric_values.json", "w") as json_file:
    json_file.write(final_metrics_json)

In [37]:
def _tally_split(per_db_stats):
    """Sum the 'Correct' / 'Incorrect' counts of a split and derive its accuracy.

    Returns:
        `(correct, incorrect, percent)`. `percent` is 0 when the split holds
        no answered questions, instead of raising ZeroDivisionError.
    """
    correct = sum(stats['Correct'] for stats in per_db_stats.values())
    incorrect = sum(stats['Incorrect'] for stats in per_db_stats.values())
    answered = correct + incorrect
    split_percent = (correct / answered) * 100 if answered else 0
    return correct, incorrect, split_percent


# Partition the per-database results: databases listed in hard_dataset are
# "hard", every other evaluated database is "easy".
hard_table_extraction_percent_accuracy = {}
easy_table_extraction_percent_accuracy = {}

for folder, accuracy in db_table_extraction_list.items():
    if folder in hard_dataset:
        hard_table_extraction_percent_accuracy[folder] = accuracy
    else:
        easy_table_extraction_percent_accuracy[folder] = accuracy

with open("hard_table_extraction_percent_accuracy.json", "w") as f:
    json.dump(hard_table_extraction_percent_accuracy, f, indent=4)
with open("easy_table_extraction_percent_accuracy.json", "w") as f:
    json.dump(easy_table_extraction_percent_accuracy, f, indent=4)

hard_num_correct, hard_num_incorrect, hard_percent = _tally_split(
    hard_table_extraction_percent_accuracy
)
easy_num_correct, easy_num_incorrect, easy_percent = _tally_split(
    easy_table_extraction_percent_accuracy
)

hard_size = len(hard_table_extraction_percent_accuracy)
easy_size = len(easy_table_extraction_percent_accuracy)
# Question-level accuracy summary; `total_percent` comes from the evaluation run.
percent = {
    "Hard": hard_percent,
    "Easy": easy_percent,
    "total": total_percent,
    "Hard Size": hard_size,
    "Easy Size": easy_size,
    "hard_num_correct": hard_num_correct,
    "hard_num_incorrect": hard_num_incorrect,
    "easy_num_correct": easy_num_correct,
    "easy_num_incorrect": easy_num_incorrect
}

with open("table_extraction_percent_question.json", "w") as f:
    json.dump(percent, f, indent=4)