In [None]:
import ast
import os
import pickle
import re

import numpy as np
import pandas as pd
from dotenv import dotenv_values
from evaluate import load
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.chat_models import ChatOpenAI
from tqdm import tqdm

In [None]:
# Load the .env file that holds the API key. The path is relative, so it is
# resolved against the directory the notebook kernel was started in.
config = dotenv_values("../.env")
# Export the key as an environment variable; a missing OPENAI_API_KEY entry in
# the .env file raises KeyError here.
# (presumably the ChatOpenAI clients created further down read it from there)
os.environ['OPENAI_API_KEY'] = config["OPENAI_API_KEY"]

In [None]:
# Maps every label of the annotation vocabulary to a human-readable phrase.
# NOTE(review): the first entry is spelled "I dont'know" (key and value), while
# text_to_label uses the key "i don't know" -- confirm which spelling is intended.
# NOTE(review): this dictionary is not referenced by any other cell visible in
# this notebook.
labels_to_text = {
    "I dont'know": "I dont'know",
    "addressLocality": "locality of address",
    "postalCode": "postal code",
    "addressRegion": "region of address",
    "Country": "country",
    "priceRange": "price range",
    "Hotel/name": "name of hotel",
    "telephone": "telephone",
    "faxNumber": "fax number",
    "Date": "date",
    "Restaurant/name": "name of restaurant",
    "paymentAccepted": "payment accepted",
    "DayOfWeek": "day of week",
    "Review": "review",
    "Organization": "organization",
    "DateTime": "date and time",
    "MusicAlbum/name": "name of music album",
    "MusicArtistAT": "music artist",
    "MusicRecording/name": "name of music recording",
    "Photograph": "photograph",
    "CoordinateAT": "coordinate",
    "Event/name": "name of event",
    "EventAttendanceModeEnumeration": "event attendance mode",
    "EventStatusType": "event status",
    "currency": "currency",
    "email": "email",
    "Time": "time",
    "LocationFeatureSpecification": "location feature",
    "Duration": "duration",
    "Event/description": "description of event",
    "Restaurant/description": "description of restaurant",
    "Rating": "rating",
    "Hotel/description": "description of hotel"
}

In [None]:
# Dictionary to map ChatGPT answers to label set: synonyms can be added here.
# Keys are written in lower case because the evaluation functions lower-case
# every parsed answer before looking it up / scoring it against these keys.
# The value "-" stands for "no label" (used for "i don't know").
text_to_label = {
    "locality of address": "addressLocality",
    "postal code": "postalCode",
    "region of address": "addressRegion",
    "country": "Country",
    "price range": "priceRange",
    "name of hotel": "Hotel/name",
    "telephone": "telephone",
    "fax number": "faxNumber",
    "date": "Date",
    "name of restaurant": "Restaurant/name",
    "payment accepted": "paymentAccepted",
    "day of week": "DayOfWeek",
    "review": "Review",
    "organization": "Organization",
    "date and time": "DateTime",
    "music artist": "MusicArtistAT",
    "music album": "MusicAlbum/name",
    "name of music recording": "MusicRecording/name",
    "photograph": "Photograph",
    "coordinate": "CoordinateAT",
    "name of event": "Event/name",
    "event attendance mode": "EventAttendanceModeEnumeration",
    "event status": "EventStatusType",
    "currency": "currency",
    "email": "email",
    "time": "Time",
    "location feature": "LocationFeatureSpecification",
    "duration": "Duration",
    "description of event": "Event/description",
    "description of restaurant": "Restaurant/description",
    "description of hotel": "Hotel/description",
    "rating": "Rating",
    # Added: extra phrasings mapped onto the same label set
    # (presumably variants observed in model answers -- TODO confirm)
    "description of restaurants": "Restaurant/description",
    "name of music artist": "MusicArtistAT",
    "description of hotel amenities": "LocationFeatureSpecification",
    "amenities": "LocationFeatureSpecification",
    "name of album": "MusicAlbum/name",
    "i don't know": "-",
    "name of music album": "MusicAlbum/name",
    "music recording": "MusicRecording/name",
    "event name": "Event/name",
    "description of hotels": "Hotel/description",
    "name of hotels": "Hotel/name",
    "duration of music recording or video": "Duration",
    "name of organization": "Organization",
    "hotel amenities": "LocationFeatureSpecification",
    "amenities of hotel room": "LocationFeatureSpecification",
    "check-in time": "Time",
    "check-out time": "Time",
    "time of check-in": "Time",
    "time of check-out": "Time",
    "hotel features": "LocationFeatureSpecification",
    "name of aparthotel": "Hotel/name",
    "event description": "Event/description",
    "email address": "email",
    "room amenities": "LocationFeatureSpecification",
    "end date": "Date",
    "descriptions of events": "Event/description",
    "mode of attendance": "EventAttendanceModeEnumeration",
    "name of song": "MusicRecording/name"
}

## Load test set

In [None]:
# Load the pickled test set (one entry per table).
# NOTE: pickle.load executes arbitrary code from the file -- only load files
# from a trusted source.
with open('data/cta-test-table-wise.pkl', "rb") as f:
    test = pickle.load(f)

# Each entry is indexed positionally: [1] is fed to the prompt as the table,
# [2] is iterated as the per-column labels, [3] is collected as `topics`
# (presumably the table's topic -- TODO confirm against the dataset builder).
examples = [example[1] for example in test]
# Flattened across tables: one gold label per column, in table order.
labels = [l for example in test for l in example[2]]
topics = [example[3] for example in test]

In [None]:
# Number of tables, followed by the total number of labelled columns.
print(len(examples))
print(len(labels))

## Choose prompt template: without or with instructions

In [None]:
# Paper name: table
# Closed-set prompt: the table is substituted for {input} and the model is asked
# to choose one class per column from the comma-separated list in the task text.
original_template = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Classify the columns of a given table with only one of the following classes that are separated with comma: description of event, description of restaurant, locality of address, postal code, region of address, country, price range, telephone, date, name of restaurant, payment accepted, day of week, review, organization, date and time, coordinate, name of event, event attendance mode, event status, currency, time, description of hotel, name of hotel, location feature, rating, fax number, email, photograph, name of music recording, music artist, name of album, duration.

Table: {input}

Class:

"""

# Paper name: table + instructions
# Same task and class list as original_template, plus step-by-step instructions
# and an explicit "Column1: class" answer format.
# NOTE(review): the prompt-selection cell below has no branch that uses this template.
original_inst_template = """

Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Classify the columns of a given table with only one of the following classes that are separated with comma: description of event, description of restaurant, locality of address, postal code, region of address, country, price range, telephone, date, name of restaurant, payment accepted, day of week, review, organization, date and time, coordinate, name of event, event attendance mode, event status, currency, time, description of hotel, name of hotel, location feature, rating, fax number, email, photograph, name of music recording, music artist, name of album, duration.

Instructions: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, select a class that best represents the meaning of all cells in the column. 4. Answer with the selected class for each columns with the format Column1: class.

Table:
{input}

Class:

"""

In [None]:
# Modified original
# original_template with one extra sentence appended to the task that fixes the
# answer format to "Column1: semantic concept".
modified_original_template = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Classify the columns of a given table with only one of the following classes that are separated with comma: description of event, description of restaurant, locality of address, postal code, region of address, country, price range, telephone, date, name of restaurant, payment accepted, day of week, review, organization, date and time, coordinate, name of event, event attendance mode, event status, currency, time, description of hotel, name of hotel, location feature, rating, fax number, email, photograph, name of music recording, music artist, name of album, duration. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table: {input}

Class:

"""

In [None]:
# New prompt
# Open-vocabulary variant: no class list is given; the model is asked to suggest
# a free-text semantic concept per column in the format "Column1: semantic concept".
semantic_concept_template = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest a semantic concept for each column of a given table. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table: {input}

Semantic concepts:

"""

In [None]:
# Paper name: table + instructions
# Open-vocabulary variant with step-by-step instructions; selected by
# prompt_type == "with_inst" in the prompt-selection cell.
inst_template = """

Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest a semantic concept for each column of a given table.

Instructions: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, suggest a semantic concept that best represents the meaning of all cells in the column. 4. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table:
{input}

Semantic concepts:

"""

In [None]:
# Prompt that asks GPT to map a single free-text semantic concept ({input})
# onto one class of the fixed class list.
classify_label_template = """

Task: Classify the semantic concept {input} with only one of the following classes that are separated with comma: description of event, description of restaurant, locality of address, postal code, region of address, country, price range, telephone, date, name of restaurant, payment accepted, day of week, review, organization, date and time, coordinate, name of event, event attendance mode, event status, currency, time, description of hotel, name of hotel, location feature, rating, fax number, email, photograph, name of music recording, music artist, name of album, duration.

Class:

"""

In [None]:
# Debugging prompt. NOTE(review): its text is identical to semantic_concept_template.
debug_template = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest a semantic concept for each column of a given table. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table: {input}

Semantic concepts:

"""

# Variant that asks for five candidate concepts per column, comma-separated
# after "Column1:".
debug_template_top_5 = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest 5 possible semantic concept for each column of a given table. Answer with the semantic concept for each column with the format Column1: possible semantic concept 1, possible semantic concept 2, possible semantic concept 3, possible semantic concept 4, possible semantic concept 5. 

Table: {input}

Semantic concepts:

"""

# Single yes/no style probe about music albums; takes only {input}.
debug_template_album = """

Could this column be about music albums?

Table: {input}

"""

# Second-pass prompt: takes the earlier answer ({prev_output}) and the table
# ({input}) and asks the model to critique / improve the concepts.
check_template = """

Critique whether these semantic concepts matches their respective columns in the given table and improve on them. If there is no further improvements to be made, just say 'It's good'.

Semantic concepts: {prev_output}

Table: {input}

Answer with the semantic concept for each column with the format Column1: semantic concept. 

Semantic concepts:

"""

# Second-pass prompt meant to follow a multi-candidate answer: asks the model to
# choose the best concept per column from {prev_output}.
check_template_best_of_5 = """

Critique whether these semantic concepts matches their respective columns in the given table and improve on them. Choose the best semantic concept for each column.

Semantic concepts: {prev_output}

Table: {input}

Answer with the semantic concept for each column with the format Column1: semantic concept. 

Semantic concepts:

"""

## Load LLM and run model

In [None]:
# Two chat-model clients pinned to dated model snapshots; temperature=0 is
# passed to both clients.
gpt_3_turbo = ChatOpenAI(model_name='gpt-3.5-turbo-0301', temperature=0)
gpt_4 = ChatOpenAI(model_name='gpt-4-0613', temperature=0)

In [None]:
# Name of the prompt to run; must be one of the keys of PROMPT_TEMPLATES.
prompt_type = "modified_original_template"

# Every selectable prompt_type and the template string it stands for.
# A lookup table replaces the former if/elif chain, whose branches differed
# only in the template they passed to PromptTemplate.
PROMPT_TEMPLATES = {
    "original_template": original_template,
    "modified_original_template": modified_original_template,
    "semantic_concept": semantic_concept_template,
    "with_inst": inst_template,
    "debug_template": debug_template,
    "debug_template_top_5": debug_template_top_5,
}

# Fail right here on a typo instead of leaving `llm_chain` undefined (or stale
# from an earlier run) and failing later in the prediction cells.
if prompt_type not in PROMPT_TEMPLATES:
    raise ValueError(
        f"Unknown prompt_type {prompt_type!r}; expected one of {sorted(PROMPT_TEMPLATES)}"
    )

# One shared prompt, wrapped once per model.
prompt = PromptTemplate(template=PROMPT_TEMPLATES[prompt_type], input_variables=['input'])
llm_chain = LLMChain(prompt=prompt, llm=gpt_3_turbo)
llm_chain_4 = LLMChain(prompt=prompt, llm=gpt_4)

In [None]:
# Second-pass ("critique and improve") chain on GPT-3.5. It needs two inputs:
# the table ('input') and the first-pass answer ('prev_output').
improve_prompt = PromptTemplate(template=check_template_best_of_5, input_variables=['input', 'prev_output'])
llm_chain_improve = LLMChain(prompt=improve_prompt, llm=gpt_3_turbo)

In [None]:
def convert_to_column_major(example: str) -> str:
    """Re-serialise a row-wise table string so that each output line is one column.

    ``example`` is split into lines on "\\n" and each line into cells on "||".
    The first line supplies the column names. Every output line has the form
    ``"<name>: <cell>, <cell>, "`` (each cell is followed by ", ").

    The last column produced by the split is always left out of the result.
    NOTE(review): presumably every input line ends with "||", which makes the
    last split element empty -- confirm; without a trailing "||" a real column
    is dropped.

    Raises:
        IndexError: if a data line has more cells than the first line.
    """
    header, *rows = example.split("\n")
    # One list of string fragments per column, starting with "<name>: ".
    columns = [[name + ": "] for name in header.split("||")]
    for row in rows:
        for position, cell in enumerate(row.split("||")):
            columns[position].append(cell + ", ")
    return "\n".join("".join(fragments) for fragments in columns[:-1])

# Quick visual check of the column-major rendering on one table (index 40).
print(convert_to_column_major(examples[40]))

In [None]:
def save_preds(preds: list[str], file_name: str):
    """Pickle ``preds`` to ``file_name``, overwriting any existing file."""
    with open(file_name, 'wb') as out_file:
        pickle.dump(preds, out_file)

def load_preds(file_name: str):
    """Read back and return the object pickled in ``file_name``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(file_name, 'rb') as in_file:
        return pickle.load(in_file)

In [None]:
# Zero-shot prediction with the GPT-3.5 chain: one request per table, the table
# text is passed to the prompt unchanged.
# NOTE(review): the following cell assigns preds_gpt35 again (column-major
# input), so whichever of the two cells ran last decides what gets evaluated.
preds_gpt35 = [llm_chain.run({'input': example}) for example in examples]
save_preds(preds_gpt35, "predictions/gpt35-prompt-table-without-instructions-og-modified.pkl")

In [None]:
# Zero-shot prediction with the GPT-3.5 chain, feeding each table in the
# column-major rendering produced by convert_to_column_major.
# NOTE(review): this rebinds preds_gpt35, replacing the row-wise predictions
# from the previous cell in memory (the earlier pickle file is unaffected).
preds_gpt35 = [llm_chain.run({'input': convert_to_column_major(example)}) for example in examples]
save_preds(preds_gpt35, "predictions/gpt35-prompt-table-without-instructions-col-major.pkl")

In [None]:
# Same prompt as above, run through the GPT-4 chain.
# NOTE(review): the file name says "with-instructions", but prompt_type is set
# to "modified_original_template" in the selection cell -- confirm the name.
preds_gpt4 = [llm_chain_4.run({'input': example}) for example in examples]
save_preds(preds_gpt4, "predictions/gpt4-prompt-table-with-instructions.pkl")

## Evaluation

In [None]:
# Load the BERTScore metric through `evaluate.load`; evaluation_new uses it to
# match free-text answers against the keys of text_to_label.
bertscore = load("bertscore")
preds = preds_gpt35 # OR load from one of the pickle files previously saved

In [None]:
def _split_table_prediction(table_preds, table_number):
    """Cut one raw model answer into at most ``table_number`` per-column chunks.

    If the answer contains ":" it is split on ":" and the text before the first
    ":" is skipped; otherwise, if it contains "-", the same is done with "-";
    otherwise it is treated as a plain comma-separated list.
    """
    if ":" in table_preds:
        separator, start = ":", 1
    elif "-" in table_preds:
        separator, start = "-", 1
    else:
        separator, start = ",", 0
    return table_preds.split(separator)[start:start + table_number]


def _clean_column_prediction(pred):
    """Keep the text before the first line break / comma / "(" / "." and lower-case it."""
    for delimiter in ("\n", ",", "(", "."):
        if delimiter in pred:
            pred = pred.split(delimiter)[0].strip()
    return pred.strip().lower()


def evaluation_new(preds, bert_threshold=0.85):
    """Map raw model answers to labels by BERTScore similarity to known phrases.

    For every table answer in ``preds`` the text is split into per-column
    answers, each answer is cleaned, and its BERTScore F1 against every key of
    ``text_to_label`` is computed. The label of the best-scoring key is used
    when its score is strictly greater than ``bert_threshold``; otherwise the
    column is predicted as "-". Tables for which fewer answers than columns
    were returned are padded with "-" placeholders.

    Reads the notebook-level names ``test`` (column count per table, taken from
    ``test[j][2]``), ``text_to_label`` and ``bertscore``.

    Args:
        preds: one raw answer string per table, aligned with ``test``.
        bert_threshold: minimum (exclusive) F1 needed to accept the best match.

    Returns:
        Six lists with one entry per column, in this order: table index, mapped
        label, (header-stripped) table answer, cleaned column answer, the top-5
        F1 scores (``0`` for padded columns), and the top-5 labels in ascending
        score order (``[]`` for padded columns).
    """
    ids, predictions, original_preds, parsed_preds = [], [], [], []
    top_5_predictions, highest_bertscores = [], []
    # Loop-invariant: the candidate phrases every answer is scored against.
    classes = list(text_to_label.keys())
    i = 0
    for j, table_preds in enumerate(tqdm(preds)):
        # How many columns does the table have? : To control cases when less/more classes are returned
        table_number = len(test[j][2])

        # Drop an echoed prompt tail. The previous code tested for
        # "Semantic concepts:" but split on "Class:", which raised IndexError
        # whenever only the former was present; each marker is now split on itself.
        for marker in ("Semantic concepts:", "Class:"):
            if marker in table_preds:
                table_preds = table_preds.split(marker)[1]

        col_preds = _split_table_prediction(table_preds, table_number)
        for raw_pred in col_preds:
            i += 1
            pred = _clean_column_prediction(raw_pred)
            parsed_preds.append(pred)
            original_preds.append(table_preds)
            ids.append(j)

            bertscores = np.array(
                bertscore.compute(predictions=[pred] * len(classes), references=classes, lang="en")["f1"]
            )
            # Indices of the five best matches, ascending; the last one is the best.
            top_indices = np.argsort(bertscores)[-5:]
            best_index = top_indices[-1]
            top_5_predictions.append([text_to_label[classes[k]] for k in top_indices])
            highest_bertscores.append(bertscores[top_indices])

            if bertscores[best_index] > bert_threshold:
                predictions.append(text_to_label[classes[best_index]])
            else:
                print(f"For test example {i} out of label space prediction: {pred}")
                predictions.append('-')

        # Fewer answers than columns: pad so every column still gets one row.
        for _ in range(table_number - len(col_preds)):
            original_preds.append(table_preds)
            ids.append(j)
            predictions.append("-")
            parsed_preds.append("-")
            top_5_predictions.append([])
            highest_bertscores.append(0)
            i += 1
    return ids, predictions, original_preds, parsed_preds, highest_bertscores, top_5_predictions

# Run the BERTScore-based mapping on `preds`; every returned list has one entry per column.
ids, class_predictions, original_preds, parsed_preds, highest_bertscores, top_5_preds = evaluation_new(preds)

In [None]:
# Sanity check: the four lengths are expected to be identical (one entry per column).
len(ids), len(class_predictions), len(original_preds), len(highest_bertscores)

In [None]:
# One row per table column: gold label next to the raw, parsed and mapped prediction.
# The list/array-valued columns (highest_bertscore, top_5_preds) are written to
# the CSV as their string representation.
# NOTE(review): this writes "..._with_inst_og_modified.csv", whereas the metrics
# section below reads "..._without_inst_og_modified.csv" -- confirm which file
# name is intended.
df = pd.DataFrame({"prompt_output_id": ids, 
                   "label": labels, 
                   "original_pred": original_preds, 
                   "parsed_col_pred": parsed_preds, 
                   "class_pred_using_bert": class_predictions, 
                   "highest_bertscore": highest_bertscores,
                   "top_5_preds": top_5_preds})
df.to_csv("predictions/preds_gpt35_with_inst_og_modified.csv")

In [None]:
# Exact-match mapping of predictions onto the label space
def evaluation_old(preds):
    """Map raw model answers to labels by exact lookup in ``text_to_label``.

    Each table answer is split into per-column answers, every answer is cut at
    the first line break / comma / "(" / "." and lower-cased, then looked up in
    the notebook-level ``text_to_label``. Unknown answers, and columns for
    which no answer was returned, are predicted as '-'. The number of columns
    of table ``j`` is read from the notebook-level ``test[j][2]``.

    Returns:
        A flat list with one label (or '-') per column, in table order.
    """
    mapped = []
    seen = 0  # running 1-based column counter, only used in the log message
    for table_idx, answer in enumerate(preds):
        # Expected number of columns; caps the number of answers taken below.
        n_columns = len(test[table_idx][2])

        # Drop an echoed "Class:" prompt tail.
        if "Class:" in answer:
            answer = answer.split("Class:")[1]

        # "Column1: x" / "Column1 - x" style answers: skip the text before the
        # first separator. Otherwise treat the answer as a comma-separated list.
        if ":" in answer:
            delimiter, first = ":", 1
        elif "-" in answer:
            delimiter, first = "-", 1
        else:
            delimiter, first = ",", 0
        col_preds = answer.split(delimiter)[first:first + n_columns]

        for candidate in col_preds:
            seen += 1

            # Keep only the text before the first line break, comma,
            # parenthesis and full stop (checked in that order).
            for cut_at in ("\n", ",", "(", "."):
                if cut_at in candidate:
                    candidate = candidate.split(cut_at)[0].strip()
            candidate = candidate.strip().lower()

            if candidate not in text_to_label:
                print(f"For test example {seen} out of label space prediction: {candidate}")
                mapped.append('-')
                continue
            mapped.append(text_to_label[candidate])

        # Fewer answers than columns: pad with '-'.
        missing = n_columns - len(col_preds)
        if missing > 0:
            mapped.extend(['-'] * missing)
            seen += missing
    return mapped

# Exact-match mapping of the same raw predictions, kept alongside the BERTScore-based result.
old_preds = evaluation_old(preds)

In [None]:
# Could potentially use GPT to match open world label to classification label?
# Builds one chain per model around classify_label_template and tries the
# GPT-3.5 chain on a single example concept ("state"). This sends a request to
# the API each time the cell runs.
prompt_classify = PromptTemplate(template=classify_label_template, input_variables=['input'])
llm_chain_c = LLMChain(prompt=prompt_classify, llm=gpt_3_turbo)
llm_chain_4_c = LLMChain(prompt=prompt_classify, llm=gpt_4)
llm_chain_c.run({'input': "state"})

### Calculate Precision, Recall, Macro-F1 and Micro-F1

In [None]:
# Reload a saved prediction table; list-valued columns come back as strings.
# NOTE: this rebinds the notebook-level `labels` and `preds` (lists earlier in
# the notebook) to pandas Series taken from the CSV.
df = pd.read_csv("predictions/preds_gpt35_without_inst_og_modified.csv",index_col=0)
labels, preds = df["label"], df["class_pred_using_bert"]
df.head()

In [None]:
# Number of columns, number of exact hits of the BERTScore-based mapping, and
# number of columns whose gold label is among the five closest classes.
print(len(df))
print(len(df[df["label"] == df["class_pred_using_bert"]]))


def _as_label_list(top_5):
    """Return a ``top_5_preds`` cell as a real list of labels.

    After ``pd.read_csv`` the cell is the *string* form of a list (for example
    "['DateTime', 'Time']"). Using ``in`` directly on that string is a
    substring test, so the label "Date" would wrongly count as a hit for
    "DateTime". The string is therefore parsed back into a list first.
    Cells that are already lists/tuples are passed through; anything else
    (e.g. NaN for an empty cell) counts as "no candidates".
    """
    if isinstance(top_5, str):
        return list(ast.literal_eval(top_5))
    if isinstance(top_5, (list, tuple)):
        return list(top_5)
    return []


df['label_in_top_5'] = df[['label','top_5_preds']].apply(
    lambda row: row['label'] in _as_label_list(row['top_5_preds']), axis=1
)
print(len(df[df["label_in_top_5"] == True]))

In [None]:
def calculate_f1_scores(y_tests, y_preds, num_classes):
    """Compute micro/macro F1, precision and recall from gold and predicted labels.

    The class vocabulary is every distinct value of ``y_tests`` (in order of
    first appearance) followed by the placeholder "-" that marks "no label
    predicted". It is derived from the ``y_tests`` argument itself rather than
    from a notebook-level ``labels`` variable, so the function only depends on
    its parameters. The last class (the placeholder when ``num_classes``
    matches the vocabulary size) is excluded from all aggregate scores.

    Args:
        y_tests: gold labels (list or pandas Series).
        y_preds: predicted labels, aligned with ``y_tests``; every value must
            be one of the gold labels or "-".
        num_classes: side length of the confusion matrix; expected to be the
            number of distinct gold labels plus one.

    Returns:
        ``(evaluation, per_class_eval)``: ``evaluation`` holds "Micro-F1",
        "Macro-F1", "Precision" and "Recall" (the latter two macro-averaged);
        ``per_class_eval`` maps each gold label to its "Precision", "Recall"
        and "F1". A score whose denominator is zero is reported as 0.

    Raises:
        ValueError: if the inputs differ in length or ``y_preds`` contains a
            value that is neither a gold label nor "-".
        IndexError: if ``num_classes`` is smaller than the vocabulary.
    """
    def _ratio(numerator, denominator):
        # Undefined scores (0/0) are reported as 0 instead of NaN + RuntimeWarning.
        return float(numerator / denominator) if denominator else 0.0

    y_tests = list(y_tests)
    y_preds = list(y_preds)
    if len(y_tests) != len(y_preds):
        raise ValueError(
            f"y_tests and y_preds differ in length: {len(y_tests)} != {len(y_preds)}"
        )

    types = list(dict.fromkeys(y_tests)) + ["-"]
    gold_ids = [types.index(y) for y in y_tests]
    pred_ids = [types.index(y) for y in y_preds]

    # Confusion matrix: rows are predicted classes, columns are gold classes.
    cm = np.zeros(shape=(num_classes, num_classes))
    for pred_id, gold_id in zip(pred_ids, gold_ids):
        cm[pred_id][gold_id] += 1

    tp = np.diag(cm)
    fp = cm.sum(axis=1) - tp  # predicted as the class, but gold is another class
    fn = cm.sum(axis=0) - tp  # gold is the class, but another class was predicted

    class_p = [_ratio(tp[c], tp[c] + fp[c]) for c in range(num_classes)]
    class_r = [_ratio(tp[c], tp[c] + fn[c]) for c in range(num_classes)]
    class_f1s = [_ratio(2 * p * r, p + r) for p, r in zip(class_p, class_r)]

    # Aggregates leave out the last class.
    scored = num_classes - 1
    all_tp = tp[:-1].sum()
    all_fp = fp[:-1].sum()
    all_fn = fn[:-1].sum()

    macro_f1 = _ratio(sum(class_f1s[:-1]), scored)
    p = _ratio(sum(class_p[:-1]), scored)
    r = _ratio(sum(class_r[:-1]), scored)
    micro_f1 = _ratio(all_tp, all_tp + (1/2 * (all_fp + all_fn)))

    per_class_eval = {}
    for index, t in enumerate(types[:-1]):
        per_class_eval[t] = {"Precision": class_p[index], "Recall": class_r[index], "F1": class_f1s[index]}

    evaluation = {
        "Micro-F1": micro_f1,
        "Macro-F1": macro_f1,
        "Precision": p,
        "Recall": r
    }

    return evaluation, per_class_eval

In [None]:
# 33 is the confusion-matrix size passed as num_classes.
# NOTE(review): presumably 32 dataset labels plus the "-" placeholder -- confirm
# that every label actually occurs in `labels`.
evaluation, per_class_eval = calculate_f1_scores(labels, preds, 33)
print(evaluation)
print(per_class_eval)

In [None]:
# Cross-check with scikit-learn's micro-averaged precision / recall / F-score.
# (Import kept in this cell as in the original; the notebook's other imports
# live in the first cell.)
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(labels, preds, average="micro")

## Error Analysis

In [None]:
# Load the prediction table used for error analysis; the cells below read a
# "lionel_annot" column from it.
# NOTE(review): presumably a manually annotated copy of the predictions -- confirm.
# This rebinds `df`, replacing the frame loaded in the metrics section.
df = pd.read_csv("predictions/preds_gpt35_without_inst.csv",index_col=0)
df.head()

In [None]:
# Rows where the "lionel_annot" column disagrees with the gold label.
# .copy() makes error_df an independent frame, so adding columns below neither
# touches df nor triggers pandas' SettingWithCopyWarning.
error_df = df[df["lionel_annot"] != df["label"]].copy()
# Attach the source table and the full list of that table's column labels,
# looked up by table index. Series.map also works when error_df is empty.
error_df["table"] = error_df["prompt_output_id"].map(lambda table_id: examples[table_id])
error_df["all_labels"] = error_df["prompt_output_id"].map(lambda table_id: test[table_id][2])

In [None]:
# Renumber the rows 0..n-1 (the old index is discarded) so the loop below can
# address them by position.
error_df = error_df.reset_index(drop=True)
error_df.head()

In [None]:
# Print every disagreement with its table, the table's full label list, the
# gold label, the parsed model answer and the annotation. Relies on error_df
# having a 0..n-1 index.
divider = "=" * 10
for idx in range(len(error_df)):
    print(divider)
    print(f"Table {error_df.at[idx, 'prompt_output_id']}")
    print(error_df.at[idx, "table"])
    print(error_df.at[idx, "all_labels"])

    print(f"Ground truth: {error_df.at[idx, 'label']}")
    print(f"Raw output: {error_df.at[idx, 'parsed_col_pred']}")
    print(f"Annot: {error_df.at[idx, 'lionel_annot']}")