In [7]:
import os
import re
import pickle
import numpy as np
from dotenv import dotenv_values
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.chat_models import ChatOpenAI
from tqdm import tqdm
from evaluate import load
import pandas as pd

In [8]:
# Load env file with API KEY using full path
config = dotenv_values(".env")
os.environ['OPENAI_API_KEY'] = config["OPENAI_API_KEY"]

In [None]:
topic_to_label = {
    "Music": ["Recording", "MusicRecordingName", "Duration", "ArtistName", "AlbumName"],
    "Restaurants" : ["RestaurantName", "PriceRange", "AddressRegion", "Country", "Telephone", 
                     "PaymentAccepted", "PostalCode",
                     "Coordinate", "DayOfWeek", "Time",
                     "RestaurantDescription", "Review"],
    "Hotels" : ["HotelName", "PriceRange", "Telephone",
                "FaxNumber", "Country", "Time",
                "PostalCode", "AddressLocality", "email",
                "LocationFeatureSpecification",
                "HotelDescription", "Review", "Rating",
                "PaymentAccepted", "Photograph"],
    "Events" : ["EventName", "Date", "DateTime",
                "EventStatusType", "EventDescription",
                "EventAttendanceModeEnumeration",
                "Organization", "Currency", "Telephone"]
}

In [9]:
labels_to_text = {
    "I dont'know": "I dont'know",
    "addressLocality": "locality of address",
    "postalCode": "postal code",
    "addressRegion": "region of address",
    "Country": "country",
    "priceRange": "price range",
    "Hotel/name": "name of hotel",
    "telephone": "telephone",
    "faxNumber": "fax number",
    "Date": "date",
    "Restaurant/name": "name of restaurant",
    "paymentAccepted": "payment accepted",
    "DayOfWeek": "day of week",
    "Review": "review",
    "Organization": "organization",
    "DateTime": "date and time",
    "MusicAlbum/name": "name of music album",
    "MusicArtistAT": "music artist",
    "MusicRecording/name": "name of music recording",
    "Photograph": "photograph",
    "CoordinateAT": "coordinate",
    "Event/name": "name of event",
    "EventAttendanceModeEnumeration": "event attendance mode",
    "EventStatusType": "event status",
    "currency": "currency",
    "email": "email",
    "Time": "time",
    "LocationFeatureSpecification": "location feature",
    "Duration": "duration",
    "Event/description": "description of event",
    "Restaurant/description": "description of restaurant",
    "Rating": "rating",
    "Hotel/description": "description of hotel"
}

In [10]:
# Dictionary to map ChatGPT answers to label set: synonyms can be added here
text_to_label = {
    "locality of address": "addressLocality",
    "postal code": "postalCode",
    "region of address": "addressRegion",
    "country": "Country",
    "price range": "priceRange",
    "name of hotel": "Hotel/name",
    "telephone": "telephone",
    "fax number": "faxNumber",
    "date": "Date",
    "name of restaurant": "Restaurant/name",
    "payment accepted": "paymentAccepted",
    "day of week": "DayOfWeek",
    "review": "Review",
    "organization": "Organization",
    "date and time": "DateTime",
    "music artist": "MusicArtistAT",
    "music album": "MusicAlbum/name",
    "name of music recording": "MusicRecording/name",
    "photograph": "Photograph",
    "coordinate": "CoordinateAT",
    "name of event": "Event/name",
    "event attendance mode": "EventAttendanceModeEnumeration",
    "event status": "EventStatusType",
    "currency": "currency",
    "email": "email",
    "time": "Time",
    "location feature": "LocationFeatureSpecification",
    "duration": "Duration",
    "description of event": "Event/description",
    "description of restaurant": "Restaurant/description",
    "description of hotel": "Hotel/description",
    "rating": "Rating",
    #Added
    "description of restaurants": "Restaurant/description",
    "name of music artist": "MusicArtistAT",
    "description of hotel amenities": "LocationFeatureSpecification",
    "amenities": "LocationFeatureSpecification",
    "name of album": "MusicAlbum/name",
    "i don't know": "-",
    "name of music album": "MusicAlbum/name",
    "music recording": "MusicRecording/name",
    "event name": "Event/name",
    "description of hotels": "Hotel/description",
    "name of hotels": "Hotel/name",
    "duration of music recording or video": "Duration",
    "name of organization": "Organization",
    "hotel amenities": "LocationFeatureSpecification",
    "amenities of hotel room": "LocationFeatureSpecification",
    "check-in time": "Time",
    "check-out time": "Time",
    "time of check-in": "Time",
    "time of check-out": "Time",
    "hotel features": "LocationFeatureSpecification",
    "name of aparthotel": "Hotel/name",
    "event description": "Event/description",
    "email address": "email",
    "room amenities": "LocationFeatureSpecification",
    "end date": "Date",
    "descriptions of events": "Event/description",
    "mode of attendance": "EventAttendanceModeEnumeration",
    "name of song": "MusicRecording/name"
}

## Load test set

In [11]:
with open('data/cta-test-table-wise.pkl', "rb") as f:
    test = pickle.load(f)

examples = [example[1] for example in test ]
labels = [l for example in test for l in example[2]]

In [12]:
print(len(examples))
print(len(labels))

41
250


In [41]:
topics = [example[3] for example in test]
print(topics)

['Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Restaurant', 'Event', 'Event', 'Event', 'Event', 'Event', 'Event', 'Event', 'Event', 'Event', 'Event', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'Hotel', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording', 'MusicRecording']


## Choose prompt template: without or with instructions

In [13]:
# New prompt
semantic_concept_template = """

Answer the question based on the task below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest a semantic concept for each column of a given table. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table: {input}

Semantic concepts:

"""

In [14]:
# Paper name: table + instructions
inst_template = """

Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "I don't know".

Task: Suggest a semantic concept for each column of a given table.

Instructions: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, suggest a semantic concept that best represents the meaning of all cells in the column. 4. Answer with the semantic concept for each column with the format Column1: semantic concept.

Table:
{input}

Semantic concepts:

"""

In [15]:
# prompt to ask gpt to classify labels 
classify_label_template = """

Task: Classify the semantic concept {input} with only one of the following classes that are separated with comma: description of event, description of restaurant, locality of address, postal code, region of address, country, price range, telephone, date, name of restaurant, payment accepted, day of week, review, organization, date and time, coordinate, name of event, event attendance mode, event status, currency, time, description of hotel, name of hotel, location feature, rating, fax number, email, photograph, name of music recording, music artist, name of album, duration.

Class:

"""

## Load LLM and run model

In [5]:
gpt_3_turbo = ChatOpenAI(model_name='gpt-3.5-turbo-0301', temperature=0)
gpt_4 = ChatOpenAI(model_name='gpt-4', temperature=0)

In [82]:
prompt_type = "semantic_concept"
if prompt_type == "semantic_concept":
    prompt = PromptTemplate(template=semantic_concept_template, input_variables=['input'])
    llm_chain = LLMChain(prompt=prompt, llm=gpt_3_turbo)
    llm_chain_4 = LLMChain(prompt=prompt, llm=gpt_4)
elif prompt_type == "with_inst":
    prompt = PromptTemplate(template=inst_template, input_variables=['input'])
    llm_chain = LLMChain(prompt=prompt, llm=gpt_3_turbo)
    llm_chain_4 = LLMChain(prompt=prompt, llm=gpt_4)

In [18]:
def save_preds(preds: list[str], file_name: str):
    #Save predictions in a file:
    with open(file_name,'wb') as f:
        pickle.dump(preds,f)

def load_preds(file_name: str):
    #Save predictions in a file:
    with open(file_name,'rb') as f:
        preds = pickle.load(f)
    return preds

In [83]:
#Zero-shot prediction
preds_gpt35 = [llm_chain.run({'input': example}) for example in examples]
save_preds(preds_gpt35, "predictions/gpt35-prompt-table-with-instructions.pkl")

In [84]:
preds_gpt4 = [llm_chain_4.run({'input': example}) for example in examples]
save_preds(preds_gpt4, "predictions/gpt4-prompt-table-with-instructions.pkl")

In [45]:
preds = load_preds("predictions/gpt4-prompt-table-with-instructions.pkl")
print(preds)

['Column 1: Phone Number, Column 2: Restaurant Name, Column 3: Zip Code, Column 4: State, Column 5: Country, Column 6: Longitude, Column 7: Latitude, Column 8: Opening Hours, Column 9: Days of Operation', 'Column 1: Phone Number, Column 2: Restaurant Name, Column 3: Restaurant Description, Column 4: Country Code, Column 5: Operating Days, Column 6: Opening Time', 'Column 1: Restaurant Name, Column 2: Postal Code, Column 3: Payment Methods, Column 4: State, Column 5: City', 'Column 1: Restaurant Name, Column 2: Contact Number, Column 3: Description, Column 4: City, Column 5: State, Column 6: Payment Methods, Column 7: Zip Code, Column 8: Longitude, Column 9: Latitude, Column 10: Opening Hours, Column 11: Days of Operation', 'Column 1: Restaurant Name, Column 2: Price Range, Column 3: Contact Number', 'Column 1: Restaurant Name, Column 2: Restaurant Phone Number, Column 3: Price Range, Column 4: Country Code, Column 5: Region, Column 6: Postal Code, Column 7: Longitude, Column 8: Latitud

In [26]:
df = pd.DataFrame(data={"raw_prompt_output": preds})
df.to_csv("predictions/gpt4-prompt-table-without-instructions.csv")

## Evaluation

In [99]:
prompt_classify = PromptTemplate(template=classify_label_template, input_variables=['input'])
llm_chain_c = LLMChain(prompt=prompt_classify, llm=gpt_3_turbo)
llm_chain_4_c = LLMChain(prompt=prompt_classify, llm=gpt_4)

In [103]:
llm_chain_c.run({'input': "state"})

'State can be classified as a semantic concept of "description of event".'

In [29]:
bertscore = load("bertscore")
predictions = ["hello there", "hello there"]
references = ["hello there", "general kenobi"]
results = bertscore.compute(predictions=predictions, references=references, lang="en")
print(results)

{'precision': [1.000000238418579, 0.8224340677261353], 'recall': [1.000000238418579, 0.8144457340240479], 'f1': [1.000000238418579, 0.81842041015625], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.9(hug_trans=4.24.0)'}


In [46]:
def evaluation_new(preds, bert_threshold=0.85):
    ids, predictions, original_preds, highest_bertscores = [], [], [], []
    i=0
    for j, table_preds in enumerate(tqdm(preds)):
        # How many columns does the table have? : To control cases when less/more classes are returned
        table_number = len(test[j][2])
        
        if "Semantic concepts:" in table_preds:
            table_preds = table_preds.split("Class:")[1]
        
        #Break predictions into either \n or ,
        if ":" in table_preds or "-" in table_preds:
            if ":" in table_preds:
                separator = ":"
                start = 1
                end = table_number+1
            else:
                separator = "-"  
                start = 1
                end = table_number+1
        else:
            separator = ","
            start = 0
            end = table_number
            
        col_preds = table_preds.split(separator)[start:end]
        for pred in col_preds:
            i+=1
            
            # Remove break lines
            if "\n" in pred:
                pred = pred.split('\n')[0].strip()
            # Remove commas
            if "," in pred:
                pred = pred.split(",")[0].strip()
            # Remove paranthesis
            if '(' in pred:
                pred = pred.split("(")[0].strip()
            #Remove points
            if '.' in pred:
                pred = pred.split(".")[0].strip()
            # Lower-case prediction
            pred = pred.strip().lower()
            original_preds.append(pred)
            ids.append(j)

            classes = list(text_to_label.keys())
            bertscores = np.array(bertscore.compute(predictions=[pred] * len(classes), references=classes, lang="en")["f1"])
            index = np.argmax(bertscores)
            highest_score = np.max(bertscores)
            highest_bertscores.append(highest_score)

            if highest_score > bert_threshold:
                predictions.append(text_to_label[classes[index]])
            else:
                print(f"For test example {i} out of label space prediction: {pred}")
                predictions.append('-')
            
        # If more/less predictions for table
        if len(col_preds) < table_number:
            for m in range(0, table_number-len(col_preds)):
                predictions.append('-')
                i+=1
    return ids, predictions, original_preds, highest_bertscores
# class_predictions, original_preds = evaluation_new(preds_gpt35)
ids, class_predictions, original_preds, highest_bertscores = evaluation_new(preds)

100%|██████████| 41/41 [04:49<00:00,  7.06s/it]


In [47]:
df = pd.DataFrame({"prompt_output_id": ids, 
                   "label": labels, 
                   "parsed_col_pred": original_preds, 
                   "class_pred_using_bert": class_predictions, 
                   "highest_bertscore": highest_bertscores})
df.to_csv("predictions/preds_gpt4_with_inst.csv")

In [None]:
# Map predictions to label space
def evaluation_old():
    predictions = []
    i=0
    for j, table_preds in enumerate(preds):
        # How many columns does the table have? : To control cases when less/more classes are returned
        table_number = len(test[j][2])
        
        if "Class:" in table_preds:
            table_preds = table_preds.split("Class:")[1]
        
        #Break predictions into either \n or ,
        if ":" in table_preds or "-" in table_preds:
            if ":" in table_preds:
                separator = ":"
                start = 1
                end = table_number+1
            else:
                separator = "-"  
                start = 1
                end = table_number+1
        else:
            separator = ","
            start = 0
            end = table_number
            
        col_preds = table_preds.split(separator)[start:end]
        
        for pred in col_preds:
            i+=1
            
            # Remove break lines
            if "\n" in pred:
                pred = pred.split('\n')[0].strip()
            # Remove commas
            if "," in pred:
                pred = pred.split(",")[0].strip()
            # Remove paranthesis
            if '(' in pred:
                pred = pred.split("(")[0].strip()
            #Remove points
            if '.' in pred:
                pred = pred.split(".")[0].strip()
            # Lower-case prediction
            pred = pred.strip().lower()
            
            if pred in text_to_label:
                predictions.append(text_to_label[pred])
            else:
                print(f"For test example {i} out of label space prediction: {pred}")
                predictions.append('-')
            
        # If more/less predictions for table
        if len(col_preds) < table_number:
            for m in range(0, table_number-len(col_preds)):
                predictions.append('-')
                i+=1

evaluation_old()

### Calculate Precision, Recall, Macro-F1 and Micro-F1

In [68]:
def calculate_f1_scores(y_tests, y_preds, num_classes):
    types = list(set(labels))
    types = types + ["-"]
    
    y_tests = [types.index(y) for y in y_tests]
    y_preds = [types.index(y) for y in y_preds]
    
    #Confusion matrix
    cm = np.zeros(shape=(num_classes,num_classes))
    
    for i in range(len(y_tests)):
        cm[y_preds[i]][y_tests[i]] += 1
        
    report = {}
    
    for j in range(len(cm[0])):
        report[j] = {}
        report[j]['FN'] = 0
        report[j]['FP'] = 0
        report[j]['TP'] = cm[j][j]

        for i in range(len(cm)):
            if i != j:
                report[j]['FN'] += cm[i][j]
        for k in range(len(cm[0])):
            if k != j:
                report[j]['FP'] += cm[j][k]

        precision = report[j]['TP'] / (report[j]['TP'] + report[j]['FP'])
        recall = report[j]['TP'] / (report[j]['TP'] + report[j]['FN'])
        f1 = 2*precision*recall / (precision + recall)
        
        if np.isnan(f1):
            f1 = 0
        if np.isnan(precision):
            f1 = 0
        if np.isnan(recall):
            f1 = 0

        report[j]['p'] =  precision
        report[j]['r'] =  recall
        report[j]['f1'] = f1
    
    all_fn = 0
    all_tp = 0
    all_fp = 0

    for r in report:
        if r != num_classes-1:
            all_fn += report[r]['FN']
            all_tp += report[r]['TP']
            all_fp += report[r]['FP']
        
    class_f1s = [ report[class_]['f1'] for class_ in report]
    class_p = [ 0 if np.isnan(report[class_]['p']) else report[class_]['p'] for class_ in report]
    class_r = [ 0 if np.isnan(report[class_]['r']) else report[class_]['r'] for class_ in report]
    macro_f1 = sum(class_f1s[:-1]) / (num_classes-1)
    
    p =  sum(class_p[:-1]) / (num_classes-1)
    r =  sum(class_r[:-1]) / (num_classes-1)
    micro_f1 = all_tp / ( all_tp + (1/2 * (all_fp + all_fn) )) 
    
    per_class_eval = {}
    for index, t in enumerate(types[:-1]):
        per_class_eval[t] = {"Precision":class_p[index], "Recall": class_r[index], "F1": class_f1s[index]}
    
    evaluation = {
        "Micro-F1": micro_f1,
        "Macro-F1": macro_f1,
        "Precision": p,
        "Recall": r
    }
    
    return [ evaluation, per_class_eval]

In [88]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(labels, class_predictions, average="micro")

(0.56, 0.56, 0.56, None)

In [92]:
evaluation, per_class_eval = calculate_f1_scores(labels, class_predictions, 33)
print(evaluation)
print(per_class_eval)

{'Micro-F1': 0.52, 'Macro-F1': 0.444118705749798, 'Precision': 0.5183989507638804, 'Recall': 0.5050843253968254}
{'telephone': {'Precision': 1.0, 'Recall': 0.2, 'F1': 0.33333333333333337}, 'postalCode': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0}, 'Event/description': {'Precision': 0.5294117647058824, 'Recall': 1.0, 'F1': 0.6923076923076924}, 'priceRange': {'Precision': 0.6666666666666666, 'Recall': 0.8888888888888888, 'F1': 0.761904761904762}, 'MusicAlbum/name': {'Precision': 0, 'Recall': 0.0, 'F1': 0}, 'Time': {'Precision': 0.6428571428571429, 'Recall': 0.75, 'F1': 0.6923076923076924}, 'Organization': {'Precision': 0.0, 'Recall': 0.0, 'F1': 0}, 'DateTime': {'Precision': 0.8333333333333334, 'Recall': 0.5, 'F1': 0.625}, 'Restaurant/description': {'Precision': 0.75, 'Recall': 0.6, 'F1': 0.6666666666666665}, 'Photograph': {'Precision': 0, 'Recall': 0.0, 'F1': 0}, 'CoordinateAT': {'Precision': 1.0, 'Recall': 0.5, 'F1': 0.6666666666666666}, 'Country': {'Precision': 0.42857142857142855, 'R

  precision = report[j]['TP'] / (report[j]['TP'] + report[j]['FP'])
  f1 = 2*precision*recall / (precision + recall)
  recall = report[j]['TP'] / (report[j]['TP'] + report[j]['FN'])


## Error Analysis

In [None]:
# "-" means the model replied with out of label or with I don't know
errors = 0
for i in range(len(predictions)):
    if predictions[i] != labels[i]:
        errors += 1
        print(f"Predicted as {predictions[i]} when it was {labels[i]}")
errors