# Imports

Please run the cells in this section to download all packages, libraries, and dependencies.

In [None]:
!pip install openai
!pip install backoff
!pip install -q 'labelbox[data]'

Collecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [None]:
import os
import openai
import csv
import random
import pandas as pd
import numpy as np
import backoff  # for exponential backoff
import matplotlib.pyplot as plt
import labelbox

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import Counter
from labelbox import Client

# Read the API keys from the environment instead of hard-coding them in the
# notebook. A key that is already set is left untouched; a missing (or empty)
# key is requested interactively with getpass so it is not echoed on screen.
from getpass import getpass

for _key_name in ('OPENAI_API_KEY', 'LABELBOX_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f'Enter your {_key_name}: ')

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
labelbox.api_key = os.getenv("LABELBOX_API_KEY")

# Labelbox client used by the export cell further down.
client = Client(labelbox.api_key)



# ETL

Please run the cells in this section to load the train and test files into dataframes and construct the randomly generated few-shot learning dataset, comprising 100 tweet examples from the training dataset. The few-shot learning dataset (represented in the code below by the variable `selected_string`) is passed to the LLM as a string of concrete examples to learn from. This is the ultimate prompt that we will be engineering:

> Given the following tweets and their corresponding airlines, separated by new lines:
[INSERT FEW-SHOT LEARNING DATASET HERE]

> Please extract the airline(s) from the following tweet:
[INSERT TWEET HERE]

> Using the following format - ['#AIRLINE_NAME_1] for one airline or ['#AIRLINE_NAME_1, #AIRLINE_NAME_2...] for multiple airlines.

In [None]:
# Mount Google Drive and read the training / testing CSV files into DataFrames.
import os
from google.colab import drive
from sklearn.model_selection import train_test_split

# Folder that holds both CSV files; point this at your own dataset location.
dataDir = '/content/drive/My Drive//Datasets/'

# Make the Drive contents available under /content/drive before reading.
drive.mount('/content/drive')

# Full paths of the two dataset files.
train, test = (
    os.path.join(dataDir, file_name)
    for file_name in ('training_data.csv', 'test_data.csv')
)

# Replace `train` / `test` above with the paths to your own datasets if needed.
df_train = pd.read_csv(train)
df_test = pd.read_csv(test)

Mounted at /content/drive


In [None]:
# Build the few-shot example block from the training CSV.
csv_filename = train  # Replace with your CSV filename

# Header names that may hold the tweet text and its airline label, in order of
# preference. The columns are looked up by name because reading them by
# position (columns 0 and 1) produced "<tweet id> <sentiment>" lines instead of
# "<tweet text> <airline>" examples.
# NOTE(review): the candidate names are an assumption (the test set is read
# with the 'tweet' and 'airlines' columns) - adjust them to your training file.
TEXT_COLUMN_CANDIDATES = ('tweet', 'text')
AIRLINE_COLUMN_CANDIDATES = ('airlines', 'airline')


def _pick_column(fieldnames, candidates):
    """Return the first name in `candidates` that is present in `fieldnames`.

    Raises:
        KeyError: if none of the candidates is a column of the CSV file; the
            message lists the columns that are available.
    """
    for name in candidates:
        if name in fieldnames:
            return name
    raise KeyError(
        f"None of the columns {candidates} found in {csv_filename!r}; "
        f"available columns: {list(fieldnames)}"
    )


rows = []
with open(csv_filename, 'r', encoding='utf-8', newline='') as csvfile:
    # DictReader consumes the header row and exposes every row by column name.
    csv_reader = csv.DictReader(csvfile)
    fieldnames = csv_reader.fieldnames or []
    text_column = _pick_column(fieldnames, TEXT_COLUMN_CANDIDATES)
    airline_column = _pick_column(fieldnames, AIRLINE_COLUMN_CANDIDATES)

    for row in csv_reader:
        # Collapse internal whitespace so every example stays on one line;
        # the examples are joined with newlines below.
        text = ' '.join((row.get(text_column) or '').split())
        airline = ' '.join((row.get(airline_column) or '').split())
        if text and airline:
            rows.append(f"{text} {airline}")

# Set a seed for reproducibility (use any integer value you like)
seed_value = 68
random.seed(seed_value)

# Shuffle the lines randomly
random.shuffle(rows)

# Number of examples to keep. This threshold can be increased as the context
# window of the chosen model increases.
X = 100

# Slicing keeps every row when fewer than X are available.
selected_rows = rows[:X]

# The selected rows as a single string, one example per line
selected_string = '\n'.join(selected_rows)
print(selected_string)

569656873315487746 negative
568227390566109184 neutral
570307847281614848 positive
567826761141985280 positive
568140722240512001 negative
569929677214609408 negative
568208488335331329 negative
570002051108769792 negative
569990163209850881 neutral
569161513056935936 negative
569830069746712576 negative
569649433635368961 negative
567790926857531393 positive
568989687353729024 positive
568044610955681793 negative
568964010101514240 negative
567768450471587840 negative
569268918566719488 negative
568811854216105984 neutral
568458513636134913 negative
569459455273897984 negative
569641916855402496 negative
569791792029151232 negative
569886288691773440 negative
569657112109965312 negative
568899516872568832 positive
567735489688395776 positive
569485181633089536 negative
569312734896136193 positive
570264106059624448 neutral
569721804891271168 negative
569602069335838720 negative
568104264733573122 negative
568104907384832000 neutral
568162290890964992 positive
568037876761546753 positi

# Zero-Shot Prompt Engineering Benchmark

Please run the cells in this section to construct our baseline benchmark. The objective of this section is to evaluate the performance of various zero-shot prompts created via Labelbox to help us choose the best prompt for few-shot learning.

In [None]:
# ID of a Labelbox Annotate project created with the 'Humans Generate Prompts'
# option of the LLM Editor.
project_id = "clnlymt5y0ki2071f9g7mb3re"
project = client.get_project(project_id)

# Export only the labels (i.e. the prompts written by humans) whose workflow
# status is "Done", and wait for the export task to finish.
filters = dict(workflow_status="Done")
export_task = project.export_v2(filters=filters)
export_task.wait_till_done()

# Report export errors, if any; the result is read either way.
if export_task.errors:
    print(export_task.errors)

labels = export_task.result

In [None]:


# Define the labels list (mock structure for demonstration).
# NOTE: this assignment replaces whatever `labels` held before this cell ran;
# remove it to work with labels that were produced elsewhere.
labels = [
    {
        "projects": {
            1: {
                "labels": [
                    {
                        "annotations": {
                            "classifications": [
                                {
                                    "text_answer": {
                                        "content": "What are the airlines in this tweet? '{{tweet}}'"
                                    }
                                }
                            ]
                        }
                    }
                ]
            }
        }
    }
    # Add more label entries if necessary
]

# Extract prompts from labels: the first text answer of the first label of
# every project entry. Iterating over .values() avoids overwriting the
# notebook-level `project_id` with the keys of the 'projects' mapping.
lb_prompts = []
for label in labels:
    for project_entry in label['projects'].values():
        prompt = project_entry["labels"][0]["annotations"]["classifications"][0]["text_answer"]["content"]
        prompt = prompt.replace('\n', '')
        lb_prompts.append(prompt)

# Turn every prompt into a template with a single '{tweet}' placeholder.
# Formatting with the literal text '{tweet}' maps both '{{tweet}}' and
# '{tweet}' to '{tweet}', and does not depend on a `tweet` variable that has
# not been defined yet at this point of the notebook.
zero_shot_prompts = [f_string.format(tweet='{tweet}') for f_string in lb_prompts]

# Print the results for verification
print(zero_shot_prompts)


["What are the airlines in this tweet? '{tweet}'"]


In [None]:
# Evaluate every zero-shot prompt on the first five test tweets.
# Results are collected in plain lists and turned into DataFrames once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
evaluation_records = []

for prompt_text in zero_shot_prompts:
    benchmark_records = []

    for tweet, ground_truth in zip(df_test['tweet'][:5], df_test['airlines'][:5]):
        # Fill the '{tweet}' placeholder of the prompt template.
        prompt = prompt_text.format(tweet=tweet)

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0,
            top_p=1,
            max_tokens=512  # Adjust max_tokens as needed
        )

        # Text of the first completion choice, without surrounding double quotes.
        extracted_airlines = response['choices'][0]['message']['content'].strip('"')

        benchmark_records.append({
            'Ground Truth': ground_truth,
            'Predictions': extracted_airlines
        })

    benchmark_results_df = pd.DataFrame(benchmark_records, columns=['Ground Truth', 'Predictions'])

    # Calculate evaluation metrics for this prompt. Each cell is treated as one
    # class label, so a prediction only counts when it equals the ground truth
    # cell exactly.
    precision = precision_score(
        benchmark_results_df['Ground Truth'],
        benchmark_results_df['Predictions'],
        average='micro'
    )

    recall = recall_score(
        benchmark_results_df['Ground Truth'],
        benchmark_results_df['Predictions'],
        average='micro'
    )

    f1 = f1_score(
        benchmark_results_df['Ground Truth'],
        benchmark_results_df['Predictions'],
        average='micro'
    )

    accuracy = accuracy_score(
        benchmark_results_df['Ground Truth'],
        benchmark_results_df['Predictions']
    )

    evaluation_records.append({
        'Prompt': prompt_text,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Accuracy': accuracy
    })

# One row of metrics per evaluated prompt.
evaluation_metrics_df = pd.DataFrame(
    evaluation_records,
    columns=['Prompt', 'Precision', 'Recall', 'F1-Score', 'Accuracy']
)

# Print the evaluation metrics
print(evaluation_metrics_df)

In [None]:
# Show the per-prompt metrics table as the cell output.
evaluation_metrics_df

In [None]:
# Write the zero-shot metrics to a CSV file (relative path, row index omitted).
evaluation_metrics_df.to_csv('evaluation_metrics_zero_shot_prompts.csv', index=False)

# Few-Shot Prompt Engineering with Context

Please run the cells in this section to combine the few-shot dataset that we constructed in the ETL section along with the "best" prompt that we found in the Zero-Shot Prompt Engineering Benchmark Section. The objective of this section is to evaluate the performance of few-shot learning on the test set of tweets.

In [None]:
# Run the few-shot prompt on every test tweet and keep the ground truth next to
# the model's answer. Records are collected in a list and turned into a
# DataFrame once: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies the whole frame on every iteration.
few_shot_records = []

for tweet, ground_truth in zip(df_test['tweet'], df_test['airlines']):
    prompt=f"Given the following tweets and their corresponding airlines, separated by new lines:\n\n{selected_string}\n\nPlease extract the airline(s) from the following tweet:\n\n{tweet}\n\nUsing the following format - ['#AIRLINE_NAME_1] for one airline or ['#AIRLINE_NAME_1, #AIRLINE_NAME_2...] for multiple airlines."

    response = openai.ChatCompletion.create(
        # model="gpt-3.5-turbo-16k",
        model="gpt-3.5-turbo",
        messages=[
          {
            "role": "user",
            "content": prompt
          }
        ],
        temperature=0,
        top_p=1,
        max_tokens=400  # Adjust max_tokens as needed,
    )

    # Text of the first completion choice, without surrounding double quotes.
    predicted_airlines = response['choices'][0]['message']['content'].strip('"')

    few_shot_records.append({
        'Ground Truth': ground_truth,
        'Predictions': predicted_airlines
    })

few_shot_prompt_engineering_results_df = pd.DataFrame(
    few_shot_records,
    columns=['Ground Truth', 'Predictions']
)

# Save the results DataFrame to a CSV file
few_shot_prompt_engineering_results_df.to_csv('few_shot_prompt_engineering_results_df.csv', index=False)

In [None]:
# Show the few-shot results table (ground truth vs. predictions) as the cell output
few_shot_prompt_engineering_results_df

# [VISUALIZATIONS] - Pie Chart

Please run the code cell below to visualize the distribution of the airline predictions made by the LLM on the test set.

In [None]:
# Gather every prediction string produced by the few-shot run.
predictions_pie_chart_fewshot = list(few_shot_prompt_engineering_results_df['Predictions'])

# Tally how often each distinct prediction occurs.
prediction_counts_pie_chart_fewshot = dict(Counter(predictions_pie_chart_fewshot))

# Order the tallies from the most to the least frequent prediction.
sorted_prediction_counts_piechart_fewshot = dict(
    sorted(
        prediction_counts_pie_chart_fewshot.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

# Split the ordered tallies into parallel label / size lists.
labels = [*sorted_prediction_counts_piechart_fewshot.keys()]
sizes = [*sorted_prediction_counts_piechart_fewshot.values()]

# Slices whose share is below this percentage are pulled out of the pie.
threshold = 1.0

# Offset per slice: 0.4 for slices under the threshold, 0.0 for the rest.
explode = [0.0 if (size / sum(sizes) * 100) >= threshold else 0.4 for size in sizes]

# Draw the pie chart with the legend placed outside the axes.
legend_options = dict(
    title='Categories',
    loc='upper left',
    labels=labels,
    ncol=2,  # Set the number of columns as needed
    bbox_to_anchor=(1, 1),
    borderaxespad=0.5,
)
plt.figure(figsize=(16, 16))  # Adjust the figure size as needed
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200, explode=explode)
plt.title('Distribution of Categories in Predictions')
plt.legend(**legend_options)
plt.axis('equal')
plt.show()

# [VISUALIZATIONS] - Bar Chart

Please run the code cell below to visualize the distribution of the airline predictions made by the LLM on the test set, juxtaposed with the ground truths.

In [None]:
# Prediction and ground-truth values taken from the few-shot results.
predictions = list(few_shot_prompt_engineering_results_df['Predictions'])
ground_truth = list(few_shot_prompt_engineering_results_df['Ground Truth'])

# Every distinct value that appears in either column.
unique_values = list(set(predictions + ground_truth))

# Occurrences of each value per column.
prediction_counts = Counter(predictions)
ground_truth_counts = Counter(ground_truth)

# Counts aligned with unique_values; a Counter yields 0 for a missing value.
unique_prediction_counts = [prediction_counts[value] for value in unique_values]
unique_ground_truth_counts = [ground_truth_counts[value] for value in unique_values]

# Order all three sequences by descending prediction count.
sorted_data = sorted(
    zip(unique_values, unique_prediction_counts, unique_ground_truth_counts),
    key=lambda entry: entry[1],
    reverse=True,
)
unique_values, unique_prediction_counts, unique_ground_truth_counts = zip(*sorted_data)

# Bar width and one x position per category.
bar_width = 0.25
x = np.arange(len(unique_values))

fig, ax = plt.subplots(figsize=(20, 12))

# Predictions sit left of each tick and ground truth right of it.
half_width = bar_width / 2
bar1 = ax.bar(x - half_width, unique_prediction_counts, bar_width, label='Predictions')
bar2 = ax.bar(x + half_width, unique_ground_truth_counts, bar_width, label='Ground Truth')

# Tick labels, axis labels and title.
ax.set_xticks(x)
ax.set_xticklabels(unique_values, rotation=90)
ax.set(
    xlabel='Categories',
    ylabel='Counts',
    title='Unique Categories Comparison between Predictions and Ground Truth',
)

ax.legend()

plt.tight_layout()
plt.show()

# Evaluation Metrics & Misclassification Identification

Please run the code cell below to calculate the precision, recall, f1-score, and accuracy for the Twitter tweets on the test set.

This cell also identifies the misclassified rows.

In [None]:
# Labels are compared as parsed airline names, not as raw cell strings:
# iterating a string such as "['United']" yields single characters, so passing
# the raw cells to set() / set.update() would compare character sets.


def _parse_airlines(value):
    """Convert one 'Ground Truth' / 'Predictions' cell into a set of airline names.

    Accepts an existing list/tuple/set, or text such as "['United', 'Delta']",
    "United, Delta" or "#United". Text is split on commas; surrounding
    brackets, quotes, '#' and whitespace are removed from every name.
    None and NaN give an empty set.
    """
    if value is None or (isinstance(value, float) and value != value):  # None / NaN
        return set()
    if isinstance(value, (list, tuple, set, frozenset)):
        items = value
    else:
        items = str(value).split(',')
    cleaned = (str(item).strip().strip('\'"#[]').strip() for item in items)
    return {name for name in cleaned if name}


# Initialize empty sets for accumulating labels
all_true_labels = set()
all_predicted_labels = set()

# Create an empty list to store misclassifications
misclassifications = []

# Iterate through each row in the DataFrame and accumulate labels
for index, row in few_shot_prompt_engineering_results_df.iterrows():
    true_labels = _parse_airlines(row['Ground Truth'])
    predicted_labels = _parse_airlines(row['Predictions'])

    # Key every airline by its row index so that a predicted airline only
    # matches the ground truth of the tweet it was predicted for.
    all_true_labels.update((index, airline) for airline in true_labels)
    all_predicted_labels.update((index, airline) for airline in predicted_labels)

    # A row is misclassified when its two airline sets differ
    if true_labels != predicted_labels:
        misclassifications.append(row)

# Define the custom evaluation function
def custom_evaluation(true_set, predicted_set):
    """Compute set-overlap metrics between true and predicted labels.

    Args:
        true_set: iterable of hashable ground-truth labels.
        predicted_set: iterable of hashable predicted labels.

    Returns:
        A (precision, recall, f1, accuracy) tuple of floats, where
        precision = |true & predicted| / |predicted| (1.0 if nothing was predicted),
        recall    = |true & predicted| / |true| (1.0 if there are no true labels),
        f1        = harmonic mean of precision and recall (0.0 if both are 0),
        accuracy  = |true & predicted| / |true | predicted| (1.0 if both are empty).
    """
    true_set = set(true_set)
    predicted_set = set(predicted_set)
    overlap = len(true_set & predicted_set)
    combined = len(true_set | predicted_set)

    # Precision: Intersection of true and predicted labels divided by predicted labels
    precision = overlap / len(predicted_set) if predicted_set else 1.0

    # Recall: Intersection of true and predicted labels divided by true labels
    recall = overlap / len(true_set) if true_set else 1.0

    # F1-score: Harmonic mean of precision and recall. Precision and recall are
    # both 0 only when the two non-empty sets share nothing, which is the worst
    # case and therefore scores 0.0.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    # Accuracy: Intersection of true and predicted labels divided by the total number of labels
    accuracy = overlap / combined if combined > 0 else 1.0

    return precision, recall, f1, accuracy

# Score the label sets accumulated over the whole results DataFrame.
precision, recall, f1, accuracy = custom_evaluation(all_true_labels, all_predicted_labels)

# Report each overall metric on its own line.
metric_report = (
    ("Custom Precision (Overall): ", precision),
    ("Custom Recall (Overall): ", recall),
    ("Custom F1-score (Overall): ", f1),
    ("Custom Accuracy (Overall): ", accuracy),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}{metric_value}")

# List every misclassified row, each followed by two blank lines.
print("Misclassified Rows:")
for row in misclassifications:
    print(row, end='\n\n\n')

# Fine-Tuning

Please run the cells in this section to create a fine-tuned OpenAI Model, using the same 100 examples that we provided as part of our few-shot learning approach in the sections above.

This section combines the few-shot dataset that we constructed in the ETL section with the "best" prompt that we found in the **Few-Shot Prompt Engineering with Context Section**. The objective of this section is to evaluate the performance of fine-tuning on the test set of tweets.

In [None]:
# Upload your training file
'''
The training file should be in the following format:

{"messages": [{"role": "system", "content": "Given the following tweet, please extract the airline(s) from the tweet as a list of string(s)."},{"role": "user", "content": "American Air Thanks, but that results in missing the conference I'm attending. Are there options to book earlier, or if not, receive a refund?"},{"role": "assistant", "content": "['American Airlines']"}]}
{"messages": [{"role": "system", "content": "Given the following tweet, please extract the airline(s) from the tweet as a list of string(s)."},{"role": "user", "content": "@JetBlue Headphone jack not working on my flight."},{"role": "assistant", "content": "['JetBlue Airways']"}]}
{"messages": [{"role": "system", "content": "Given the following tweet, please extract the airline(s) from the tweet as a list of string(s)."},{"role": "user", "content": "@JetBlue Landed at MCO before 9am and still don't have my bag. You were supposed to give it to Disney's Magical Express this AM. I am livid!"},{"role": "assistant", "content": "['JetBlue Airways']"}]}
...
'''
# Open the file in binary mode inside a context manager so that the handle is
# closed as soon as the upload request has returned.
with open("twitter-context.jsonl", "rb") as training_file:
    file_upload_response = openai.File.create(file=training_file, purpose='fine-tune')

In [None]:
# List the uploaded files; the response is shown as the cell output
openai.File.list()

In [None]:
# Start a fine-tuning job on the uploaded training file.
# Despite its name, `fine_tuned_model` holds the response of the job-creation call.
file_id = file_upload_response.id
fine_tuned_model = openai.FineTuningJob.create(
    model="gpt-3.5-turbo-0613",
    training_file=file_id,
)

In [None]:
# List fine-tuning jobs (limit=10); the response is shown as the cell output
openai.FineTuningJob.list(limit=10)

In [None]:
'''
Please run this cell to avoid rate limit & service unavailable errors.

The function below automatically retries requests to the OpenAI servers with a random exponential backoff.
Retrying with exponential backoff means performing a short sleep when a rate limit error is hit, then retrying the unsuccessful request.
If the request is still unsuccessful, the sleep length is increased and the process is repeated.
This continues until the request is successful or until a maximum number of retries is reached.
'''

# Please see OpenAI documentation here for further details: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

@backoff.on_exception(
    backoff.expo,
    # Retry on rate-limit errors as well as service-unavailable errors.
    (openai.error.RateLimitError, openai.error.ServiceUnavailableError),
    # Bound the number of attempts so a persistent outage cannot retry forever.
    max_tries=8,
)
def completions_with_backoff(**kwargs):
    """Call openai.ChatCompletion.create, retrying with exponential backoff.

    Args:
        **kwargs: forwarded unchanged to openai.ChatCompletion.create.

    Returns:
        The response returned by openai.ChatCompletion.create.

    Raises:
        openai.error.RateLimitError, openai.error.ServiceUnavailableError:
            re-raised once the request has failed max_tries times.
    """
    return openai.ChatCompletion.create(**kwargs)

In [None]:
# Run the fine-tuned model on every test tweet and keep the ground truth next
# to the model's answer. Records are collected in a list and turned into a
# DataFrame once: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies the whole frame on every iteration.
fine_tuning_records = []

for tweet, ground_truth in zip(df_test['tweet'], df_test['airlines']):
    # NOTE(review): the training-file format documented in the upload cell puts
    # the instruction in a system message and the tweet in a user message; here
    # both are sent as a single user message - confirm this is intended.
    prompt=f"Given the following tweet, please extract the airline(s) from the tweet as a list of string(s). {tweet}"

    response = completions_with_backoff(
        # NOTE(review): hard-coded fine-tuned model name - replace it with the
        # name of your own fine-tuned model.
        model="ft:gpt-3.5-turbo-0613:personal::7xiiKWfL",
        messages=[
          {
            "role": "user",
            "content": prompt
          }
        ],
        temperature=0,
        top_p=1,
        max_tokens=400  # Adjust max_tokens as needed,
    )

    # Text of the first completion choice, without surrounding double quotes.
    predicted_airlines = response['choices'][0]['message']['content'].strip('"')

    fine_tuning_records.append({
        'Ground Truth': ground_truth,
        'Predictions': predicted_airlines
    })

fine_tuning_results_df = pd.DataFrame(
    fine_tuning_records,
    columns=['Ground Truth', 'Predictions']
)

# Save the results DataFrame to a CSV file
fine_tuning_results_df.to_csv('fine_tuning_results_df.csv', index=False)

In [None]:
# Show the fine-tuning results table (ground truth vs. predictions) as the cell output
fine_tuning_results_df

In [None]:
# Labels are compared as parsed airline names, not as raw cell strings:
# iterating a string such as "['United']" yields single characters, so passing
# the raw cells to set() / set.update() would compare character sets.


def _parse_airlines(value):
    """Convert one 'Ground Truth' / 'Predictions' cell into a set of airline names.

    Accepts an existing list/tuple/set, or text such as "['United', 'Delta']",
    "United, Delta" or "#United". Text is split on commas; surrounding
    brackets, quotes, '#' and whitespace are removed from every name.
    None and NaN give an empty set.
    """
    if value is None or (isinstance(value, float) and value != value):  # None / NaN
        return set()
    if isinstance(value, (list, tuple, set, frozenset)):
        items = value
    else:
        items = str(value).split(',')
    cleaned = (str(item).strip().strip('\'"#[]').strip() for item in items)
    return {name for name in cleaned if name}


# Initialize empty sets for accumulating labels
all_true_labels = set()
all_finedtuned_predicted_labels = set()

# Create an empty list to store misclassifications
misclassifications = []

# Iterate through each row in the DataFrame and accumulate labels
for index, row in fine_tuning_results_df.iterrows():
    true_labels = _parse_airlines(row['Ground Truth'])
    predicted_labels = _parse_airlines(row['Predictions'])

    # Key every airline by its row index so that a predicted airline only
    # matches the ground truth of the tweet it was predicted for.
    all_true_labels.update((index, airline) for airline in true_labels)
    all_finedtuned_predicted_labels.update((index, airline) for airline in predicted_labels)

    # A row is misclassified when its two airline sets differ
    if true_labels != predicted_labels:
        misclassifications.append(row)

# Define the custom evaluation function
def custom_evaluation(true_set, predicted_set):
    """Compute set-overlap metrics between true and predicted labels.

    Args:
        true_set: iterable of hashable ground-truth labels.
        predicted_set: iterable of hashable predicted labels.

    Returns:
        A (precision, recall, f1, accuracy) tuple of floats, where
        precision = |true & predicted| / |predicted| (1.0 if nothing was predicted),
        recall    = |true & predicted| / |true| (1.0 if there are no true labels),
        f1        = harmonic mean of precision and recall (0.0 if both are 0),
        accuracy  = |true & predicted| / |true | predicted| (1.0 if both are empty).
    """
    true_set = set(true_set)
    predicted_set = set(predicted_set)
    overlap = len(true_set & predicted_set)
    combined = len(true_set | predicted_set)

    # Precision: Intersection of true and predicted labels divided by predicted labels
    precision = overlap / len(predicted_set) if predicted_set else 1.0

    # Recall: Intersection of true and predicted labels divided by true labels
    recall = overlap / len(true_set) if true_set else 1.0

    # F1-score: Harmonic mean of precision and recall. Precision and recall are
    # both 0 only when the two non-empty sets share nothing, which is the worst
    # case and therefore scores 0.0.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    # Accuracy: Intersection of true and predicted labels divided by the total number of labels
    accuracy = overlap / combined if combined > 0 else 1.0

    return precision, recall, f1, accuracy

# Score the label sets accumulated over the whole fine-tuning results DataFrame.
precision, recall, f1, accuracy = custom_evaluation(all_true_labels, all_finedtuned_predicted_labels)

# Report each overall metric on its own line.
metric_report = (
    ("Custom Precision (Overall): ", precision),
    ("Custom Recall (Overall): ", recall),
    ("Custom F1-score (Overall): ", f1),
    ("Custom Accuracy (Overall): ", accuracy),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}{metric_value}")

# List every misclassified row, each followed by two blank lines.
print("Misclassified Rows:")
for row in misclassifications:
    print(row, end='\n\n\n')