In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Prefer a key supplied through the environment so the real secret never has to
# be typed into (or saved with) the notebook. The original placeholder is kept
# as the fallback so the cell still runs when the variable is unset.
API_KEY = os.environ.get("SILICONFLOW_API_KEY", "your_api_key")
# OpenAI client configured to send requests to the SiliconFlow base URL.
client = OpenAI(api_key=API_KEY, base_url="https://api.siliconflow.cn/v1")


In [2]:

# Fetch the Iris dataset and unpack its feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (120, 4)
Test set size: (30, 4)


In [3]:
## Classical ML baseline: classify the test set with a random forest

# Build a 100-tree forest with a fixed seed and fit it on the training split
# (fit returns the estimator itself, so construction and training chain).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Label the held-out samples and score them against the true classes.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nRandom Forest Accuracy:", accuracy)



Random Forest Accuracy: 0.9


In [4]:

# In-context learning (ICL): classify every test sample by showing the LLM all
# labelled training rows as few-shot examples and asking it for the missing
# output of the final, unlabelled row.


def parse_label(response, valid_labels):
    """Extract the predicted class label from the raw LLM reply.

    The prompt asks for the answer wrapped in ``#`` markers, so a ``#<int>#``
    token is preferred even when the model adds text around it. Otherwise the
    whole reply with the ``#`` characters removed must be an integer, which is
    how replies were parsed previously.

    Args:
        response: Text content of the chat completion.
        valid_labels: Collection of labels that are acceptable predictions.

    Returns:
        The predicted label as a Python ``int``.

    Raises:
        ValueError: If no integer can be parsed, or the parsed integer is not
            one of ``valid_labels``.
    """
    wrapped = re.search(r"#\s*(-?\d+)\s*#", response)
    label = int(wrapped.group(1) if wrapped else response.replace("#", ""))
    if label not in valid_labels:
        raise ValueError(f"label {label} is not one of {sorted(valid_labels)}")
    return label


N = len(X_test)
true_label = []
pred_label = []

# Sometimes the LLM may not return our desired result, so we query it up to
# max_tries times per sample. If still unsuccessful, a random label is used.
max_tries = 5

# Labels that actually occur in the training data; used both to validate the
# LLM answer and to draw the fallback prediction.
label_values = sorted({int(label) for label in y_train})

# Dedicated seeded generator so fallback predictions are reproducible and do
# not depend on the state of the global `random` module.
fallback_rng = random.Random(42)

# The instruction and the few-shot examples are identical for every test
# sample, so they are built once instead of once per iteration.
instruction = "Help me predict the Output value for the last Input. Your response should only contain the Output value in the format of #Output value#.\n"
few_shot_examples = "".join(
    f"Input: {features}, Output: {label}\n"
    for features, label in zip(X_train, y_train)
)

for n in range(N):
    print("Predicting Test Example", n)

    # Here we construct the prompt for querying the LLM: instruction, all
    # labelled training rows, then the test row with its output left blank.
    prompt = instruction + few_shot_examples + "Input: " + str(X_test[n]) + ", Output: "
    # print(prompt)

    err_counter = 0
    while err_counter < max_tries:
        try:
            completion = client.chat.completions.create(
                # model='Qwen/Qwen2-1.5B-Instruct',
                model='Qwen/Qwen2-7B-Instruct',
                messages=[
                    {'role': 'user',
                     'content': prompt}
                ],
                temperature=0
            )
            response = completion.choices[0].message.content
            # Raises when the reply is malformed or names an unknown label,
            # which is handled below like any other failed attempt.
            pred = parse_label(response, label_values)

            break
        except Exception as e:
            # Best-effort: API errors and unparsable replies both count as one
            # failed attempt and trigger another query.
            print(f"Error encountered: {e}. Retrying...")
            err_counter += 1

    if err_counter == max_tries:
        # if still unsuccessful after "max_tries" tries, return a random label
        print("max number of tries exceeded")
        pred = fallback_rng.choice(label_values)

    # Store plain Python ints so both lists hold the same type.
    true_label.append(int(y_test[n]))
    pred_label.append(pred)


Predicting Test Example 0
Predicting Test Example 1
Predicting Test Example 2
Predicting Test Example 3
Predicting Test Example 4
Predicting Test Example 5
Predicting Test Example 6
Predicting Test Example 7
Predicting Test Example 8
Predicting Test Example 9
Predicting Test Example 10
Predicting Test Example 11
Predicting Test Example 12
Predicting Test Example 13
Predicting Test Example 14
Predicting Test Example 15
Predicting Test Example 16
Predicting Test Example 17
Predicting Test Example 18
Predicting Test Example 19
Predicting Test Example 20
Predicting Test Example 21
Predicting Test Example 22
Predicting Test Example 23
Predicting Test Example 24
Predicting Test Example 25
Predicting Test Example 26
Predicting Test Example 27
Predicting Test Example 28
Predicting Test Example 29


In [5]:

# Show the reference labels and the ICL predictions, each on the line after its
# heading (sep="\n" puts the heading and the list on separate lines).
print("Groundtrugh labels:", true_label, sep="\n")
print("Predicted labels by ICL:", pred_label, sep="\n")

# Fraction of test samples whose ICL prediction equals the true label.
accuracy = accuracy_score(y_true=true_label, y_pred=pred_label)
print("\nICL Accuracy:", accuracy)


Groundtrugh labels:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]
Predicted labels by ICL:
[0, 2, 0, 1, 0, 2, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 0]

ICL Accuracy: 0.8666666666666667
