# CLLM generation & curation tutorial

# Setup keys and experiment

In [1]:
import sys
sys.path.append('src/')

from cllm.utils import *
from cllm.llm_gen import *
from cllm.data_loader import *

import pandas as pd
from copy import deepcopy
import time
from sklearn.model_selection import train_test_split


#############################################################
# API KEY SETUP INSTRUCTIONS
#############################################################

# for vllm
# api_key = "EMPTY"
# api_base = "http://localhost:8000/v1"

# for together
# api_key = "add together api key"
# api_base = "https://api.together.xyz/v1"


# for azure openai
# api_key = "EMPTY"
# api_base = "add azure deployment link"

# for openai
# api_key = "EMPTY"
# api_base = DO NOT INCLUDE

#############################################################

import os

# Connection settings handed to llm_gen via its `api_details` argument.
# SECURITY: never commit a real API key. Credentials are read from the
# environment; any key that was previously hard-coded here should be treated
# as leaked and revoked.
#   CLLM_API_BASE    - endpoint URL
#   CLLM_API_VERSION - API version string
#   CLLM_API_KEY     - secret key ("EMPTY" is the placeholder used for
#                      servers that need no key, matching the notes above)
api_details = {
    "api_base": os.environ.get("CLLM_API_BASE", "https://api.chatanywhere.tech"),
    "api_version": os.environ.get("CLLM_API_VERSION", "free_api"),
    "api_key": os.environ.get("CLLM_API_KEY", "EMPTY"),
}


# --- LLM selection -------------------------------------------------------
# Short model name: 'gpt-3.5-turbo' or 'gpt-4' (do not use other short names).
model_short_name = "gpt-3.5-turbo"
# Name of your model deployment, e.g. "gpt4_20230815".
model = "gpt-3.5-turbo"
# Serving backend; supported: 'azure_openai', 'together', 'vllm'.
llm_serving = "together"

# --- Experiment settings -------------------------------------------------
seed = 0
dataset = "compas"
# Samples per class, e.g. a binary task gives 40 samples (20 per class).
ns = 20
# Number of synthetic samples to request; 10 is just for testing,
# normally this should be 1000.
n_synthetic = 10
# Passed to llm_gen as `n_processes`.
n_processes = 1

# STEP 1: Generation

## Get dataset

In [2]:
# Load the chosen dataset. get_data returns three objects, bound here as
# df_feat, df_label and df (presumably features, labels and the full frame
# -- confirm against cllm.data_loader).
df_feat, df_label, df = get_data(dataset=dataset, seed=seed)

# Carve out the small training set (sized by `ns`); everything else is kept
# in X_remain / y_remain.
X_train, X_remain, y_train, y_remain = sample_and_split(
    df_feat, df_label, ns=ns, seed=seed
)

# Split the remainder 50/50 into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_remain,
    y_remain,
    test_size=0.5,
    random_state=seed,
)

# Keep independent copies of the original training data for later cells.
X_train_orig, y_train_orig = deepcopy(X_train), deepcopy(y_train)

  X, y, categorical_indicator, attribute_names = dataset.get_data(


## Setup Prompt

In [3]:
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser


# Real training examples (features plus the label column) that are passed to
# llm_gen as `example_df`.
example_df = pd.concat([X_train_orig, y_train_orig], axis=1)

# Shuffle the rows. The shuffle is seeded so that the examples (and therefore
# the prompt) are reproducible for a given `seed`, like every other step.
example_df = example_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# One ResponseSchema per column: the label column 'y' is described as a
# binary label, every other column as a generic feature.
response_schemas = [
    ResponseSchema(
        name=col,
        description=f"binary label, {col}" if col == 'y' else "feature column",
    )
    for col in example_df.columns
]

# The parser turns the schemas into formatting instructions that are
# substituted into the prompt's {format_instructions} field.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()



# Prompt template for the synthetic-data generator. It has two template
# fields: {data} (the example rows) and {format_instructions} (the output
# format produced by the StructuredOutputParser). Presumably both are filled
# in by llm_gen -- confirm against cllm.llm_gen.
# NOTE: the domain wording ("criminal recidivism") is specific to the
# 'compas' dataset; adjust it when switching datasets.
generator_template = """\
You are a synthetic data generator. 
Your goal is to produce data which mirrors \
the given examples in causal structure and feature and label distributions \
but also produces as diverse samples as possible

I will give you real examples first

Leverage your knowledge about criminal recidivism to generate 1000 realistic but diverse samples. 

example data: {data}

{format_instructions}

DO NOT COPY THE EXAMPLES but generate realistic but new and diverse samples which have the correct label conditioned on the features.
"""


# Wrap the raw template string in a LangChain chat prompt object.
prompt = ChatPromptTemplate.from_template(generator_template)

## Generate using LLM

In [4]:
retries = 4  # maximum number of generation attempts
retry_delay_s = 120  # pause between attempts, in seconds

# Value passed to llm_gen as `ic_samples`: the number of example rows,
# capped at 20. Loop-invariant, so computed once.
ic_samples = min(len(example_df), 20)

# Reset before generating so a stale result from an earlier run of this cell
# can never be mistaken for the output of this run.
df_llm = None
last_error = None

while retries > 0:
    print(f'Running {dataset}, {seed}, {model} --- {n_processes}')
    try:
        df_llm = llm_gen(
            prompt, generator_template, format_instructions, example_df,
            n_samples=n_synthetic,
            temperature=0.9,
            max_tokens=1000,
            model=model,
            n_processes=n_processes,
            ic_samples=ic_samples,
            llm_serving=llm_serving,
            api_details=api_details,
        )
        break  # success
    except Exception as e:  # retry boundary: report every failure, then retry
        last_error = e
        retries -= 1
        if retries == 0:
            print(f"Error: {e}. No retries left.")
            break
        # Halve the parallelism but never go below one process; otherwise a
        # run that starts with n_processes=1 could never be retried.
        n_processes = max(1, n_processes // 2)
        print(f"Error: {e}. Retrying with n_processes={n_processes} "
              f"in {retry_delay_s}s...")
        time.sleep(retry_delay_s)

if df_llm is None:
    # Fail loudly here instead of with a confusing NameError/AttributeError
    # on the line below.
    raise RuntimeError(
        "LLM generation failed: no data was produced after all retries."
    ) from last_error

# Cast the generated columns to the dtypes of the real example data.
df_llm = df_llm.astype(example_df.dtypes)

Running compas, 0, gpt-3.5-turbo --- 1
idx: 0
df_tmp    sex   age juv_fel_count juv_misd_count juv_other_count priors_count  \
0  0.0  38.0           0.0            1.0             0.0          2.0   
1  1.0  29.0           0.0            0.0             0.0          3.0   
2  1.0  44.0           0.0            0.0             0.0          6.0   
3  0.0  32.0           0.0            0.0             1.0          4.0   
4  1.0  27.0           0.0            0.0             0.0          2.0   

  age_cat_25-45 age_cat_Greaterthan45 age_cat_Lessthan25  \
0           1.0                   0.0                0.0   
1           1.0                   0.0                0.0   
2           1.0                   0.0                0.0   
3           1.0                   0.0                0.0   
4           1.0                   0.0                0.0   

  race_African-American race_Caucasian c_charge_degree_F c_charge_degree_M  y  
0                   1.0            0.0               1.0     

In [5]:
# Notebook display: show the generated synthetic samples.
df_llm

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25-45,age_cat_Greaterthan45,age_cat_Lessthan25,race_African-American,race_Caucasian,c_charge_degree_F,c_charge_degree_M,y
0,0.0,38.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,1.0,29.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
2,1.0,44.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,0.0,32.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,1.0,27.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
5,0.0,35.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0
6,1.0,35.0,0.0,2.0,0.0,8.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
7,0.0,42.0,1.0,0.0,0.0,5.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1
8,1.0,30.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0
9,0.0,50.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0


## Process LLM generated data to have the same data types

In [6]:
def _rows_where_any(frame, cell_predicate):
    """Return a boolean Series (indexed like *frame*) marking rows in which
    at least one cell satisfies *cell_predicate*. Works on empty frames."""
    flags = [
        any(cell_predicate(cell) for cell in row)
        for row in frame.itertuples(index=False, name=None)
    ]
    return pd.Series(flags, index=frame.index, dtype=bool)


# String cells that are type/description placeholders rather than real
# values; rows containing any of them are discarded.
placeholder_values = {
    'integer', 'float', 'numeric', 'categorical', 'number',
    'No', 'Yes', 'continuous', 'age in years', 'string',
}

df_llm = df_llm.dropna()
has_placeholder = _rows_where_any(
    df_llm, lambda cell: isinstance(cell, str) and cell in placeholder_values
)
df_llm = df_llm[~has_placeholder]

# Rebuild the reference frame (features + label) in its original, unshuffled
# order; its dtypes are the target dtypes for the generated data.
example_df = deepcopy(X_train_orig)
example_df['y'] = deepcopy(y_train_orig)
target_dtypes = example_df.dtypes

try:
    df_llm = df_llm.astype(target_dtypes)
except (ValueError, TypeError, OverflowError):
    # The bulk cast failed: find the offending rows of the *generated* frame
    # (df_llm, not the real dataset `df`), drop them, then cast again.
    problematic_rows = set()
    for col, dtype in target_dtypes.items():
        for index, value in df_llm[col].items():
            try:
                dtype.type(value)  # probe: can this cell be converted?
            except Exception:
                # Best effort: any conversion failure disqualifies the row.
                problematic_rows.add(index)

    df_llm = df_llm.drop(index=sorted(problematic_rows))

    # List-valued cells can pass the scalar probe above (NumPy turns them
    # into arrays), so rows containing lists are removed explicitly.
    has_list = _rows_where_any(df_llm, lambda cell: isinstance(cell, list))
    df_llm = df_llm[~has_list]

    df_llm = df_llm.astype(target_dtypes)


df_llm

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25-45,age_cat_Greaterthan45,age_cat_Lessthan25,race_African-American,race_Caucasian,c_charge_degree_F,c_charge_degree_M,y
0,0.0,38.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,1.0,29.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
2,1.0,44.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,0.0,32.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,1.0,27.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
5,0.0,35.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0
6,1.0,35.0,0.0,2.0,0.0,8.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
7,0.0,42.0,1.0,0.0,0.0,5.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1
8,1.0,30.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0
9,0.0,50.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0


In [7]:
# Notebook display: column dtypes of the generated data after the cast above.
df_llm.dtypes

sex                      float32
age                      float32
juv_fel_count            float32
juv_misd_count           float32
juv_other_count          float32
priors_count             float32
age_cat_25-45            float32
age_cat_Greaterthan45    float32
age_cat_Lessthan25       float32
race_African-American    float32
race_Caucasian           float32
c_charge_degree_F        float32
c_charge_degree_M        float32
y                          int64
dtype: object

# STEP 2: Curation

In [None]:
# Imported explicitly: `np` is used below and must not depend on whatever the
# wildcard imports at the top of the notebook happen to re-export.
import numpy as np

from cllm.curation import data_centric_curation

# Split the generated data into features and an integer label array.
X_check = df_llm.drop(columns=['y'])
y_check = df_llm['y'].values.astype(int)

curation_metric = 'aleatoric'
curation_ythresh = 0.2
curation_xthresh = 0  # adaptive

# Return values (translated from the original author's notes):
# easy_train:        indices (IDs) of easy-to-learn samples, i.e. samples with
#                    high model confidence and low uncertainty.
# ambig_train:       indices (IDs) of ambiguous samples, i.e. neither easy nor
#                    hard to learn; typically medium confidence or higher
#                    uncertainty.
# unlearnable_train: indices (IDs) of hard-to-learn samples, i.e. low model
#                    confidence and low uncertainty; usually noise or outliers.
# Curator_xgb:       the Curator instance, which stores per-sample confidence,
#                    uncertainty, etc. for further analysis.
easy_train, ambig_train, unlearnable_train, Curator_xgb = data_centric_curation(
    X_train_orig, y_train_orig, X_check, y_check,
    curation_metric=curation_metric, retrain=False, nest=100,
    curation_ythresh=curation_ythresh, curation_xthresh=curation_xthresh,
)

# curated_train_ids: easy_train and ambig_train combined, i.e. the samples
# considered usable for training after curation.
curated_train_ids = np.concatenate((easy_train, ambig_train))
curated_train_ids, unlearnable_train

Using adaptive threshold


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([], dtype=int64))