# CLLM generation & curation tutorial

# Setup keys and experiment

In [1]:
import sys
sys.path.append('src/')

# Star imports come first so the explicit imports below always win if the
# project modules happen to export a name such as `time` or `os`.
from cllm.utils import *
from cllm.llm_gen import *
from cllm.data_loader import *

import os
import time
from copy import deepcopy

import pandas as pd
from sklearn.model_selection import train_test_split


#############################################################
# API KEY SETUP INSTRUCTIONS
#############################################################

# for vllm
# api_key = "EMPTY"
# api_base = "http://localhost:8000/v1"

# for together
# api_key = "add together api key"
# api_base = "https://api.together.xyz/v1"


# for azure openai
# api_key = "EMPTY"
# api_base = "add azure deployment link"

# for openai
# api_key = "EMPTY"
# api_base = DO NOT INCLUDE

#############################################################

# Read the API key from the environment so the secret never lives in the
# notebook (or its version history). Set it before starting Jupyter, e.g.
#   export TOGETHER_API_KEY="<your key>"
# "EMPTY" is the placeholder this tutorial uses for endpoints that need no key.
# NOTE: any key that was ever committed in this cell must be treated as leaked
# and revoked with the provider.
api_key = os.environ.get("TOGETHER_API_KEY", "EMPTY")
if api_key == "EMPTY":
    print("Warning: TOGETHER_API_KEY is not set; using the placeholder key 'EMPTY'. "
          "Hosted endpoints will reject requests until a real key is provided.")

# Connection details handed to llm_gen.
api_details = {
    "api_base": "https://api.together.xyz/v1",
    "api_version": "2023-07-01-preview",
    "api_key": api_key,
}


model_short_name = 'mixtral' # 'gpt-4' (do not use other short names)
model = "mistralai/Mixtral-8x7B-Instruct-v0.1" # "gpt4_20230815" (use name of your model deployment)
llm_serving = 'together' # supported 'azure_openai', 'together', 'vllm'

seed = 0
ns = 20 # n_samples per class. e.g. if binary = 40 samples (i.e. 20 per class)
# NOTE(review): only used as a label in the progress message in this notebook;
# the data actually loaded below comes from rcctrain.csv -- confirm before renaming.
dataset = 'compas'
n_synthetic = 10 # just to test --- normally should be 1000
n_processes = 5

# STEP 1: Generation

## Get dataset

In [20]:
# Load the training table and expose its 'Groups' column under the name 'y',
# which is the label column name the rest of the notebook relies on.
rcc = pd.read_csv("rcctrain.csv").rename(columns={'Groups': 'y'})

# Full table, label series, and feature-only view.
df = rcc
df_label = rcc["y"]
df_feat = rcc.drop(columns='y')

In [21]:
# The packaged loader is bypassed in this notebook; the frames come from the CSV
# loaded in the previous cell:
#   df_feat, df_label, df = get_data(dataset=dataset, seed=seed)

# Dump the three frames to disk (file names intentionally carry no extension).
for frame_to_dump, dump_path in ((df_feat, "dffeat"), (df_label, "df_label"), (df, "df")):
    frame_to_dump.to_csv(dump_path)

# Small training sample; presumably `ns` rows per class -- see sample_and_split.
X_train, X_remain, y_train, y_remain = sample_and_split(
    df_feat, df_label, ns=ns, seed=seed
)

# Split whatever is left evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_remain, y_remain, random_state=seed, test_size=0.5
)

# Independent copies of the training data, kept for the prompt and curation steps.
X_train_orig, y_train_orig = deepcopy(X_train), deepcopy(y_train)

## Setup Prompt

In [22]:
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser


# In-context examples for the LLM: training features plus the label column.
example_df = pd.concat([X_train_orig, y_train_orig], axis=1)

# Shuffle the examples. Seeded so that a fresh "Restart & Run All" builds the
# same prompt every time.
example_df = example_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# One ResponseSchema per column so the structured-output parser knows which
# fields each generated sample must contain.
response_schemas = [
    ResponseSchema(name='y', description=f"binary label, {col}")
    if col == 'y'
    else ResponseSchema(name=col, description="feature column")
    for col in example_df.columns
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()


# Prompt template; {data} and {format_instructions} are the template variables.
# The domain sentence is deliberately dataset-agnostic: the previous wording
# asked for criminal-recidivism knowledge, which does not match the examples
# loaded from rcctrain.csv.
generator_template = """\
You are a synthetic data generator. 
Your goal is to produce data which mirrors \
the given examples in causal structure and feature and label distributions \
but also produces as diverse samples as possible

I will give you real examples first

Leverage your knowledge about the domain of the example data to generate 1000 realistic but diverse samples. 

example data: {data}

{format_instructions}

DO NOT COPY THE EXAMPLES but generate realistic but new and diverse samples which have the correct label conditioned on the features.
"""


prompt = ChatPromptTemplate.from_template(template=generator_template)

## Generate using LLM

In [23]:
retries = 4  # maximum number of generation attempts
retry_wait_seconds = 120  # pause between attempts

# Number of in-context examples shown to the LLM: at most 20.
ic_samples = min(len(example_df), 20)

# Halve a local copy on failure so the configured n_processes is left untouched
# and re-running this cell always starts from the same value.
current_processes = n_processes

# Start from None so a stale df_llm from an earlier run can never be mistaken
# for the result of this run.
df_llm = None
last_error = None

for attempt in range(1, retries + 1):
    try:
        print(f'Running {dataset}, {seed}, {model} --- {current_processes}')
        df_llm = llm_gen(prompt, generator_template, format_instructions, example_df,
                         n_samples=n_synthetic,
                         temperature=0.9,
                         max_tokens=1000, model=model,
                         n_processes=current_processes,
                         ic_samples=ic_samples,
                         llm_serving=llm_serving,
                         api_details=api_details)
        break  # success
    except Exception as e:
        last_error = e
        current_processes //= 2
        if current_processes < 1:
            print(f"Error: {e}.")
            print("Error: Minimum n_processes reached. Exiting...")
            break
        if attempt == retries:
            print(f"Error: {e}. No retries left.")
            break
        print(f"Error: {e}. Retrying with reduced n_processes...")
        # Only wait when another attempt will actually follow.
        time.sleep(retry_wait_seconds)

if df_llm is None:
    # Fail loudly here instead of with a NameError (or stale data) further down.
    raise RuntimeError(
        "LLM generation failed on every attempt; see the errors printed above."
    ) from last_error

# Best-effort cast to the dtypes of the real examples. Malformed rows are
# filtered by the data-type processing cell further down, so a failure here is
# reported rather than allowed to stop the notebook.
try:
    df_llm = df_llm.astype(example_df.dtypes)
except (ValueError, TypeError) as e:
    print(f"Could not cast the generated data to the example dtypes yet: {e}")

Running compas, 0, mistralai/Mixtral-8x7B-Instruct-v0.1 --- 5
idx: 1
Error occurred: idx - 1, llm_serving - together
df_tmp not previously created should be:   hippurate-mannitol  Dibutylamine hippuric acid 2-mercaptobenzothiazole  \
0        4893512.364   69812.01235   64703.82547             56825.12456   
1        3376557.234  116876.32541   12789.43265              489.234567   
2        1526789.654   70987.32574   40892.21564              2288.98765   

  N-acetyl-glucosaminic acid 2-phenylacetamide lys-Ile/lys-leu  y  
0                54566.46558       5098520.321     1409892.567  0  
1                16095.32156       1945678.982     1756.321654  0  
2                44667.09876       4598765.321     676576.2354  0  
idx: 2
idx: 3
idx: 4
Current =  16 (16, 8)
Done...
16 (16, 8)


  df_tmp = df_tmp.append(df_check, ignore_index=True)
  df_tmp = df_tmp.append(df_check, ignore_index=True)
  df_tmp = df_tmp.append(df_check, ignore_index=True)
  df_tmp = df_tmp.append(df_check, ignore_index=True)


In [24]:
# Inspect the frame of LLM-generated samples produced by the generation cell.
df_llm


Unnamed: 0,hippurate-mannitol,Dibutylamine,hippuric acid,2-mercaptobenzothiazole,N-acetyl-glucosaminic acid,2-phenylacetamide,lys-Ile/lys-leu,y
0,4893512.0,69812.01235,64703.82547,56825.12456,54566.46558,5098520.0,1409893.0,0
1,3376557.0,116876.32541,12789.43265,489.234567,16095.32156,1945679.0,1756.322,0
2,1526790.0,70987.32574,40892.21564,2288.98765,44667.09876,4598765.0,676576.2,0
3,4893512.0,69812.01235,64703.82547,56825.12456,54566.46558,5098520.0,1409893.0,0
4,3376557.0,116876.32541,12789.43265,489.234567,16095.32156,1945679.0,1756.322,0
5,1526790.0,70987.32574,40892.21564,2288.98765,44667.09876,4598765.0,676576.2,0
6,4328517.0,245680.4321,52146.32517,110641.2354,220234.6532,2320817.0,325146.5,0
7,315246.2,45612.3421,32521.1245,15236.1234,76542.5432,1234552.0,98651.21,0
8,452361.2,65421.1234,23523.1234,16523.2345,52365.4563,213513.5,76543.21,0
9,235246.1,65432.5432,32452.3453,16753.2345,54652.4653,235246.3,12345.35,1


In [25]:
# A bare `len(df_llm)` in the middle of a cell is evaluated and thrown away;
# print it so the row count is actually shown next to the dtypes.
print(f"Number of generated rows: {len(df_llm)}")
print(df_llm.dtypes)

hippurate-mannitol            float64
Dibutylamine                  float64
hippuric acid                 float64
2-mercaptobenzothiazole       float64
N-acetyl-glucosaminic acid    float64
2-phenylacetamide             float64
lys-Ile/lys-leu               float64
y                               int64
dtype: object


## Process LLM generated data to have the same data types

In [26]:
# Strings the LLM sometimes emits in place of a real value (type names, schema
# echoes, ...). A row containing any of them is discarded.
PLACEHOLDER_CELLS = ['integer', 'float', 'numeric', 'categorical', 'number',
                     'No', 'Yes', 'continuous', 'age in years', 'string']


def _has_placeholder(row):
    """Return True if any cell in `row` is one of the placeholder strings."""
    return any(isinstance(cell, str) and cell in PLACEHOLDER_CELLS for cell in row)


def _drop_unconvertible_rows(frame, target_dtypes):
    """Return `frame` without the rows that cannot be cast to `target_dtypes`.

    A row is dropped when any of its cells is a list, or when a cell in a
    column listed in `target_dtypes` cannot be converted with that dtype's
    scalar type.

    Parameters
    ----------
    frame : pd.DataFrame
        The generated data to filter.
    target_dtypes : dict
        Mapping of column name to the dtype the column should end up with.

    Returns
    -------
    pd.DataFrame
        The rows of `frame` that passed every check (index preserved).
    """
    def _is_convertible(value, dtype):
        if isinstance(value, list):
            return False
        if dtype is None:
            # Column has no target dtype: only the list check applies.
            return True
        try:
            dtype.type(value)
        except Exception:
            return False
        return True

    keep = pd.Series(True, index=frame.index)
    for col in frame.columns:
        dtype = target_dtypes.get(col)
        col_ok = frame[col].map(lambda value, dtype=dtype: _is_convertible(value, dtype))
        keep &= col_ok.astype(bool)
    return frame[keep]


df_llm = df_llm.dropna()
df_llm = df_llm[~df_llm.apply(_has_placeholder, axis=1)]

# Reference frame whose dtypes the generated data must match.
example_df = deepcopy(X_train_orig)
example_df['y'] = deepcopy(y_train_orig)

try:
    df_llm = df_llm.astype(example_df.dtypes)
except (ValueError, TypeError, OverflowError):
    # At least one generated value cannot be cast. Filter the offending rows
    # out of the *generated* frame (not the real dataset) and cast again.
    n_before = len(df_llm)
    df_llm = _drop_unconvertible_rows(df_llm, example_df.dtypes.to_dict())
    print(f"Dropped {n_before - len(df_llm)} generated rows that could not be cast.")
    df_llm = df_llm.astype(example_df.dtypes)


df_llm

Unnamed: 0,hippurate-mannitol,Dibutylamine,hippuric acid,2-mercaptobenzothiazole,N-acetyl-glucosaminic acid,2-phenylacetamide,lys-Ile/lys-leu,y
0,4893512.0,69812.01235,64703.82547,56825.12456,54566.46558,5098520.0,1409893.0,0
1,3376557.0,116876.32541,12789.43265,489.234567,16095.32156,1945679.0,1756.322,0
2,1526790.0,70987.32574,40892.21564,2288.98765,44667.09876,4598765.0,676576.2,0
3,4893512.0,69812.01235,64703.82547,56825.12456,54566.46558,5098520.0,1409893.0,0
4,3376557.0,116876.32541,12789.43265,489.234567,16095.32156,1945679.0,1756.322,0
5,1526790.0,70987.32574,40892.21564,2288.98765,44667.09876,4598765.0,676576.2,0
6,4328517.0,245680.4321,52146.32517,110641.2354,220234.6532,2320817.0,325146.5,0
7,315246.2,45612.3421,32521.1245,15236.1234,76542.5432,1234552.0,98651.21,0
8,452361.2,65421.1234,23523.1234,16523.2345,52365.4563,213513.5,76543.21,0
9,235246.1,65432.5432,32452.3453,16753.2345,54652.4653,235246.3,12345.35,1


In [27]:
# Show the column dtypes of the generated data after the cleaning step.
df_llm.dtypes

hippurate-mannitol            float64
Dibutylamine                  float64
hippuric acid                 float64
2-mercaptobenzothiazole       float64
N-acetyl-glucosaminic acid    float64
2-phenylacetamide             float64
lys-Ile/lys-leu               float64
y                               int64
dtype: object

# STEP 2: Curation

In [28]:
from cllm.curation import data_centric_curation

# Curation settings.
curation_metric = 'aleatoric'
curation_ythresh = 0.2
curation_xthresh = 0  # adaptive

# Generated data to be checked: features and integer labels.
X_check = df_llm.drop('y', axis=1)
y_check = df_llm['y'].values.astype(int)

# Curate the generated samples against the real training sample. The call
# returns three index arrays (easy / ambiguous / unlearnable) and the curator.
easy_train, ambig_train, unlearnable_train, Curator_xgb = data_centric_curation(
    X_train_orig, y_train_orig, X_check, y_check,
    curation_metric=curation_metric,
    retrain=False,
    nest=100,
    curation_ythresh=curation_ythresh,
    curation_xthresh=curation_xthresh,
)

# Samples kept after curation: the easy ones followed by the ambiguous ones.
curated_train_ids = np.concatenate([easy_train, ambig_train])
curated_train_ids, unlearnable_train

Using adaptive threshold


(array([ 0,  1,  2,  3,  4,  5,  6, 11, 12,  7,  8,  9, 10, 13, 14, 15],
       dtype=int64),
 array([], dtype=int64))

In [29]:
# Keep only the generated rows whose positions passed curation.
df_curated = df_llm.iloc[curated_train_ids]

# Before/after shapes and the share of generated rows that survived.
print(f"Shape of original generated data: {df_llm.shape}")
print(f"Shape of curated data: {df_curated.shape}")
retained_pct = (len(curated_train_ids) / len(df_llm)) * 100
print(f"Percentage of data retained after curation: {retained_pct:.2f}%")

# Size of each curation group; the first line is preceded by a blank line.
curation_groups = (
    ("easy", easy_train),
    ("ambiguous", ambig_train),
    ("unlearnable", unlearnable_train),
)
for position, (group_name, group_ids) in enumerate(curation_groups):
    lead = "\n" if position == 0 else ""
    print(f"{lead}Number of {group_name} samples: {len(group_ids)}")

# Relative frequency of every label value among the curated rows.
y_curated = df_curated['y'].values
class_distribution = np.bincount(y_curated) / len(y_curated)
print("\nClass distribution in curated data:")
for class_id in range(len(class_distribution)):
    print(f"Class {class_id}: {class_distribution[class_id]:.2f}")

# Persist the curated samples (without the index column).
df_curated.to_csv("rcc_samples.csv", index=False)
print("\nCurated samples saved to 'rcc_samples.csv'")


Shape of original generated data: (16, 8)
Shape of curated data: (16, 8)
Percentage of data retained after curation: 100.00%

Number of easy samples: 9
Number of ambiguous samples: 7
Number of unlearnable samples: 0

Class distribution in curated data:
Class 0: 0.75
Class 1: 0.25

Curated samples saved to 'rcc_samples.csv'
