# Training classifier based on manual labels

In [1]:
from langchain_openai import OpenAIEmbeddings
import os
import numpy as np
import pandas as pd

# Read the OpenAI API key from a local file and publish it as an environment
# variable (presumably where the OpenAIEmbeddings clients below look it up).
with open("./openai.key", mode="r") as key_file:
    os.environ["OPENAI_API_KEY"] = key_file.read().strip()

# Embedding models to compare: ada-002 first, then the two text-embedding-3
# variants. Each name is used as the model id and as a DataFrame column name.
models = ["text-embedding-ada-002"]
models += ["text-embedding-3-small", "text-embedding-3-large"]

In [2]:
# Load the manually labelled dataset; rows containing any missing value are dropped.
labeled_raw = pd.read_feather("./manual_eval_labeled.feather").dropna()

# Build the working frame as a fresh, independent DataFrame:
#   sentence = prompt and answer joined by a single space (classifier input)
#   label    = the manual evaluation label (classifier target)
# Constructing a new frame (instead of selecting columns from the old one)
# keeps the per-model column assignments below from raising pandas'
# SettingWithCopyWarning.
df = pd.DataFrame(
    {
        "sentence": labeled_raw["Prompt"] + " " + labeled_raw["answer"],
        "label": labeled_raw["eval"],
    }
)

# Embed the dataset once per model. Each model gets a column named after it,
# holding one float64 numpy vector per row.
# embed_documents is documented to take a list of strings, so hand it a plain
# list rather than a pandas Series.
sentences = df["sentence"].tolist()
for model in models:
    embeddings = OpenAIEmbeddings(model=model)
    df[model] = embeddings.embed_documents(sentences)
    df[model] = df[model].apply(lambda vec: np.array(vec, dtype=np.float64))

In [3]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import cohen_kappa_score
import autosklearn.classification
import pickle

# One record per (embedding model, fold) with the achieved kappa score.
res = []

# Ordinal encoding of the manual labels used for the quadratic-weighted kappa.
LABEL_RANK = {"R": 0, "H": 1, "W": 2, "A": 3}
# Passing the full rank list to cohen_kappa_score pins the weight matrix to
# the real 0..3 distances even if a fold happens to miss one of the classes.
KAPPA_LABELS = sorted(LABEL_RANK.values())

# Make sure the output folder exists before the first (expensive) fit, so the
# pickle dump at the end of a fold cannot fail on a missing directory.
MODEL_DIR = "./classification_models"
os.makedirs(MODEL_DIR, exist_ok=True)

y_all = df.label.values

for model in models:
    print(model)
    X_all = df[model].values
    # 2-fold stratified CV repeated 5 times -> 10 train/test splits per model.
    # The seed is fixed, so every embedding model is evaluated on the same splits.
    rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=42)
    for i, (train_index, test_index) in enumerate(rskf.split(list(X_all), list(y_all))):
        print(f"Fold {i}")
        # Materialise the fold as plain lists of vectors / labels
        X_train, X_test = list(X_all[train_index]), list(X_all[test_index])
        y_train, y_test = list(y_all[train_index]), list(y_all[test_index])

        # Train an auto-sklearn ensemble (per the auto-sklearn docs the time
        # limits are in seconds and the memory limit is in MB).
        automl = autosklearn.classification.AutoSklearnClassifier(
            time_left_for_this_task=4 * 60,
            per_run_time_limit=20,
            n_jobs=-1,
            memory_limit=16 * 1024,
        )
        automl.fit(X_train, y_train)
        preds = automl.predict(X_test)
        # Class probabilities are not used for scoring in this cell; they are
        # kept so they remain available for inspection after the loop.
        probas = automl.predict_proba(X_test)

        # Quadratic-weighted kappa on the ordinal ranks of true vs. predicted labels
        kappa = cohen_kappa_score(
            [LABEL_RANK[label] for label in y_test],
            [LABEL_RANK[label] for label in preds],
            labels=KAPPA_LABELS,
            weights="quadratic",
        )
        print("Cohens Kappa", kappa)
        res.append({
            "model": model,
            "fold": i,
            "cohen_kappa": kappa,
        })

        # Save the fitted model; the context manager guarantees the file is
        # flushed and closed even if pickling fails.
        with open(f"{MODEL_DIR}/automl_{model}_{i}.pkl", "wb") as model_file:
            pickle.dump(automl, model_file)

text-embedding-ada-002
Fold 0
Cohens Kappa 0.8057887267690372
Fold 1
Cohens Kappa 0.8053433051974018
Fold 2
Cohens Kappa 0.8192385065556242
Fold 3
Cohens Kappa 0.8099945295595424
Fold 4
Cohens Kappa 0.7978275296007824
Fold 5
Cohens Kappa 0.8026615026256667
Fold 6
Cohens Kappa 0.823957277698419
Fold 7
Cohens Kappa 0.8017834518688497
Fold 8
Cohens Kappa 0.8114289469792814
Fold 9
Cohens Kappa 0.7701759446207095
text-embedding-3-small
Fold 0
Cohens Kappa 0.8295733487220988
Fold 1
Cohens Kappa 0.8324028982064378
Fold 2
Cohens Kappa 0.8151788408274178
Fold 3
Cohens Kappa 0.8266733566777028
Fold 4
Cohens Kappa 0.8235629383484412
Fold 5
Cohens Kappa 0.7958337192906619
Fold 6
Cohens Kappa 0.8348721852726297
Fold 7
Cohens Kappa 0.7787696733795905
Fold 8
Cohens Kappa 0.8540045006158548
Fold 9
Cohens Kappa 0.8063990456816028
text-embedding-3-large
Fold 0
	Models besides current dummy model: 0
	Dummy models: 1
Cohens Kappa 0.797234827168197
Fold 1
Cohens Kappa 0.848661037924444
Fold 2
	Models besid

In [7]:
# Turn the per-fold records into a table.
# NOTE: this rebinds `df`, which up to here held the embedded dataset.
df = pd.DataFrame(data=res)
# Mean and standard deviation of the kappa score for each embedding model
per_model = df.groupby("model")
aggs = per_model.agg({"cohen_kappa": ["mean", "std"]})
print(aggs)

                       cohen_kappa          
                              mean       std
model                                       
text-embedding-3-large    0.821845  0.017446
text-embedding-3-small    0.819727  0.021529
text-embedding-ada-002    0.804820  0.014563


In [8]:
# Rank all (model, fold) runs by kappa and keep the single top-scoring row
ranked = df.sort_values(by="cohen_kappa", ascending=False)
best = ranked.head(n=1)
best

Unnamed: 0,model,fold,cohen_kappa
18,text-embedding-3-small,8,0.854005


In [9]:
import pickle
import zlib

# Locate the pickle of the best-scoring run from the `best` row selected above.
best_model = best["model"].values[0]
best_fold = best["fold"].values[0]
model_path = f"./classification_models/automl_{best_model}_{best_fold}.pkl"

# Load the model. The context manager closes the file handle, which the bare
# open() call used to leak.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(model_path, "rb") as model_file:
    automl = pickle.load(model_file)

# Re-serialize the model and compress it with zlib
compressed = zlib.compress(pickle.dumps(automl))

# Save the compressed model
with open("./classification_models/best.pkl.zlib", "wb") as f:
    f.write(compressed)