In [155]:
import pandas as pd

# Build a ~100-row training sample that keeps the source's battery / non-battery
# proportions, and export it as JSON Lines.
# NOTE(review): 'ANSI' is not a codec name Python resolves on every platform;
# presumably this relies on a Windows-only alias — confirm, or name the code page
# explicitly.
train_data = pd.read_csv('train.csv', encoding= 'ANSI')
train_data.columns =['prompt','completion']
# Fix: DataFrame.sample returns a new frame; the original call discarded the
# result, so the rows were never actually shuffled.
train_data = train_data.sample(frac=1, random_state=42)

is_battery = train_data['completion'] == 'battery'
# Percentage of each class (rounded) doubles as the number of rows to draw per class.
nb = round(100 * int((~is_battery).sum()) / len(train_data))
bb = round(100 * int(is_battery.sum()) / len(train_data))
# Seeded draws so the exported file is identical on every Restart & Run All.
nb_data = train_data[~is_battery].sample(n=nb, replace=False, random_state=42)
bb_data = train_data[is_battery].sample(n=bb, replace=False, random_state=42)
sample = pd.concat([nb_data, bb_data])
sample.to_json("train.jsonl", orient='records', lines=True)

In [156]:
# Run the OpenAI CLI data-preparation tool on train.jsonl (the file this notebook
# writes with DataFrame.to_json).
# NOTE(review): the exact effect of `-q` is not visible here — confirm with the CLI help.
!openai tools fine_tunes.prepare_data -f train.jsonl -q

Analyzing...

- Your file contains 100 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more det

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a stratified 4% slice of `data` so both classes keep their proportions.
# NOTE: `data` is the frame loaded from test.csv in a separate cell; run that cell first.
train_df, test_df = train_test_split(
    data,
    test_size=0.04,
    shuffle=True,
    random_state=42,
    stratify=data['completion'],
)
# Fix: the held-out split used to be written to "train.jsonl", overwriting the
# training file; the fine-tune command in this notebook reads its validation data
# from "test.jsonl".
test_df.to_json("test.jsonl", orient='records', lines=True)
len(train_df), len(test_df)

In [None]:
# Follow a fine-tune via the OpenAI CLI. The quoted text is a placeholder and must
# be replaced before running.
# NOTE(review): `-i` presumably expects the fine-tune job identifier — confirm with the CLI help.
!openai api fine_tunes.follow -i "NAME OF YOUR FINE_TUNED MODEL"

In [2]:
import pandas as pd

# Load the evaluation CSV and relabel its columns in a single expression.
data = pd.read_csv('test.csv').set_axis(['prompt', 'completion'], axis=1)
data

Unnamed: 0,prompt,completion
0,The synthesis of lambda-manganese oxide (λ-Mn...,battery
1,Introduction Contact dermatitis to cosmetics i...,non-battery
2,The passivation of pure Zn (99.995wt%) and Zn–...,battery
3,The corrosion of titanium in H2SO4 electrolyte...,battery
4,The presence of vascular neurocognitive impair...,non-battery
...,...,...
4662,An air-breathing polymer electrolyte membrane ...,battery
4663,A fuel cell stack needs to be stable and high-...,battery
4664,While the implications of information and comm...,non-battery
4665,The cycling performance of LiPF4(C2O4) electro...,battery


In [7]:
# `test` is the frame the embedding cells below read and extend.
test = pd.read_csv('test.csv').set_axis(['prompt', 'completion'], axis=1)
test

Unnamed: 0,prompt,completion
0,The synthesis of lambda-manganese oxide (λ-Mn...,battery
1,Introduction Contact dermatitis to cosmetics i...,non-battery
2,The passivation of pure Zn (99.995wt%) and Zn–...,battery
3,The corrosion of titanium in H2SO4 electrolyte...,battery
4,The presence of vascular neurocognitive impair...,non-battery
...,...,...
4662,An air-breathing polymer electrolyte membrane ...,battery
4663,A fuel cell stack needs to be stable and high-...,battery
4664,While the implications of information and comm...,non-battery
4665,The cycling performance of LiPF4(C2O4) electro...,battery


In [8]:
import tiktoken
from openai.embeddings_utils import get_embedding
import pickle
# Embedding configuration shared by the cells below.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
top_n = 1000  # not referenced by any other cell visible in this notebook


# Count tokens per prompt so over-long rows can be filtered out before embedding.
encoding = tiktoken.get_encoding(embedding_encoding)
test["n_tokens"] = test["prompt"].apply(
    lambda prompt_text: len(encoding.encode(prompt_text))
)


In [22]:
import os

import openai

# Prefer a key supplied through the OPENAI_API_KEY environment variable so a real
# secret never has to be pasted into (and saved with) the notebook. The original
# placeholder remains the fallback, so behaviour is unchanged when the variable
# is not set.
your_key = os.environ.get("OPENAI_API_KEY", 'YOUR KEY')
openai.api_key = your_key

In [None]:
# List fine-tunes via the OpenAI CLI.
# NOTE(review): presumably requires the API key to be available to the CLI
# (e.g. through the environment) — confirm.
!openai api fine_tunes.list

In [11]:
# Keep only prompts within the token budget. `.copy()` makes the filtered frame
# independent of the original, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
test = test[test.n_tokens <= max_tokens].copy()
# get_embedding is invoked once per remaining row, so this cell can be slow.
test["embedding"] = test.prompt.apply(lambda x: get_embedding(x, engine=embedding_model))


In [12]:

# Persist the frame (including the embedding column) so it can be reloaded later
# without recomputing the embeddings.
with open('test_with_ada_embedding.pkl', 'wb') as pkl_file:
    pickle.dump(test, pkl_file)

In [16]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix

# Default embedding model for evaluate_embeddings_approach.
EMBEDDING_MODEL = "text-embedding-ada-002"


# Convert every stored embedding to a NumPy array.
test["embedding"] = test["embedding"].apply(np.array)
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import PrecisionRecallDisplay

def evaluate_embeddings_approach(
    labels = ('battery', 'non-battery'),
    model = EMBEDDING_MODEL,
    df = None,
    class_names = ('battery', 'non-battery'),
):
    """Zero-shot classify rows by comparing their embeddings with two label embeddings.

    Each row's score is ``cosine_similarity(row, labels[1]) -
    cosine_similarity(row, labels[0])``. A score above zero predicts
    ``class_names[1]``, anything else predicts ``class_names[0]``. The confusion
    matrix and classification report against the ``completion`` column are printed.

    Parameters
    ----------
    labels : sequence of str
        Label descriptions to embed; ``labels[0]`` describes the first class and
        ``labels[1]`` the second. Every entry is embedded, but only the first two
        are used for scoring.
    model : str
        Embedding engine name passed to ``get_embedding``.
    df : pandas.DataFrame, optional
        Frame with ``embedding`` and ``completion`` columns. Defaults to the
        notebook-level ``test`` frame, which is what the function always used before.
    class_names : sequence of str
        The two values emitted as predictions; they should match the values found
        in ``completion``. Defaults reproduce the previously hard-coded strings.

    Returns
    -------
    (probas, preds) : tuple of pandas.Series
        ``probas`` holds the raw similarity differences (not calibrated
        probabilities); ``preds`` holds the predicted class names.
    """
    # Resolve the global lazily so the default still tracks later reassignments of `test`.
    frame = test if df is None else df

    label_embeddings = [get_embedding(label, engine=model) for label in labels]
    first_embedding, second_embedding = label_embeddings[0], label_embeddings[1]

    def label_score(review_embedding):
        # Positive when the row is closer to labels[1] than to labels[0].
        return (
            cosine_similarity(review_embedding, second_embedding)
            - cosine_similarity(review_embedding, first_embedding)
        )

    probas = frame["embedding"].apply(label_score)
    preds = probas.apply(lambda score: class_names[1] if score > 0 else class_names[0])
    print(confusion_matrix(frame.completion, preds))
    report = classification_report(frame.completion, preds, digits=3)

    print(report)

    return probas, preds
    
    

# Baseline: use the raw class names themselves as the label descriptions.
evaluate_embeddings_approach(
    labels=['battery', 'non-battery'],
    model=EMBEDDING_MODEL,
)

[[2948    0]
 [1718    1]]
              precision    recall  f1-score   support

     battery      0.632     1.000     0.774      2948
 non-battery      1.000     0.001     0.001      1719

    accuracy                          0.632      4667
   macro avg      0.816     0.500     0.388      4667
weighted avg      0.767     0.632     0.490      4667



(0      -0.038372
 1      -0.056095
 2      -0.037259
 3      -0.043983
 4      -0.031283
           ...   
 4662   -0.031761
 4663   -0.059414
 4664   -0.058902
 4665   -0.045896
 4666   -0.021231
 Name: embedding, Length: 4667, dtype: float64,
 0       battery
 1       battery
 2       battery
 3       battery
 4       battery
          ...   
 4662    battery
 4663    battery
 4664    battery
 4665    battery
 4666    battery
 Name: embedding, Length: 4667, dtype: object)

In [5]:
import os
# Only fall back to the placeholder when no key is configured, so a real key
# already present in the environment is not overwritten with "YOUR KEY".
os.environ.setdefault("OPENAI_API_KEY", "YOUR KEY")


In [None]:
# Start a fine-tune from train.jsonl with test.jsonl supplied via `-v`; make sure
# both files exist before running this cell.
# The positive-class value is a placeholder and must be replaced with a real class name.
# NOTE(review): the placeholder ends with a space inside the quotes — confirm whether
# the real class name is meant to carry one.
!openai api fine_tunes.create -t "train.jsonl" -v "test.jsonl" --compute_classification_metrics --classification_positive_class "NAME OF POSITIVE CLASS "

In [142]:
# Variant: descriptive phrases instead of the bare class names.
evaluate_embeddings_approach(
    labels=['battery materials', 'diverse domains'],
)

[[2886   62]
 [ 531 1188]]
              precision    recall  f1-score   support

     battery      0.845     0.979     0.907      2948
 non-battery      0.950     0.691     0.800      1719

    accuracy                          0.873      4667
   macro avg      0.898     0.835     0.854      4667
weighted avg      0.884     0.873     0.868      4667



In [21]:
# Variant: sentence-like descriptions for both classes; keep the outputs this time.
prob, pred = evaluate_embeddings_approach(
    labels=[
        'papers related to battery materials',
        'papers on application in medical and psychological research',
    ]
)

# Collect the raw scores and predicted classes side by side.
out = pd.DataFrame(dict(prob=prob, pred=pred))

[[2871   77]
 [ 264 1455]]
              precision    recall  f1-score   support

     battery      0.916     0.974     0.944      2948
 non-battery      0.950     0.846     0.895      1719

    accuracy                          0.927      4667
   macro avg      0.933     0.910     0.920      4667
weighted avg      0.928     0.927     0.926      4667



In [14]:
# Variant: more specific battery description, shorter second-class description.
evaluate_embeddings_approach(
    labels=[
        'papers related to battery energy materials',
        'medical and psychological research',
    ]
)

[[2916   32]
 [ 297 1422]]
              precision    recall  f1-score   support

     battery      0.908     0.989     0.947      2948
 non-battery      0.978     0.827     0.896      1719

    accuracy                          0.930      4667
   macro avg      0.943     0.908     0.921      4667
weighted avg      0.934     0.930     0.928      4667



In [143]:
# Variant: short battery description against the medical/psychological description.
evaluate_embeddings_approach(
    labels=['battery materials', 'medical and psychological research'],
)

[[2899   49]
 [ 373 1346]]
              precision    recall  f1-score   support

     battery      0.886     0.983     0.932      2948
 non-battery      0.965     0.783     0.864      1719

    accuracy                          0.910      4667
   macro avg      0.925     0.883     0.898      4667
weighted avg      0.915     0.910     0.907      4667

