## Create dataset on Cohere

In [1]:
import json
import os
import sys

import cohere
import lancedb
import pandas as pd
from time import sleep
from cohere.finetuning import BaseModel, FinetunedModel, Settings, WandbConfig

# Add the week1_bootstrap_evals to path to import scoring_utils
sys.path.append(os.path.abspath("../week1_bootstrap_evals"))

from scoring_utils import EvalQuestion, score_reranked_search

# Upload the synthetic fine-tuning data to Cohere as a reranker fine-tune dataset.
training_path = "../week1_bootstrap_evals/synthetic_finetune_dataset.jsonl"

# NOTE(review): Client() is constructed without an explicit key, so it presumably
# picks the API key up from the environment (CO_API_KEY) — confirm before running.
co = cohere.Client()

# Open the file in a context manager so the handle is closed as soon as the
# upload request returns, instead of staying open for the life of the kernel.
with open(training_path, "rb") as training_file:
    dataset_creation_request = co.datasets.create(
        name="hardware-review-reranking-data",
        data=training_file,
        type="reranker-finetune-input",
    )

# Wait on the creation request, then keep the resulting dataset object;
# its `id` is what the fine-tuning job references.
dataset_resp = co.wait(dataset_creation_request)
dataset = dataset_resp.dataset

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


...
...
...
...
...
...


## Train The Model

In [2]:
# Launch a rerank fine-tuning job on the uploaded dataset and poll until it is ready.
# NOTE(review): re-running this cell submits a new fine-tuning job each time.

# Polling budget: MAX_PINGS status checks spaced POLL_INTERVAL_SECONDS apart
# (100 * 180s = 5 hours), with a progress line every REPORT_EVERY checks.
MAX_PINGS = 100
POLL_INTERVAL_SECONDS = 180
REPORT_EVERY = 5
# Statuses from which the job will not reach STATUS_READY on its own.
# NOTE(review): names taken from the Cohere fine-tuning status enum — confirm
# against the installed SDK version.
TERMINAL_FAILURE_STATUSES = {"STATUS_FAILED", "STATUS_DELETED"}

# Raises KeyError if WANDB_API_KEY is not set in the environment.
wandb_config = WandbConfig(
    project="hardware-review-reranker", api_key=os.environ["WANDB_API_KEY"]
)
finetune_request = co.finetuning.create_finetuned_model(
    request=FinetunedModel(
        name="hardware-review-reranker-900",
        settings=Settings(
            base_model=BaseModel(base_type="BASE_TYPE_RERANK"),
            dataset_id=dataset.id,
            wandb=wandb_config,
        ),
    )
)
model_id = finetune_request.finetuned_model.id

for pings in range(MAX_PINGS):
    response = co.finetuning.get_finetuned_model(model_id)
    status = response.finetuned_model.status
    if status == "STATUS_READY":
        break
    if status in TERMINAL_FAILURE_STATUSES:
        # Fail fast instead of sleeping through the rest of the polling budget.
        raise RuntimeError(
            f"Fine-tuning of model {model_id} ended with status {status}"
        )
    if pings % REPORT_EVERY == 0:
        print(f"{pings} pings. Current status: {status}")
    sleep(POLL_INTERVAL_SECONDS)
else:
    # The loop ran out of pings without ever seeing STATUS_READY; stop here so
    # later cells do not call a model that is not available yet.
    raise TimeoutError(
        f"Model {model_id} was not ready after {MAX_PINGS} status checks "
        f"({MAX_PINGS * POLL_INTERVAL_SECONDS} seconds)"
    )

# The identifier passed as `model=` when reranking is the fine-tuned model id
# with a "-ft" suffix.
model_callable_id = f"{model_id}-ft"
model_callable_id

0 pings. Current status: STATUS_QUEUED
5 pings. Current status: STATUS_FINETUNING
10 pings. Current status: STATUS_FINETUNING
15 pings. Current status: STATUS_FINETUNING


'49f7309d-4822-4c74-9882-d219fb1f6bd0-ft'

## Test The Model

In [3]:
# Load the synthetic evaluation questions and wrap each record in an EvalQuestion.
with open("../week1_bootstrap_evals/synthetic_eval_dataset.json", "r") as f:
    synthetic_questions = json.load(f)
eval_questions = [EvalQuestion(**fields) for fields in synthetic_questions]

# Open the reviews table from the week 1 LanceDB directory.
db = lancedb.connect("../week1_bootstrap_evals/lancedb")
reviews_table = db.open_table("reviews")

# Score reranked search with the fine-tuned model at each retrieval depth.
k_to_retrieve = [5, 10]
reranked_scores = score_reranked_search(
    eval_questions,
    reviews_table,
    k_to_retrieve,
    model=model_callable_id,
)

# One row per retrieval depth: precision, recall, and the depth itself.
score_rows = [
    {
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "n_retrieved": depth,
    }
    for depth, metrics in reranked_scores.items()
]
reranked_scores_df = pd.DataFrame(score_rows)

reranked_scores_df

Unnamed: 0,precision,recall,n_retrieved
0,0.129111,0.645556,5
1,0.099222,0.992222,10
