# Mistral Fine-tuning API

Check out the docs: https://docs.mistral.ai/capabilities/finetuning/

In [None]:
# Uncomment the last line of this cell to install the packages this notebook imports (mistralai, pandas).
# NOTE(review): later cells import `mistralai.client.MistralClient`; confirm the installed
# mistralai version still provides that module.
#!pip install mistralai pandas

## Prepare the dataset

In this example, we use a generated news-correction dataset (`./data/generated_news_correction.jsonl`). We load the data into a pandas DataFrame, split it into training and validation sets, and save both sets in the JSONL format required for fine-tuning.

In [1]:
import pandas as pd

# Read the JSON Lines source file (one record per line) into a DataFrame.
df = pd.read_json("./data/generated_news_correction.jsonl", lines=True)

# Sample 99.5% of the rows for training with a fixed seed so the split is
# repeatable; every row that was not sampled becomes evaluation data.
df_train = df.sample(frac=0.995, random_state=200)
df_eval = df.drop(index=df_train.index)

# Write each split back out as JSON Lines, one record per line.
for split_frame, split_path in (
    (df_train, "news_chunk_train.jsonl"),
    (df_eval, "news_chunk_eval.jsonl"),
):
    split_frame.to_json(split_path, orient="records", lines=True)

In [2]:
# List the working directory (long format, human-readable sizes); the two
# JSONL files written by the previous cell should appear in the listing.
!ls -lh

total 294824
-rw-r--r--   1 pierrebittner  staff   3,5K 21 jui 16:49 Step 2 - synthetize news.py
-rw-r--r--   1 pierrebittner  staff   1,8K 21 jui 17:00 Step 3 - critique.py
-rw-r--r--   1 pierrebittner  staff   6,4K 21 jui 17:30 Step 4 - specific-style.py
-rw-r--r--   1 pierrebittner  staff   6,5K 19 jui 16:19 Untitled.ipynb
drwxr-xr-x   5 pierrebittner  staff   160B 21 jui 17:06 [34m__pycache__[m[m
-rw-r--r--   1 pierrebittner  staff   1,1K 21 jui 15:53 concatenate.py
drwxr-xr-x  16 pierrebittner  staff   512B 21 jui 17:40 [34mdata[m[m
-rw-r--r--@  1 pierrebittner  staff    39K 21 jui 16:19 mistral_finetune_api.ipynb
-rw-r--r--   1 pierrebittner  staff    45K 21 jui 16:12 mistral_finetune_api_news.ipynb
-rw-r--r--   1 pierrebittner  staff    12K 21 jui 17:40 news_chunk_eval.jsonl
-rw-r--r--   1 pierrebittner  staff   2,5M 21 jui 17:40 news_chunk_train.jsonl
-rw-r--r--   1 pierrebittner  staff   447B 21 jui 11:29 prompts.py
-rw-r--r--   1 pierrebittner  staff   3,3K 19 jui 11:53

## Reformat dataset
If you upload news_chunk_train.jsonl to the Mistral API as is, you might encounter an “Invalid file format” error caused by data formatting issues. To fix this, download the reformat_data.py script and use it to validate and reformat both the training and evaluation data:

In [None]:
# Download reformat_data.py, the validation-and-reformat script, from the
# mistral-finetune GitHub repository into the working directory.
!wget https://raw.githubusercontent.com/mistralai/mistral-finetune/main/utils/reformat_data.py

In [3]:
# Validate and reformat the training data with the downloaded script.
!python reformat_data.py news_chunk_train.jsonl

In [4]:
# Validate and reformat the evaluation data with the downloaded script.
!python reformat_data.py news_chunk_eval.jsonl

In [6]:
# Show the "messages" value of the training row at position 104 as a sample.
df_train["messages"].iloc[104]

[{'role': 'user',
  'content': 'As a "News Article Stylist" following the Economist style guide,\nyour task is to refine and rewrite news articles to ensure they meet the high standards of clarity,\nprecision, and sophistication characteristic of the Economist.\nYou are now given a news article.\nRead the news article carefully and point out all stylistic issues of the given news article according\nto the Economist style guide. Do not rewrite the news article. \n\nCommunity Center Offers Free Workshops on Diversity and Inclusion\n\nDavid Lee\n\n2022-04-05\n\nA community center is offering a series of free workshops on diversity, inclusion, and social justice to educate and empower residents on important social issues. The workshops cover topics such as unconscious bias, privilege, allyship, and cultural competence, providing participants with tools and resources to create more inclusive and equitable communities. The initiative has received positive feedback from attendees who apprecia

## Upload dataset

In [7]:
import os
from mistralai.client import MistralClient

# The API key comes from the environment so it never appears in the notebook.
api_key = os.environ.get("MISTRAL_API_KEY")
client = MistralClient(api_key=api_key)


def _upload_jsonl(filename):
    """Upload a local file via ``client.files.create`` and return the result.

    The file is opened in binary mode and sent under its own file name.
    """
    with open(filename, "rb") as handle:
        return client.files.create(file=(filename, handle))


news_chunk_train = _upload_jsonl("news_chunk_train.jsonl")
news_chunk_eval = _upload_jsonl("news_chunk_eval.jsonl")

In [8]:
import json


def pprint(obj):
    """Print ``obj.dict()`` serialized as JSON with a four-space indent.

    Args:
        obj: Any object exposing a ``dict()`` method whose result is
            JSON-serializable.
    """
    payload = obj.dict()
    rendered = json.dumps(payload, indent=4)
    print(rendered)

In [9]:
# Display the object returned by the upload of the training file.
pprint(news_chunk_train)

{
    "id": "f6c21839-2087-4853-be42-792c2a7b3359",
    "object": "file",
    "bytes": 2669675,
    "created_at": 1718984527,
    "filename": "news_chunk_train.jsonl",
    "purpose": "fine-tune"
}


In [10]:
# Display the object returned by the upload of the evaluation file.
pprint(news_chunk_eval)

{
    "id": "5174de44-8a5b-4197-9264-a6dffa1739f0",
    "object": "file",
    "bytes": 12343,
    "created_at": 1718984527,
    "filename": "news_chunk_eval.jsonl",
    "purpose": "fine-tune"
}


## Create a fine-tuning job

In [11]:
from mistralai.models.jobs import TrainingParameters

# Training settings for this job: 10 steps at a learning rate of 1e-4.
hyperparameters = TrainingParameters(training_steps=10, learning_rate=0.0001)

# Create the fine-tuning job from the two uploaded files, referenced by id.
created_jobs = client.jobs.create(
    model="mistral-small-latest",  # alternative: "open-mistral-7b"
    training_files=[news_chunk_train.id],
    validation_files=[news_chunk_eval.id],
    hyperparameters=hyperparameters,
)

In [12]:
# Display the job object returned by the creation request.
pprint(created_jobs)

{
    "id": "d3a4936b-4f52-498a-a5bd-5580b52a19a0",
    "hyperparameters": {
        "training_steps": 10,
        "learning_rate": 0.0001
    },
    "fine_tuned_model": null,
    "model": "mistral-small-latest",
    "status": "QUEUED",
    "job_type": "FT",
    "created_at": 1718984557,
    "modified_at": 1718984557,
    "training_files": [
        "f6c21839-2087-4853-be42-792c2a7b3359"
    ],
    "validation_files": [
        "5174de44-8a5b-4197-9264-a6dffa1739f0"
    ],
    "object": "job",
    "integrations": []
}


In [13]:
import time

# Seconds to wait between two status checks.
POLL_INTERVAL_SECONDS = 10

# Poll the job until it leaves the RUNNING/QUEUED states. The wait happens
# *before* each refresh, so the loop exits as soon as a terminal status is
# seen instead of announcing (and performing) one more pointless sleep, and
# the job is not fetched twice in a row on entry.
retrieved_job = client.jobs.retrieve(created_jobs.id)
while retrieved_job.status in ("RUNNING", "QUEUED"):
    pprint(retrieved_job)
    print(f"Job is {retrieved_job.status}, waiting {POLL_INTERVAL_SECONDS} seconds")
    time.sleep(POLL_INTERVAL_SECONDS)
    retrieved_job = client.jobs.retrieve(created_jobs.id)

# Report the state the job ended in.
pprint(retrieved_job)
print(f"Job finished with status {retrieved_job.status}")



{
    "id": "d3a4936b-4f52-498a-a5bd-5580b52a19a0",
    "hyperparameters": {
        "training_steps": 10,
        "learning_rate": 0.0001
    },
    "fine_tuned_model": null,
    "model": "mistral-small-latest",
    "status": "RUNNING",
    "job_type": "FT",
    "created_at": 1718984557,
    "modified_at": 1718984557,
    "training_files": [
        "f6c21839-2087-4853-be42-792c2a7b3359"
    ],
    "validation_files": [
        "5174de44-8a5b-4197-9264-a6dffa1739f0"
    ],
    "object": "job",
    "integrations": [],
    "events": [
        {
            "name": "status-updated",
            "data": {
                "status": "RUNNING"
            },
            "created_at": 1718984557
        },
        {
            "name": "status-updated",
            "data": {
                "status": "QUEUED"
            },
            "created_at": 1718984557
        }
    ],
    "checkpoints": [],
    "estimated_start_time": null
}
Job is RUNNING, waiting 10 seconds
{
    "id": "d3a4936b-4f

In [None]:
# List jobs through the client and display the result.
jobs = client.jobs.list()
pprint(jobs)

In [None]:
# Retrieve a job by id; `retrieved_jobs.fine_tuned_model` is read by the chat cell below.
retrieved_jobs = client.jobs.retrieve(created_jobs.id)
pprint(retrieved_jobs)


## Use a fine-tuned model

In [None]:
from mistralai.models.chat_completion import ChatMessage

# Adjacent string literals are concatenated by Python into a single prompt.
# NOTE(review): after concatenation there is no space between the second and
# third sentences ("...Climate'.Each news...") — confirm whether that is intended.
prompt = (
    "You are a 'News Writer'. "
    "Your task is to write a news of 500–1000 words on 'Environment and Climate'."
    "Each news should have a title, content, author, location and date."
)

# Query the fine-tuned model produced by the retrieved job. The original cell
# never closed the `ChatMessage(` call and therefore failed with a SyntaxError.
chat_response = client.chat(
    model=retrieved_jobs.fine_tuned_model,
    messages=[ChatMessage(role="user", content=prompt)],
)

In [None]:
# Display the response returned by the chat call.
pprint(chat_response)

## Integration with Weights and Biases
The fine-tuning API also supports integration with Weights & Biases (W&B) to monitor and track metrics and statistics for your fine-tuning jobs. To enable it, create a W&B account and add your W&B information in the “integrations” section of the job creation request:



In [None]:
import os

from mistralai.models.jobs import TrainingParameters, WandbIntegrationIn

# Read the W&B key from the environment, as is done for MISTRAL_API_KEY;
# "XXX" remains as the placeholder when the variable is not set.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "XXX")

# Same job creation as before, plus a W&B integration entry. The training and
# validation files are the ones uploaded earlier in this notebook
# (`news_chunk_train` / `news_chunk_eval`); the previous `ultrachat_chunk_*`
# names were never defined here and raised a NameError.
created_jobs = client.jobs.create(
    model="open-mistral-7b",
    training_files=[news_chunk_train.id],
    validation_files=[news_chunk_eval.id],
    hyperparameters=TrainingParameters(
        training_steps=100,
        learning_rate=0.0001,
    ),
    integrations=[
        WandbIntegrationIn(
            project="test_ft_api",
            run_name="test",
            api_key=WANDB_API_KEY,
        ).dict()
    ],
)