# Generate synthetic data for training

In [1]:
from mistralai import Field, Mistral
import os
from prompts import get_prompt_for_theme
import asyncio
from tqdm import tqdm
import random
import nest_asyncio
from pydantic import BaseModel
from typing import List
from tqdm.asyncio import tqdm_asyncio

# Patch asyncio via nest_asyncio; presumably needed so asyncio.run() can be
# used from within the notebook's own event loop (confirm for your kernel).
nest_asyncio.apply()

# The API key comes from the environment, so it is never written in the notebook.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
client = Mistral(api_key=MISTRAL_API_KEY)

class Questions(BaseModel):
    # Response schema passed as `response_format` when generating questions.
    # (Kept as a comment, not a docstring, so the generated schema is unchanged.)
    questions: List[str] = Field(
        ...,
        description="Une liste de questions à propos d'un thème donné",
    )


async def _generate_queries(theme):
    """Request one list of generated questions for ``theme``.

    Args:
        theme: Theme identifier forwarded to ``get_prompt_for_theme``.

    Returns:
        dict: ``{"questions": <parsed list of questions>, "theme": theme}``.
    """
    # Wait a random 0-10 s first; presumably to spread concurrent calls out.
    jitter_seconds = random.uniform(0, 10)
    await asyncio.sleep(jitter_seconds)

    user_message = {"role": "user", "content": get_prompt_for_theme(theme)}
    completion = await client.chat.parse_async(
        model="mistral-large-latest",
        messages=[user_message],
        response_format=Questions,
    )

    parsed_questions = completion.choices[0].message.parsed.questions
    return {"questions": parsed_questions, "theme": theme}

async def main():
    """Generate synthetic questions for every theme, in batches of concurrent requests.

    Each theme is requested ``requests_per_theme`` times through
    ``_generate_queries``; requests are awaited ``batch_size`` at a time.

    Returns:
        list[dict]: One ``{"questions": [...], "theme": ...}`` dict per request
        that succeeded. A request that raises is reported and skipped, so a
        single failure does not discard the other results of its batch.
    """
    themes = ["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]
    requests_per_theme = 20
    batch_size = 20

    async def _generate_or_none(theme):
        # Catch errors per request: if an exception escaped into gather(), the
        # whole batch would fail and every successful result in it would be lost.
        try:
            return await _generate_queries(theme)
        except Exception as e:
            print(f"Error generating queries for theme '{theme}': {e}")
            return None

    # Only the theme of each pending request is stored; coroutines are created
    # per batch, right before they are awaited.
    pending_themes = []
    for theme in themes:
        print(f"Setting up tasks for theme: {theme}")
        pending_themes.extend([theme] * requests_per_theme)

    total_tasks = len(pending_themes)
    print(f"Processing {total_tasks} tasks in batches of {batch_size}")

    results = []
    for i in range(0, total_tasks, batch_size):
        batch_themes = pending_themes[i:i + batch_size]
        batch_number = i // batch_size + 1
        print(f"Processing batch {batch_number} ({len(batch_themes)} tasks)")

        batch_results = await tqdm_asyncio.gather(
            *(_generate_or_none(theme) for theme in batch_themes)
        )

        succeeded = [result for result in batch_results if result is not None]
        results.extend(succeeded)

        failed_count = len(batch_results) - len(succeeded)
        if failed_count:
            print(f"Batch {batch_number}: {failed_count} request(s) failed and were skipped")

    return results

# Run the generation pipeline to completion.
# NOTE: despite the `_df` suffix, main() returns a plain list of
# {"questions": [...], "theme": ...} dicts, not a DataFrame.
q_theme_df = asyncio.run(main())

# Display the generated results as the cell output.
q_theme_df

Setting up tasks for theme: other
Processing 20 tasks in batches of 20
Processing batch 1 (20 tasks)


100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


[{'questions': ['Quelles sont les meilleures pratiques pour la rotation des cultures ?',
   "Comment optimiser l'irrigation pour économiser l'eau ?",
   "Quels sont les avantages et les inconvénients de l'agriculture biologique ?",
   'Comment gérer les parasites et les maladies de manière écologique ?',
   'Quelles sont les techniques de conservation des sols les plus efficaces ?',
   'Comment améliorer la fertilité du sol naturellement ?',
   "Quels sont les impacts du changement climatique sur l'agriculture locale ?",
   "Comment intégrer l'agroforesterie dans une exploitation agricole ?",
   'Quelles sont les méthodes de compostage les plus efficaces pour les déchets agricoles ?',
   'Comment utiliser les drones pour surveiller les cultures ?'],
  'theme': 'other'},
 {'questions': ['Quelles sont les meilleures pratiques pour la gestion des sols en agriculture biologique ?',
   "Comment optimiser l'irrigation pour réduire la consommation d'eau ?",
   "Quels sont les avantages et les

# Evaluate base model

In [2]:
import pandas as pd

# Load the (question, theme) pairs from the CSV file.
question_theme_pairs_df = pd.read_csv("question_theme_pairs.csv")
print(f"Number of lines before removing duplicates : {len(question_theme_pairs_df)}")


# Keep a single row per distinct (question, theme) combination.
question_theme_pairs_df = question_theme_pairs_df.drop_duplicates(subset=['question', 'theme'])

# Report the remaining size and the class balance after de-duplication.
print(f"Number of lines : {len(question_theme_pairs_df)}")
print(question_theme_pairs_df['theme'].value_counts())

# Display the de-duplicated frame as the cell output.
question_theme_pairs_df

Number of lines before removing duplicates : 3194
Number of lines : 2254
theme
market_question       1118
policy_help            382
disease_diagnosis      378
weather_management     206
other                  170
Name: count, dtype: int64


Unnamed: 0,question,theme
0,Quelles sont les meilleures pratiques pour la ...,other
1,Comment optimiser l'irrigation pour économiser...,other
2,Quels sont les avantages et les inconvénients ...,other
3,Comment gérer les parasites et les maladies de...,other
4,Quelles sont les techniques de conservation de...,other
...,...,...
3188,Dois-je protéger mes vignes contre le vent for...,weather_management
3189,Les conditions météorologiques sont-elles appr...,weather_management
3190,Y aura-t-il des orages cette semaine qui pourr...,weather_management
3192,Dois-je récolter mes pommes de terre avant les...,weather_management


<h3>We have too many market_question examples compared to the other classes, so let's drop some.</h3>

In [3]:
# Downsample the over-represented 'market_question' class.
# The amount to drop is derived from a target class size (1118 - 600 = 518 on
# the original data) instead of a fixed "drop 600", so re-running this cell is
# a no-op rather than dropping 600 more rows (or raising once fewer remain).
MARKET_QUESTION_TARGET = 518
# Fixed seed so the same rows are dropped on every run.
DOWNSAMPLE_SEED = 42

market_question_indices = question_theme_pairs_df[question_theme_pairs_df['theme'] == 'market_question'].index
n_to_drop = max(len(market_question_indices) - MARKET_QUESTION_TARGET, 0)
indices_to_drop = market_question_indices.to_series().sample(n=n_to_drop, random_state=DOWNSAMPLE_SEED).index

question_theme_pairs_df = question_theme_pairs_df.drop(indices_to_drop).reset_index(drop=True)

print(f"Dropped {len(indices_to_drop)} 'market_question' questions.")
print(question_theme_pairs_df['theme'].value_counts())

Dropped 600 'market_question' questions.
theme
market_question       518
policy_help           382
disease_diagnosis     378
weather_management    206
other                 170
Name: count, dtype: int64


In [1]:
from typing import Literal
from pydantic import BaseModel

class Theme(BaseModel):
    # Response schema for classification: exactly one of the five theme labels.
    # (Kept as a comment, not a docstring, so the generated schema is unchanged.)
    theme: Literal[
        "market_question",
        "policy_help",
        "disease_diagnosis",
        "weather_management",
        "other",
    ]

In [48]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it the split differs on every run, so the exported
# train/test files and every metric computed on test_df are not reproducible.
SPLIT_SEED = 42

# Stratified 90/10 split so both sets keep the same theme proportions.
train_df, test_df = train_test_split(
    question_theme_pairs_df,
    test_size=0.1,
    stratify=question_theme_pairs_df['theme'],
    random_state=SPLIT_SEED,
)

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print("Train class distribution:\n", train_df['theme'].value_counts())
print("Test class distribution:\n", test_df['theme'].value_counts())

Train set size: 1488
Test set size: 166
Train class distribution:
 theme
market_question       466
policy_help           344
disease_diagnosis     340
weather_management    185
other                 153
Name: count, dtype: int64
Test class distribution:
 theme
market_question       52
disease_diagnosis     38
policy_help           38
weather_management    21
other                 17
Name: count, dtype: int64


# Evaluation of ministral-3b

In [None]:
from tqdm import tqdm
import time

def predict(question, model):
    """Classify ``question`` into a theme by prompting a chat model.

    Args:
        question: The question text to classify.
        model: Name of the chat model to query.

    Returns:
        The ``theme`` field of the parsed structured response.
    """
    # NOTE(review): only the first theme is quoted in the prompt; the text is
    # left exactly as-is so the evaluated prompt does not change.
    instructions = (
        "Classify the following question into one of the following themes: "
        "'market_question', policy_help, disease_diagnosis, weather_management, other.\n\n"
    )
    user_message = {
        "role": "user",
        "content": instructions + f"Question: {question}\n\nTheme:",
    }

    completion = client.chat.parse(
        model=model,
        messages=[user_message],
        response_format=Theme,
    )
    first_choice = completion.choices[0]
    return first_choice.message.parsed.theme

def _per_class_metrics(y_true, y_pred, themes):
    """Compute one-vs-rest precision, recall and F1 for each theme in ``themes``."""
    pairs = list(zip(y_true, y_pred))
    metrics = {}
    for theme in themes:
        tp = sum(1 for yt, yp in pairs if yt == theme and yp == theme)
        fp = sum(1 for yt, yp in pairs if yt != theme and yp == theme)
        fn = sum(1 for yt, yp in pairs if yt == theme and yp != theme)
        # Each ratio falls back to 0 when its denominator is empty.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        metrics[theme] = {"precision": precision, "recall": recall, "f1": f1}
    return metrics


def _macro_average(metrics, themes):
    """Average each metric over ``themes`` with equal weight per theme."""
    return {
        k: sum(metrics[t][k] for t in themes) / len(themes)
        for k in ["precision", "recall", "f1"]
    }


def evaluate_model(test_data_df, model):
    """Run ``predict`` on every test question and score the predictions.

    Args:
        test_data_df: Table with a "question" column (inputs) and a "theme"
            column (expected labels).
        model: Model name forwarded to ``predict``.

    Returns:
        dict: ``{"per_class": {theme: {"precision", "recall", "f1"}},
        "macro_avg": {"precision", "recall", "f1"}}``.
    """
    y_true = test_data_df["theme"].tolist()
    y_pred = []
    for i, q in enumerate(tqdm(test_data_df["question"], desc="Evaluating")):
        if i > 0 and i % 5 == 0:  # Sleep every 5 iterations to avoid rate limiting
            time.sleep(2)
        y_pred.append(predict(q, model))

    themes = ["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]
    # Scoring is kept in pure helpers, separate from the API loop above.
    metrics = _per_class_metrics(y_true, y_pred, themes)
    macro_avg = _macro_average(metrics, themes)
    return {"per_class": metrics, "macro_avg": macro_avg}

# Score the base (not fine-tuned) model on the held-out test set.
results = evaluate_model(test_df, model="ministral-3b-latest")

# Report the macro averages first, then the F1 score of each theme.
print("Model evaluation results:")
print(f"Macro Average: Precision: {results['macro_avg']['precision']:.3f}, Recall: {results['macro_avg']['recall']:.3f}, F1: {results['macro_avg']['f1']:.3f}")
print("\nPer-class metrics:")
for theme, m in results['per_class'].items():
    print(f"{theme}: F1 = {m['f1']:.3f}")


Evaluating: 100%|██████████| 166/166 [02:37<00:00,  1.05it/s]

Model evaluation results:
Macro Average: Precision: 0.743, Recall: 0.752, F1: 0.745

Per-class metrics:
market_question: F1 = 0.784
policy_help: F1 = 0.783
disease_diagnosis: F1 = 0.914
weather_management: F1 = 0.909
other: F1 = 0.333





# Finetuning

In [51]:
from tqdm import tqdm
import json

def dataset_to_jsonl(dataset, output_file):
    """Write ``dataset`` to ``output_file`` as JSON Lines, one record per row.

    Each line has the form ``{"text": <question>, "labels": {"intent": <theme>}}``,
    built from the row's "question" and "theme" columns.

    Args:
        dataset: DataFrame with "question" and "theme" columns.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, "w") as jsonl_file:
        rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
        for _, record in rows:
            payload = {
                "text": record["question"],
                "labels": {"intent": record["theme"]},
            }
            # One JSON document per line.
            jsonl_file.write(json.dumps(payload))
            jsonl_file.write("\n")

# Export each split to its own JSON Lines file (train first, then test).
for split_df, jsonl_path in ((train_df, "train.jsonl"), (test_df, "test.jsonl")):
    dataset_to_jsonl(split_df, jsonl_path)

100%|██████████| 1488/1488 [00:00<00:00, 47643.99it/s]
100%|██████████| 166/166 [00:00<00:00, 49049.28it/s]


In [52]:
# Upload the training data.
# The file is opened in a `with` block so the handle is closed once the upload
# returns (it was previously left open for the life of the kernel).
with open("train.jsonl", "rb") as train_file:
    training_data = client.files.upload(
        file={
            "file_name": "train.jsonl",
            "content": train_file,
        }
    )

# Upload the validation data.
# NOTE(review): this uploads test.jsonl, i.e. the same split that is later used
# for the final evaluation; confirm that reusing it for validation is intended.
with open("test.jsonl", "rb") as validation_file:
    validation_data = client.files.upload(
        file={
            "file_name": "validation.jsonl",
            "content": validation_file,
        }
    )

In [None]:
# Create a classifier fine-tuning job on the uploaded files.
# The Weights & Biases key is read from the environment.
wandb_key = os.environ.get("WANDB_KEY")

wandb_integration = {
    "project": "intent-classifier",
    "api_key": wandb_key,
}
job_hyperparameters = {"training_steps": 50, "learning_rate": 0.0001}
weighted_training_files = [{"file_id": training_data.id, "weight": 1}]

created_job = client.fine_tuning.jobs.create(
    model="ministral-3b-latest",
    job_type="classifier",
    training_files=weighted_training_files,
    validation_files=[validation_data.id],
    suffix="agro-intent-clf",
    hyperparameters=job_hyperparameters,
    auto_start=True,
    integrations=[wandb_integration],
)

# Show the job as returned by the API.
created_job_details = created_job.model_dump()
print(json.dumps(created_job_details, indent=4))

{
    "id": "a0b2cfa8-0719-44e6-961a-d5c7c12fbc28",
    "auto_start": true,
    "model": "ministral-3b-latest",
    "status": "QUEUED",
    "created_at": 1745298997,
    "modified_at": 1745298997,
    "training_files": [
        "ac308888-4430-499b-b327-5bac94743e9a"
    ],
    "hyperparameters": {
        "training_steps": 50,
        "learning_rate": 0.0001,
        "weight_decay": 0.1,
        "warmup_fraction": 0.05,
        "epochs": null,
        "seq_len": 16384
    },
    "validation_files": [
        "16bda9a9-ca7a-44fb-80c3-a523cbd677b1"
    ],
    "object": "job",
    "fine_tuned_model": null,
    "suffix": "agro-intent-clf",
    "integrations": [
        {
            "project": "intent-classifier",
            "type": "wandb",
            "name": null,
            "run_name": null,
            "url": null
        }
    ],
    "trained_tokens": null,
    "metadata": {
        "expected_duration_seconds": null,
        "cost": 0.0,
        "cost_currency": null,
        "tra

In [58]:
# Retrieve the job details, waiting until the job has finished.
# A single fetch right after creation returns a QUEUED job whose
# `fine_tuned_model` is still None, which breaks every later cell on "Run All".
import time

TERMINAL_JOB_STATUSES = {"SUCCESS", "FAILED", "FAILED_VALIDATION", "CANCELLED"}
JOB_POLL_INTERVAL_SECONDS = 10
JOB_MAX_WAIT_SECONDS = 60 * 60  # stop polling after an hour instead of looping forever

job_wait_deadline = time.monotonic() + JOB_MAX_WAIT_SECONDS
retrieved_job = client.fine_tuning.jobs.get(job_id=created_job.id)
while retrieved_job.status not in TERMINAL_JOB_STATUSES and time.monotonic() < job_wait_deadline:
    time.sleep(JOB_POLL_INTERVAL_SECONDS)
    retrieved_job = client.fine_tuning.jobs.get(job_id=created_job.id)

print(json.dumps(retrieved_job.model_dump(), indent=4))

# Fail here with a clear message rather than later with `model=None`.
if retrieved_job.status != "SUCCESS":
    raise RuntimeError(
        f"Fine-tuning job {created_job.id} did not succeed (status: {retrieved_job.status})"
    )

{
    "id": "a0b2cfa8-0719-44e6-961a-d5c7c12fbc28",
    "auto_start": true,
    "model": "ministral-3b-latest",
    "status": "SUCCESS",
    "created_at": 1745298997,
    "modified_at": 1745299106,
    "training_files": [
        "ac308888-4430-499b-b327-5bac94743e9a"
    ],
    "hyperparameters": {
        "training_steps": 50,
        "learning_rate": 0.0001,
        "weight_decay": 0.1,
        "warmup_fraction": 0.05,
        "epochs": 56.95810881279333,
        "seq_len": 16384
    },
    "classifier_targets": [
        {
            "name": "intent",
            "labels": [
                "weather_management",
                "disease_diagnosis",
                "market_question",
                "other",
                "policy_help"
            ],
            "weight": 1.0,
            "loss_function": "single_class"
        }
    ],
    "validation_files": [
        "16bda9a9-ca7a-44fb-80c3-a523cbd677b1"
    ],
    "object": "job",
    "fine_tuned_model": "ft:classifier:minis

In [62]:
# Load the test samples (one JSON document per line)
with open("test.jsonl", "r") as samples_file:
    test_samples = [json.loads(line) for line in samples_file]

# Classify a single test sample: index 1, i.e. the second line of the file
sample_text = test_samples[1]["text"]
classifier_response = client.classifiers.classify(
    model=retrieved_job.fine_tuned_model,
    inputs=[sample_text],
)

print("Text:", sample_text)
response_details = classifier_response.model_dump()
print("Classifier Response:", json.dumps(response_details, indent=4))


Text: Quelles sont les obligations de rotation des cultures pour les exploitations recevant des aides publiques ?
Classifier Response: {
    "id": "a4506bd4d7cd4bc2b24d16376cd89989",
    "model": "ft:classifier:ministral-3b-latest:82f3f89c:20250422:agro-intent-clf:a0b2cfa8",
    "results": [
        {
            "intent": {
                "scores": {
                    "weather_management": 4.1807697925833054e-07,
                    "disease_diagnosis": 9.930199240670845e-08,
                    "market_question": 8.699067984707654e-05,
                    "other": 1.2704710570687894e-05,
                    "policy_help": 0.9998997449874878
                }
            }
        }
    ]
}


In [74]:
from tqdm import tqdm
import time

def predict(question):
    """Classify ``question`` with the fine-tuned classifier.

    Args:
        question: The question text to classify.

    Returns:
        The label with the highest score under the "intent" target.
    """
    classification = client.classifiers.classify(
        model=retrieved_job.fine_tuned_model,
        inputs=[question],
    )
    # A single input was sent, so only the first result is read.
    intent_scores = classification.results[0]['intent'].scores
    return max(intent_scores, key=lambda label: intent_scores[label])

def evaluate_model(test_data_df):
    """Run ``predict`` on every test question and score the predictions.

    Args:
        test_data_df: Table with a "question" column (inputs) and a "theme"
            column (expected labels).

    Returns:
        dict: ``{"per_class": {theme: {"precision", "recall", "f1"}},
        "macro_avg": {"precision", "recall", "f1"}}``.
    """
    expected = test_data_df["theme"].tolist()
    predicted = []
    progress = tqdm(test_data_df["question"], desc="Evaluating")
    for position, question in enumerate(progress):
        # Pause 2 s before every fifth request (never the first) to avoid rate limiting.
        if position and not position % 5:
            time.sleep(2)
        predicted.append(predict(question))

    labels = ["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]
    per_class = {}
    for label in labels:
        # One pass over the pairs: count hits, false alarms and misses for this label.
        hits = false_alarms = misses = 0
        for truth, guess in zip(expected, predicted):
            if guess == label:
                if truth == label:
                    hits += 1
                else:
                    false_alarms += 1
            elif truth == label:
                misses += 1

        # Each ratio falls back to 0 when its denominator is empty.
        if hits + false_alarms > 0:
            precision = hits / (hits + false_alarms)
        else:
            precision = 0
        if hits + misses > 0:
            recall = hits / (hits + misses)
        else:
            recall = 0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0
        per_class[label] = {"precision": precision, "recall": recall, "f1": f1}

    # Unweighted mean of each metric over the five labels.
    macro = {}
    for metric_name in ["precision", "recall", "f1"]:
        macro[metric_name] = sum(per_class[label][metric_name] for label in labels) / len(labels)

    return {"per_class": per_class, "macro_avg": macro}

# Score the fine-tuned classifier on the same held-out test set.
results = evaluate_model(test_df)

# Report the macro averages first, then the F1 score of each theme.
print("Model evaluation results:")
print(f"Macro Average: Precision: {results['macro_avg']['precision']:.3f}, Recall: {results['macro_avg']['recall']:.3f}, F1: {results['macro_avg']['f1']:.3f}")
print("\nPer-class metrics:")
for theme, m in results['per_class'].items():
    print(f"{theme}: F1 = {m['f1']:.3f}")


Evaluating: 100%|██████████| 166/166 [02:03<00:00,  1.34it/s]

Model evaluation results:
Macro Average: Precision: 0.974, Recall: 0.961, F1: 0.967

Per-class metrics:
market_question: F1 = 0.971
policy_help: F1 = 0.987
disease_diagnosis: F1 = 1.000
weather_management: F1 = 1.000
other: F1 = 0.875



