# Generate synthetic data for training

In [1]:
from mistralai import Field, Mistral
import os
from prompts import get_prompt_for_theme
import asyncio
from tqdm import tqdm
import random
import nest_asyncio
from pydantic import BaseModel
from typing import List
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

MISTRAL_API_KEY=os.getenv("MISTRAL_API_KEY")
client = Mistral(api_key=MISTRAL_API_KEY)

class Questions(BaseModel):
    questions: List[str] = Field(..., description="Une liste de questions à propos d'un thème donné")


async def _generate_queries(theme):
    await asyncio.sleep(random.uniform(0, 10))

    prompt = get_prompt_for_theme(theme)

    response = await client.chat.parse_async(
        model="mistral-large-latest",
        messages=[
            {"role": "user", "content": prompt}
        ],
        response_format=Questions
    )
    return {"questions":response.choices[0].message.parsed.questions, "theme": theme}

async def main():
    themes = ["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]
    all_tasks = []

    for theme in themes:
        print(f"Setting up tasks for theme: {theme}")
        all_tasks.extend([_generate_queries(theme) for _ in range(20)])
    
    total_tasks = len(all_tasks)
    print(f"Processing {total_tasks} tasks in batches of 20")
    
    results = []
    batch_size = 20

    for i in range(0, total_tasks, batch_size):
        batch_tasks = all_tasks[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1} ({len(batch_tasks)} tasks)")
        
        try:
            batch_results = await tqdm_asyncio.gather(*batch_tasks)
            results.extend(batch_results)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            continue



    return results

q_theme_df = asyncio.run(main())

q_theme_df

Setting up tasks for theme: other
Processing 20 tasks in batches of 20
Processing batch 1 (20 tasks)


100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


[{'questions': ['Quelles sont les meilleures pratiques pour la rotation des cultures ?',
   "Comment optimiser l'irrigation pour économiser l'eau ?",
   "Quels sont les avantages et les inconvénients de l'agriculture biologique ?",
   'Comment gérer les parasites et les maladies de manière écologique ?',
   'Quelles sont les techniques de conservation des sols les plus efficaces ?',
   'Comment améliorer la fertilité du sol naturellement ?',
   "Quels sont les impacts du changement climatique sur l'agriculture locale ?",
   "Comment intégrer l'agroforesterie dans une exploitation agricole ?",
   'Quelles sont les méthodes de compostage les plus efficaces pour les déchets agricoles ?',
   'Comment utiliser les drones pour surveiller les cultures ?'],
  'theme': 'other'},
 {'questions': ['Quelles sont les meilleures pratiques pour la gestion des sols en agriculture biologique ?',
   "Comment optimiser l'irrigation pour réduire la consommation d'eau ?",
   "Quels sont les avantages et les

# Evaluate base model

In [None]:
import pandas as pd

question_theme_pairs_df = pd.read_csv("question_theme_pairs.csv")
print(f"Number of lines before removing duplicates : {len(question_theme_pairs_df)}")


question_theme_pairs_df = question_theme_pairs_df.drop_duplicates(subset=['question', 'theme'])

print(f"Number of lines : {len(question_theme_pairs_df)}")
print(question_theme_pairs_df['theme'].value_counts())

question_theme_pairs_df

Number of lines before removing duplicates : 3194
Number of lines : 2254
theme
market_question       1118
policy_help            382
disease_diagnosis      378
weather_management     206
other                  170
Name: count, dtype: int64


Unnamed: 0,question,theme
0,Quelles sont les meilleures pratiques pour la ...,other
1,Comment optimiser l'irrigation pour économiser...,other
2,Quels sont les avantages et les inconvénients ...,other
3,Comment gérer les parasites et les maladies de...,other
4,Quelles sont les techniques de conservation de...,other
...,...,...
3188,Dois-je protéger mes vignes contre le vent for...,weather_management
3189,Les conditions météorologiques sont-elles appr...,weather_management
3190,Y aura-t-il des orages cette semaine qui pourr...,weather_management
3192,Dois-je récolter mes pommes de terre avant les...,weather_management


<h3> We have too much market_question compared to the other classes, let's drop some </h3>

In [24]:
market_question_indices = question_theme_pairs_df[question_theme_pairs_df['theme'] == 'market_question'].index
indices_to_drop = market_question_indices.to_series().sample(n=600).index

question_theme_pairs_df = question_theme_pairs_df.drop(indices_to_drop).reset_index(drop=True)

print(f"Dropped {len(indices_to_drop)} 'market_question' questions.")
print(question_theme_pairs_df['theme'].value_counts())

Dropped 600 'market_question' questions.
theme
market_question       518
policy_help           382
disease_diagnosis     378
weather_management    206
other                 170
Name: count, dtype: int64


In [25]:
from typing import Literal
from pydantic import BaseModel

class Theme(BaseModel):
    theme: Literal["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]    

In [26]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    question_theme_pairs_df,
    test_size=0.2,
    stratify=question_theme_pairs_df['theme']
)

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print("Train class distribution:\n", train_df['theme'].value_counts())
print("Test class distribution:\n", test_df['theme'].value_counts())

Train set size: 1323
Test set size: 331
Train class distribution:
 theme
market_question       414
policy_help           306
disease_diagnosis     302
weather_management    165
other                 136
Name: count, dtype: int64
Test class distribution:
 theme
market_question       104
policy_help            76
disease_diagnosis      76
weather_management     41
other                  34
Name: count, dtype: int64


# Evaluation of ministral-3b

In [None]:
from tqdm import tqdm
import time

def predict(question, model):    
    prompt = (
        "Classify the following question into one of the following themes: "
        "'market_question', policy_help, disease_diagnosis, weather_management, other.\n\n"
        f"Question: {question}\n\nTheme:"
    )
    response = client.chat.parse(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format=Theme,
    )
    return response.choices[0].message.parsed.theme

def evaluate_model(test_data_df, model):
    y_true = test_data_df["theme"].tolist()
    y_pred = []
    for i, q in enumerate(tqdm(test_data_df["question"], desc="Evaluating")):
        if i > 0 and i % 5 == 0: # Sleep every 5 iterations to avoid rate limiting
            time.sleep(2)
        y_pred.append(predict(q, model))

    themes = ["market_question", "policy_help", "disease_diagnosis", "weather_management", "other"]
    metrics = {}
    for theme in themes:
        tp = sum((yt == theme) and (yp == theme) for yt, yp in zip(y_true, y_pred))
        fp = sum((yt != theme) and (yp == theme) for yt, yp in zip(y_true, y_pred))
        fn = sum((yt == theme) and (yp != theme) for yt, yp in zip(y_true, y_pred))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        metrics[theme] = {"precision": precision, "recall": recall, "f1": f1}
    macro_avg = {
        k: sum(metrics[t][k] for t in themes) / len(themes)
        for k in ["precision", "recall", "f1"]
    }
    return {"per_class": metrics, "macro_avg": macro_avg}

results = evaluate_model(test_df, model="ministral-3b-latest")

print("Model evaluation results:")
print(f"Macro Average: Precision: {results['macro_avg']['precision']:.3f}, Recall: {results['macro_avg']['recall']:.3f}, F1: {results['macro_avg']['f1']:.3f}")
print("\nPer-class metrics:")
for theme, m in results['per_class'].items():
    print(f"{theme}: F1 = {m['f1']:.3f}")


Evaluating:   0%|          | 0/331 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 331/331 [04:13<00:00,  1.30it/s]

Model evaluation results:
Macro Average: Precision: 0.801, Recall: 0.821, F1: 0.809

Per-class metrics:
market_question: F1 = 0.802
policy_help: F1 = 0.818
disease_diagnosis: F1 = 0.955
weather_management: F1 = 0.920
other: F1 = 0.551



