In [None]:
# Extract the cleaned per-table CSV files into /content/pesticide_tables/ (-o overwrites files from a previous run).
!unzip -o pesticide_tables.zip -d /content/pesticide_tables/


Archive:  pesticide_tables.zip
  inflating: /content/pesticide_tables/table_1_cleaned.csv  
  inflating: /content/pesticide_tables/table_34_cleaned.csv  
  inflating: /content/pesticide_tables/table_101_cleaned.csv  
  inflating: /content/pesticide_tables/table_48_cleaned.csv  
  inflating: /content/pesticide_tables/table_50_cleaned.csv  
  inflating: /content/pesticide_tables/table_111_cleaned.csv  
  inflating: /content/pesticide_tables/table_74_cleaned.csv  
  inflating: /content/pesticide_tables/table_58_cleaned.csv  
  inflating: /content/pesticide_tables/table_9_cleaned.csv  
  inflating: /content/pesticide_tables/table_12_cleaned.csv  
  inflating: /content/pesticide_tables/table_120_cleaned.csv  
  inflating: /content/pesticide_tables/table_30_cleaned.csv  
  inflating: /content/pesticide_tables/table_113_cleaned.csv  
  inflating: /content/pesticide_tables/table_119_cleaned.csv  
  inflating: /content/pesticide_tables/table_2_cleaned.csv  
  inflating: /content/pesticide_table

## Dataset Description

We use a cleaned CSV file (`final_cleaned_pesticide_data.csv`) containing the following columns:

- **USAGE**: Disease name or use case for the pesticide
- **SUBSTANCE ACTIVE**: The active chemical substance in the pesticide
- **CONC.**: Concentration or dosage information

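Rows with a missing value in any of these three columns are dropped before matching, so every candidate entry has a usage description, an active substance, and a concentration.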


In [None]:
import os

import pandas as pd

# Build one consolidated pesticide table from the per-table CSV exports and save it.
# NOTE(review): a later cell in this notebook repeats this pipeline with extra
# whitespace normalisation and writes the same output file; consider keeping only one.

# Folder holding the cleaned per-table CSV files (relative to the working directory).
output_folder = 'pesticide_tables'  # Make sure this folder contains the cleaned data
pesticide_tables = []

# Sorted so the concatenation order (and therefore the saved CSV) is identical on
# every run; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(output_folder)):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(output_folder, filename))

    # Strip header whitespace per file, before concatenating, so that headers that
    # differ only by padding line up in a single column instead of producing
    # separate columns that later collapse into duplicate names.
    df.columns = df.columns.str.strip()

    # Exclude all-NA columns and empty tables before concatenation. The saved output
    # of this cell shows a warning raised at the pd.concat line (presumably pandas'
    # FutureWarning about empty or all-NA entries - confirm).
    df = df.dropna(axis=1, how='all')
    if df.empty:
        continue
    pesticide_tables.append(df)

if not pesticide_tables:
    raise ValueError(f"No non-empty CSV files found in '{output_folder}'")

# Concatenate all pesticide tables into one dataframe
pesticide_data = pd.concat(pesticide_tables, ignore_index=True)

# Drop rows where BOTH 'SUBSTANCE ACTIVE' and 'CONC.' are missing (how='all');
# rows that are missing only one of the two are kept.
pesticide_data = pesticide_data.dropna(subset=['SUBSTANCE ACTIVE', 'CONC.'], how='all')

# Drop columns with more than 90% missing values.
# The missing fraction is compared directly instead of deriving an integer `thresh`
# from a float product, which truncated (e.g. 99.999... -> 99) and degenerated to
# thresh=0 (keep everything) for small frames.
threshold = 0.9
missing_fraction = pesticide_data.isna().mean()
pesticide_data = pesticide_data.loc[:, missing_fraction <= threshold]

# Show the cleaned data after further cleaning
print(pesticide_data.head())

# Save the final cleaned data
pesticide_data.to_csv("final_cleaned_pesticide_data.csv", index=False)
print("Final cleaned pesticide data saved!")


     N°               SUBSTANCE ACTIVE            CONC. T.F.  \
0  23.0  Azoxystrobine +Difenoconazole  200 g/l+125 g/l   SC   
1  24.0    Azoxystrobine +Difeconazole  125 g/l+125 g/l   SC   
2  25.0  Azoxystrobine +difenoconazole  200 g/l+125 g/l   SC   
3  26.0      Azoxystrobin + Flutriafol        25%+12.5%   SC   
4  27.0      Azoxystrobin + Flutriafol        25%+12.5%   SC   

               P.COMM     N°.H.                  SOCIETE  \
0         AMISCORE SC  F.014-22                 PROTAGRI   
1  PRIORI GOLD 250 SC   F.40-21                  SOLAGRI   
2    Black panther SC  F.009-22                FERTITECH   
3      CURAX EXTRA SC  F.009-20  SHARDA CROPCHEM TUNISIE   
4    CURATEX EXTRA SC   F.63-21     AGRIMED DISTRIBUTION   

                     FABRICANT  \
0         NANJING AGROCHEMICAL   
1     SYNGENTA Crop Protection   
2  HANGZHOU TOPAGRO BIOSCIENCE   
3              SHARDA CROPCHEM   
4              SHARDA CROPCHEM   

                                               US

  pesticide_data = pd.concat(pesticide_tables, ignore_index=True)


In [None]:
import os

import pandas as pd

# Build one consolidated pesticide table from the per-table CSV exports, normalise
# whitespace in the main text columns, and save the result.

# Folder holding the cleaned per-table CSV files (relative to the working directory).
output_folder = 'pesticide_tables'  # Make sure this folder contains the cleaned data
pesticide_tables = []

# Sorted so the concatenation order (and therefore the saved CSV) is identical on
# every run; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(output_folder)):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(output_folder, filename))

    # Strip header whitespace per file, before concatenating, so that headers that
    # differ only by padding line up in a single column instead of producing
    # separate columns that later collapse into duplicate names.
    df.columns = df.columns.str.strip()

    # Exclude all-NA columns and empty tables before concatenation. The saved output
    # of this cell shows a warning raised at the pd.concat line (presumably pandas'
    # FutureWarning about empty or all-NA entries - confirm).
    df = df.dropna(axis=1, how='all')
    if df.empty:
        continue
    pesticide_tables.append(df)

if not pesticide_tables:
    raise ValueError(f"No non-empty CSV files found in '{output_folder}'")

# Concatenate all pesticide tables into one dataframe
pesticide_data = pd.concat(pesticide_tables, ignore_index=True)

# Collapse runs of whitespace inside values and trim both ends.
columns_to_clean = ['SUBSTANCE ACTIVE', 'SOCIETE', 'FABRICANT', 'USAGE']  # Add other relevant columns if necessary
for column in columns_to_clean:
    if column not in pesticide_data.columns:
        # Report instead of failing with a KeyError when a table set lacks the column.
        print(f"Column '{column}' not found; skipping whitespace cleanup for it.")
        continue
    pesticide_data[column] = (
        pesticide_data[column]
        .str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces within text
        .str.strip()  # Strip leading/trailing spaces
    )

# Drop rows where BOTH 'SUBSTANCE ACTIVE' and 'CONC.' are missing (how='all');
# rows that are missing only one of the two are kept.
pesticide_data = pesticide_data.dropna(subset=['SUBSTANCE ACTIVE', 'CONC.'], how='all')

# Drop columns with more than 90% missing values.
# The missing fraction is compared directly instead of deriving an integer `thresh`
# from a float product, which truncated (e.g. 99.999... -> 99) and degenerated to
# thresh=0 (keep everything) for small frames.
threshold = 0.9
missing_fraction = pesticide_data.isna().mean()
pesticide_data = pesticide_data.loc[:, missing_fraction <= threshold]

# Show the cleaned data after further cleaning
print(pesticide_data.head())

# Save the final cleaned data
pesticide_data.to_csv("final_cleaned_pesticide_data.csv", index=False)
print("✅ Final cleaned pesticide data saved!")


     N°               SUBSTANCE ACTIVE            CONC. T.F.  \
0  23.0  Azoxystrobine +Difenoconazole  200 g/l+125 g/l   SC   
1  24.0    Azoxystrobine +Difeconazole  125 g/l+125 g/l   SC   
2  25.0  Azoxystrobine +difenoconazole  200 g/l+125 g/l   SC   
3  26.0      Azoxystrobin + Flutriafol        25%+12.5%   SC   
4  27.0      Azoxystrobin + Flutriafol        25%+12.5%   SC   

               P.COMM     N°.H.                  SOCIETE  \
0         AMISCORE SC  F.014-22                 PROTAGRI   
1  PRIORI GOLD 250 SC   F.40-21                  SOLAGRI   
2    Black panther SC  F.009-22                FERTITECH   
3      CURAX EXTRA SC  F.009-20  SHARDA CROPCHEM TUNISIE   
4    CURATEX EXTRA SC   F.63-21     AGRIMED DISTRIBUTION   

                     FABRICANT  \
0         NANJING AGROCHEMICAL   
1     SYNGENTA Crop Protection   
2  HANGZHOU TOPAGRO BIOSCIENCE   
3              SHARDA CROPCHEM   
4              SHARDA CROPCHEM   

                                               US

  pesticide_data = pd.concat(pesticide_tables, ignore_index=True)


# 🌿 NLP-Based Pesticide Recommendation System
Using Sentence Transformers and Semantic Similarity for Disease-Pesticide Matching


## 📚 Introduction

This project uses **Natural Language Processing (NLP)** to recommend pesticides based on plant disease names.
Instead of relying on exact string matching, we use **sentence transformers** to understand the semantic meaning of disease descriptions.

The model compares the input query with disease entries in a dataset to find the most relevant pesticide.

This approach:
- Handles synonyms and spelling variations
- Generalizes better to unseen disease descriptions
- Provides scalable and intelligent recommendations


##  Models Used

We test several pre-trained models from the **`sentence-transformers`** library:

- `all-MiniLM-L6-v2`
- `all-mpnet-base-v2`
- `paraphrase-MiniLM-L6-v2`

These models encode sentences into dense vector embeddings that capture meaning beyond word-level similarity.


##  Why NLP?

Traditional keyword matching fails to understand:
- Synonyms (e.g., "blight" vs. "leaf spot")
- Typos or variants
- Paraphrased disease descriptions

Using **transformers**, we capture semantic relationships by comparing vector representations using **cosine similarity**.


## Pesticide Image Retrieval

To enhance the result presentation, we use the **Google Custom Search API** to fetch images for the recommended pesticide.

We bias the search query with words like "pesticide", "product", "bottle", and "label" to retrieve relevant product images.


##  Cosine Similarity

We use **`util.pytorch_cos_sim`** from `sentence-transformers` to measure similarity between:
- The input disease (query)
- Each disease in the dataset

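The **cosine similarity score** ranges from -1 to 1:
- 1 = the two embeddings point in the same direction (identical or near-identical meaning)
- 0 = the embeddings are orthogonal (no measurable semantic relationship)
- Negative values = the embeddings point in opposing directions
- Higher score = better semantic match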


## Model Evaluation

For each model, we:
1. Encode the input disease
2. Compare it to all disease descriptions in the dataset
3. Select the highest scoring match
4. Display the recommended pesticide, dosage, and image

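This lets us compare the models by the similarity score of their best match. Note that each model has its own embedding space, so a higher score from one model does not by itself mean a better recommendation; always check the matched `USAGE` entry as well.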


In [None]:
import os

import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer, util

# Load data
file_path = "final_cleaned_pesticide_data.csv"
pesticide_data = pd.read_csv(file_path)
pesticide_data.columns = pesticide_data.columns.str.strip()

# Essential columns
disease_column = "USAGE"
pesticide_column = "SUBSTANCE ACTIVE"
dosage_column = "CONC."

# Clean missing values
pesticide_data = pesticide_data.dropna(subset=[disease_column, pesticide_column, dosage_column])
disease_texts = pesticide_data[disease_column].astype(str).tolist()
pesticide_texts = pesticide_data[pesticide_column].astype(str).tolist()

if not disease_texts:
    raise ValueError(f"No usable rows in '{file_path}' after dropping missing values")

# Google Custom Search endpoint; credentials are read from the environment so that
# no secret is stored in the notebook.
# SECURITY: an API key was previously embedded in this cell; that key should be
# revoked and replaced, since it has been saved with the notebook.
GOOGLE_SEARCH_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
NO_IMAGE = "No image available."


# Fetch image
def fetch_pesticide_image(pesticide_name):
    """Return the URL of the first image search hit for a pesticide.

    The query is the pesticide name followed by "pesticide product bottle label".
    Credentials come from the GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables.

    Args:
        pesticide_name: Name of the pesticide / active substance to search for.

    Returns:
        The link of the first result, or "No image available." when credentials
        are missing, the request fails, or the response contains no items.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key or not search_engine_id:
        print("Error fetching image: set GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables")
        return NO_IMAGE

    # Passing the query through `params` URL-encodes it, so names containing
    # '+' or spaces (e.g. "Azoxystrobine +Difenoconazole") are sent intact.
    params = {
        "q": f"{pesticide_name} pesticide product bottle label",
        "searchType": "image",
        "key": api_key,
        "cx": search_engine_id,
    }
    try:
        # Timeout so a stalled connection cannot hang the whole notebook run.
        response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10)
        data = response.json()
        items = data.get("items")
        if items:
            return items[0]["link"]
    except Exception as e:
        # Only the exception type is printed: request error messages can include
        # the full URL, which contains the API key.
        print(f"Error fetching image: {type(e).__name__}")
    return NO_IMAGE


# Compare different models
models_to_test = {
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2"
}

input_disease = "Leaf Blight"
results = []

for model_name, model_path in models_to_test.items():
    print(f"Using model: {model_name}")
    model = SentenceTransformer(model_path)

    input_embedding = model.encode(input_disease, convert_to_tensor=True)
    disease_embeddings = model.encode(disease_texts, convert_to_tensor=True)

    # One row of scores: the query against every USAGE entry.
    similarity_scores = util.pytorch_cos_sim(input_embedding, disease_embeddings)
    best_match_idx = torch.argmax(similarity_scores).item()
    best_score = similarity_scores[0][best_match_idx].item()

    # Positional lookup (.iloc): disease_texts was built from this same filtered
    # frame, so list position and row position agree.
    best_row = pesticide_data.iloc[best_match_idx]
    recommended_pesticide = best_row[pesticide_column]
    dosage = best_row[dosage_column]
    image_url = fetch_pesticide_image(recommended_pesticide)

    results.append({
        "Model": model_name,
        "Recommended Pesticide": recommended_pesticide,
        "Dosage": dosage,
        "Similarity Score": round(best_score, 4),
        "Image URL": image_url
    })

# Display results
results_df = pd.DataFrame(results)
print("\nModel Comparison Table:")
print(results_df[["Model", "Recommended Pesticide", "Dosage", "Similarity Score"]])

🔍 Using model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔍 Using model: all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔍 Using model: paraphrase-MiniLM-L6-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


📊 Model Comparison Table:
                     Model Recommended Pesticide      Dosage  Similarity Score
0         all-MiniLM-L6-v2             Bentazone      480g/l            0.3680
1        all-mpnet-base-v2     Fluazifop-P-Butyl  1 2 5g / l            0.4152
2  paraphrase-MiniLM-L6-v2          Chlortoluron  5 0 0g / L            0.4715


##  Results Summary

We present the recommendation results for each model as a table that includes:
- Model name
- Recommended pesticide
- Dosage
- Similarity score


In [None]:
# Report the single highest-scoring recommendation across all compared models.
# On ties, max() keeps the first entry encountered in `results`.
best_result = max(results, key=lambda entry: entry["Similarity Score"])

summary_lines = [
    f"\n Best Model: {best_result['Model']} with score {best_result['Similarity Score']}",
    f"Recommended Pesticide: {best_result['Recommended Pesticide']}",
    f"Dosage: {best_result['Dosage']}",
    f"Image URL: {best_result['Image URL']}",
]
# A single print of the newline-joined lines writes the same text as four separate prints.
print("\n".join(summary_lines))


🏆 Best Model: paraphrase-MiniLM-L6-v2 with score 0.4715
Recommended Pesticide: Chlortoluron
Dosage: 5 0 0g / L
Image URL: https://www.rayfull.net/uploads/202028235/chlorotoluron54210413246.jpg
