In [3]:
!pip install transformers datasets torch scikit-learn evaluate


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [5]:
import pandas as pd

# Location of the crop/fertilizer CSV (update the file name/path as needed).
DATASET_PATH = "/content/Crop and fertilizer dataset.csv"

# Load the raw table into a DataFrame.
df = pd.read_csv(DATASET_PATH)

# Print the leading rows as a quick look at the columns and values.
print(df.head())


  District_Name Soil_color  Nitrogen  Phosphorus  Potassium   pH  Rainfall  \
0      Kolhapur      Black        75          50        100  6.5      1000   
1      Kolhapur      Black        80          50        100  6.5      1000   
2      Kolhapur      Black        85          50        100  6.5      1000   
3      Kolhapur      Black        90          50        100  6.5      1000   
4      Kolhapur      Black        95          50        100  6.5      1000   

   Temperature       Crop Fertilizer                          Link  
0           20  Sugarcane       Urea  https://youtu.be/2t5Am0xLTOo  
1           20  Sugarcane       Urea  https://youtu.be/2t5Am0xLTOo  
2           20  Sugarcane       Urea  https://youtu.be/2t5Am0xLTOo  
3           20  Sugarcane       Urea  https://youtu.be/2t5Am0xLTOo  
4           20  Sugarcane       Urea  https://youtu.be/2t5Am0xLTOo  


In [6]:
def create_prompt(row):
    """Turn one dataset record into a natural-language fertilizer question.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            District_Name, Crop, Soil_color, Temperature, Rainfall,
            Nitrogen, Phosphorus, Potassium and pH.

    Returns:
        A single-sentence prompt string ending with the fertilizer question.
    """
    # Each fragment covers one aspect of the record; they are concatenated in order.
    crop_and_place = f"In district {row['District_Name']}, for crop {row['Crop']} grown in {row['Soil_color']} soil "
    climate = f"with a temperature of {row['Temperature']}°C and rainfall of {row['Rainfall']} mm, "
    nutrients = f"nitrogen level {row['Nitrogen']}, phosphorus level {row['Phosphorus']}, potassium level {row['Potassium']}, "
    question = f"and pH {row['pH']}, what fertilizer should I use?"
    return "".join((crop_and_place, climate, nutrients, question))

# Build the prompt for every record (row-wise apply) and store it as the 'prompt' column.
prompts = df.apply(create_prompt, axis=1)
df["prompt"] = prompts

# Show the generated prompts next to the fertilizer they should map to.
prompt_preview = df[["prompt", "Fertilizer"]].head()
print(prompt_preview)


                                              prompt Fertilizer
0  In district Kolhapur, for crop Sugarcane grown...       Urea
1  In district Kolhapur, for crop Sugarcane grown...       Urea
2  In district Kolhapur, for crop Sugarcane grown...       Urea
3  In district Kolhapur, for crop Sugarcane grown...       Urea
4  In district Kolhapur, for crop Sugarcane grown...       Urea


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode each fertilizer name as an integer class id (the classification target).
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Fertilizer"])

# View the mapping of fertilizers to labels.
# LabelEncoder gives every class the id equal to its position in `classes_`, so
# enumerate() yields the same ids as a second transform() call, but as plain
# Python ints (they print cleanly under NumPy 2 and are JSON-serializable).
fertilizer_mapping = {name: class_id for class_id, name in enumerate(label_encoder.classes_)}
print("Fertilizer mapping:", fertilizer_mapping)


Fertilizer mapping: {'10:10:10 NPK': 0, '10:26:26 NPK': 1, '12:32:16 NPK': 2, '13:32:26 NPK': 3, '18:46:00 NPK': 4, '19:19:19 NPK': 5, '20:20:20 NPK': 6, '50:26:26 NPK': 7, 'Ammonium Sulphate': 8, 'Chilated Micronutrient': 9, 'DAP': 10, 'Ferrous Sulphate': 11, 'Hydrated Lime': 12, 'MOP': 13, 'Magnesium Sulphate': 14, 'SSP': 15, 'Sulphur': 16, 'Urea': 17, 'White Potash': 18}


In [8]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Use only the prompt and label columns for our task
df_subset = df[["prompt", "label"]]

# Split the dataset (80% train / 20% test, fixed seed for a repeatable split)
train_df, test_df = train_test_split(df_subset, test_size=0.2, random_state=42)

# Create Hugging Face Dataset objects.
# After the shuffle the frames carry a non-default pandas index; without
# preserve_index=False it would be stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Combine them into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print("Training samples:", len(dataset_dict["train"]))
print("Testing samples:", len(dataset_dict["test"]))


Training samples: 3610
Testing samples: 903


In [9]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the prompts.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``prompt`` entries of ``examples`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["prompt"], **encoding_options)

# Run the tokenizer over both splits, processing the examples in batches.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Return only the model inputs and the target, formatted as PyTorch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
tokenized_datasets.set_format(type="torch", columns=tensor_columns)

# Print the first formatted training example as a sanity check.
first_train_example = tokenized_datasets["train"][0]
print(first_train_example)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map:   0%|          | 0/903 [00:00<?, ? examples/s]

{'label': tensor(11), 'input_ids': tensor([  101,  1999,  2212,  6369,  3669,  1010,  2005, 10416, 10722, 10867,
        22420,  4961,  1999,  2304,  5800,  2007,  1037,  4860,  1997,  2423,
         7737,  2278,  1998, 10101,  1997, 14883,  3461,  1010, 14114,  2504,
         2423,  1010, 25473,  2504,  2423,  1010, 18044,  2504,  2753,  1010,
         1998,  6887,  1020,  1012,  1014,  1010,  2054, 10768, 28228, 28863,
         2323,  1045,  2224,  1029,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Number of unique fertilizer classes
num_labels = len(label_encoder.classes_)

# Load DistilBERT model with a classification head
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# Training configuration.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# keyword, which newer transformers releases no longer accept
# (`eval_strategy` requires transformers >= 4.41).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Run evaluation at the end of every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increased from 5 to 10 epochs
    learning_rate=1e-5,   # Lower learning rate than before (was 2e-5)
    weight_decay=0.01,
    report_to="none"  # Disable logging integrations like wandb
)
print("Updated training arguments with more epochs and a lower learning rate.")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Updated training arguments with more epochs and a lower learning rate.




In [16]:
from transformers import Trainer

# Wire the model, the training configuration and the tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` supersedes the deprecated `tokenizer` argument of
    # Trainer.__init__ (available from transformers 4.46), which otherwise
    # raises a FutureWarning.
    processing_class=tokenizer
)

# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.399935
2,1.945700,1.1809
3,1.277100,1.104551
4,1.152500,1.073774
5,1.100000,1.049778
6,1.077300,1.058272
7,1.054300,1.051841
8,1.047800,1.031583
9,1.031300,1.029015
10,1.024200,1.024408


TrainOutput(global_step=4520, training_loss=1.1890994282950342, metrics={'train_runtime': 736.3221, 'train_samples_per_second': 49.027, 'train_steps_per_second': 6.139, 'total_flos': 1195880719180800.0, 'train_loss': 1.1890994282950342, 'epoch': 10.0})

In [17]:
# Install evaluate if not already installed
!pip install evaluate

import evaluate
import torch

# Load the accuracy metric
metric = evaluate.load("accuracy")

# Function to compute accuracy on a given dataset
def compute_accuracy(dataset):
    """Score the trainer's model on ``dataset`` with the loaded accuracy metric.

    Runs ``trainer.predict`` on the dataset, takes the arg-max over the last axis
    of the returned predictions as the predicted class, and compares those
    classes with the dataset's ``label`` column.

    Returns:
        The value stored under the metric result's ``"accuracy"`` key.
    """
    prediction_output = trainer.predict(dataset)
    predicted_classes = prediction_output.predictions.argmax(-1)
    scores = metric.compute(predictions=predicted_classes, references=dataset["label"])
    return scores["accuracy"]

# Score the model on the data it was trained on and on the held-out split.
train_split = tokenized_datasets["train"]
test_split = tokenized_datasets["test"]
train_accuracy = compute_accuracy(train_split)
test_accuracy = compute_accuracy(test_split)

# Report both accuracies as percentages with two decimals.
print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Testing Accuracy: {test_accuracy*100:.2f}%")




Training Accuracy: 48.98%
Testing Accuracy: 44.96%


In [19]:
import torch
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Move the model's weights to the chosen device (the model is displayed as the cell result).
model.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [20]:
import torch.nn.functional as F

def predict_fertilizer(prompt_text):
    """Predict the fertilizer for one natural-language prompt.

    Args:
        prompt_text: Prompt string, phrased like the prompts the model was trained on.

    Returns:
        Tuple ``(fertilizer, confidence)``: the predicted fertilizer name decoded
        through ``label_encoder`` and the softmax probability of that class.
    """
    # Put the model in evaluation mode so dropout is disabled; torch.no_grad()
    # alone does not change the train/eval mode, and with dropout active the
    # prediction and its confidence would vary between calls.
    model.eval()

    # Tokenize the new prompt and move it to the correct device
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Get model output without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Compute probabilities using softmax over the class dimension
    probabilities = F.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()

    # Convert numeric label back to fertilizer name
    fertilizer = label_encoder.inverse_transform([predicted_label])[0]
    confidence = probabilities[0, predicted_label].item()
    return fertilizer, confidence

# Example prompt (update as needed)
new_prompt = (
    "In district Central, for crop Banana grown in Loamy soil with a temperature of 27°C and rainfall of 180 mm, "
    "nitrogen level 32, phosphorus level 18, potassium level 16, and pH 6.7, what fertilizer should I use?"
)

# Classify the prompt, then unpack the fertilizer name and its probability.
prediction = predict_fertilizer(new_prompt)
predicted_fert, conf = prediction
print(f"Predicted Fertilizer: {predicted_fert} (Confidence: {conf:.2%})")


Predicted Fertilizer: Urea (Confidence: 52.56%)
