<a href="https://colab.research.google.com/github/Ftalxx/Harvest_Shield/blob/main/IPM_data_text_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
from google.colab import files
import pandas as pd

# **Functions to fetch the IPM source list and collect every valid report link to scrape**

In [None]:
def fetch_page(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` tree built from the response text using the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def scrape_main_page(main_page_url):
    """Return the source-report URLs linked from the IPM source-list page.

    Every ``<a>`` element carrying an ``href`` is inspected; only hrefs that
    begin with the absolute source-report prefix are kept, in page order.
    """
    report_prefix = 'https://ipmdata.ipmcenters.org/source_report.cfm?view=yes&sourceid='
    page = fetch_page(main_page_url)
    hrefs = (anchor['href'] for anchor in page.find_all('a', href=True))
    return [href for href in hrefs if href.startswith(report_prefix)]


# **Extract location and crop from information table in URL**

In [None]:
def extract_settings_and_region(soup):
    """Read the 'Settings' and 'Region' values from the page's tables.

    Each table row with at least two ``<td>`` cells is treated as a
    (header, value) pair. When a header appears more than once, the last
    occurrence wins.

    Returns:
        A ``(settings, region)`` tuple; either element is ``None`` when the
        corresponding header was not found.
    """
    found = {'Settings': None, 'Region': None}

    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            header = cells[0].get_text(strip=True)
            if header in found:
                found[header] = cells[1].get_text(strip=True)

    return found['Settings'], found['Region']

# **Clean pest name and lowercase to normalize**

In [None]:
def clean_pest_name(pest_name):
    """Normalize a pest name: lowercase it, drop every character that is not
    a lowercase letter, digit or whitespace, then trim the ends."""
    lowered = pest_name.lower()
    only_alnum_and_space = re.sub(r'[^a-z0-9\s]', '', lowered)
    return only_alnum_and_space.strip()

# **Extract pest | active ingredient | rating data if available**

In [None]:
def extract_pests(soup):
    """Collect pest / active-ingredient / rating rows from the page's data tables.

    Only tables selected with ``border='1'`` and ``width='100%'`` that have a
    ``<thead>`` containing all of the headers 'Pest', 'Active Ingredient' and
    'Rating', plus a ``<tbody>``, are read.

    Args:
        soup: Parsed page (a BeautifulSoup tree).

    Returns:
        A list of dicts with the keys 'Pest' (normalized through
        ``clean_pest_name``), 'Active Ingredient' and 'Rating'. The list is
        empty when no matching table exists.
    """
    required_headers = ('Pest', 'Active Ingredient', 'Rating')
    pests_data = []

    for table in soup.find_all('table', border='1', width='100%'):
        thead = table.find('thead')
        if not thead:
            continue

        header_texts = [th.get_text(strip=True) for th in thead.find_all('th')]
        if any(name not in header_texts for name in required_headers):
            continue

        tbody = table.find('tbody')
        if not tbody:
            continue

        pest_index = header_texts.index('Pest')
        active_ingredient_index = header_texts.index('Active Ingredient')
        rating_index = header_texts.index('Rating')
        # A row must be long enough for ALL three columns. Checking only the
        # pest column would raise IndexError on a shorter row and abort the
        # whole scrape.
        min_cells = max(pest_index, active_ingredient_index, rating_index) + 1

        for row in tbody.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < min_cells:
                continue
            pests_data.append({
                'Pest': clean_pest_name(cells[pest_index].get_text(strip=True)),
                'Active Ingredient': cells[active_ingredient_index].get_text(strip=True),
                'Rating': cells[rating_index].get_text(strip=True),
            })

    return pests_data

# **Combine into tabular format in CSV**

In [None]:
def write_combined_data_to_csv(data, filename):
    """Write an iterable of row dicts to ``filename`` as UTF-8 CSV.

    The columns are written in the fixed order Crop, Location, Pest,
    Active Ingredient, Rating, preceded by a header row.

    Note: a later cell in this notebook defines another function with the
    same name that takes a DataFrame and writes to Google Drive; once that
    cell has run, it replaces this definition.
    """
    columns = ['Crop', 'Location', 'Pest', 'Active Ingredient', 'Rating']
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(data)

Optional helper: count the rows in the CSV

In [None]:
def count_rows_in_csv(filename):
    """Load ``filename`` with pandas and print how many data rows it holds
    (the header line is not counted)."""
    row_total = pd.read_csv(filename).shape[0]
    print(f'The CSV file has {row_total} rows.')

# **Main and store to drive**

In [None]:
# Mount Google Drive at /content/drive; the scraper's CSV is written beneath
# this mount point by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def write_combined_data_to_csv(data, filename='combined_data.csv'):
    """Save ``data`` to ``filename`` inside the mounted Drive folder.

    ``data`` must provide a pandas-style ``to_csv`` method; the index is not
    written. This definition shadows the earlier list-of-dicts function of
    the same name.
    """
    drive_root = '/content/drive/My Drive/'
    data.to_csv(drive_root + filename, index=False)

In [None]:
def main():
    """Scrape every IPM source report and store the combined table on Drive.

    For each report link, the crop ('Settings') and location ('Region') are
    read once and attached to every pest row found on that page. The rows
    are then written to 'combined_data.csv' on Drive and counted.

    Note: a later cell defines another ``main`` (the interactive prompt),
    which replaces this one once it has run.
    """
    source_list_url = 'https://ipmdata.ipmcenters.org/source_list.cfm?sourcetypeid=4'

    all_data = []
    for link in scrape_main_page(source_list_url):
        soup = fetch_page(link)
        settings, region = extract_settings_and_region(soup)
        # Pages without a pest table yield an empty list and contribute nothing.
        all_data.extend(
            {**pest_info, 'Crop': settings, 'Location': region}
            for pest_info in extract_pests(soup)
        )

    df = pd.DataFrame(all_data)

    # Write the combined data to a CSV file
    write_combined_data_to_csv(df, 'combined_data.csv')

    count_rows_in_csv('/content/drive/My Drive/combined_data.csv')

    print("Combined data has been written to 'combined_data.csv'.")


if __name__ == '__main__':
    main()


The CSV file has 163518 rows.
Combined data has been written to 'combined_data.csv'.


Optional: download the CSV directly (if Google Drive is not available)

In [None]:
# The download snippet below is disabled by wrapping it in a string literal,
# so this cell only evaluates (and echoes) that string. To enable the
# download, remove the surrounding triple quotes.
''' from google.colab import files
files.download('combined_data.csv') '''

" from google.colab import files\nfiles.download('combined_data.csv') "

# **Model (Data Preparation) - Start from here if CSV is in drive**

In [None]:
# @title Kill function to restart runtime
import os
import sys

def restart_runtime():
    """Send signal 9 to the current process, terminating it immediately.

    NOTE(review): presumably Colab brings up a fresh runtime once the process
    dies — confirm. All in-memory state is lost when this runs.
    """
    os.kill(os.getpid(), 9)

# NOTE(review): this call is unconditional, so executing the notebook top to
# bottom stops here and the cells below must be run again by hand — confirm
# that this is intended.
restart_runtime()

In [None]:
!pip install torch

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive again for the modelling part, which reads
# combined_data.csv from '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
from torch.optim.lr_scheduler import ReduceLROnPlateau
from datasets import DatasetDict
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, classification_report

In [None]:

def load_csv_from_drive(filename):
    """Read ``filename`` from the mounted Drive folder into a DataFrame."""
    drive_root = '/content/drive/My Drive/'
    return pd.read_csv(drive_root + filename)

# Load the scraped table from Drive and show its column names as a sanity check.
combined_table = load_csv_from_drive('combined_data.csv')

print("Column names in the CSV file:")
print(combined_table.columns)


Column names in the CSV file:
Index(['Pest', 'Active Ingredient', 'Rating', 'Crop', 'Location'], dtype='object')


In [None]:
def create_text_entry(row):
    """Format one table row as a five-line "Key: value" text record.

    ``row`` is indexed by the column names 'Crop', 'Location', 'Pest',
    'Active Ingredient' and 'Rating'. Every line, including the last, ends
    with a newline.
    """
    crop = row['Crop']
    location = row['Location']
    pest = row['Pest']
    ingredient = row['Active Ingredient']
    rating = row['Rating']
    return (f"Crop: {crop}\n"
            f"Location: {location}\n"
            f"Pest/Disease/etc: {pest}\n"
            f"Solution: {ingredient}\n"
            f"Rating: {rating}\n")

combined_table = load_csv_from_drive('combined_data.csv')

# One formatted text record per table row.
text_entries = combined_table.apply(create_text_entry, axis=1)

# Save the text data to a file. Each record already ends with a newline, so
# the extra "\n" appended here leaves one blank line between records.
text_file_path = '/content/drive/My Drive/pest_info.txt'
with open(text_file_path, 'w') as f:
    f.writelines(entry + "\n" for entry in text_entries)


In [None]:
# Read the text file back and print its first 5 lines, i.e. the five
# "Key: value" fields of the first record (not five records).
# Note: `lines` ends up holding every line of the file.
with open(text_file_path, 'r') as f:
    lines = f.readlines()
    print(''.join(lines[:5]))

Crop: Pine Tree, Tree Nursery
Location: Southern
Pest/Disease/etc: armyworms
Solution: Permethrin
Rating: Excellent, 90-100%



# **Open text file and reorganize DataFrame**

In [None]:
# Layout of pest_info.txt: every record spans six physical lines — five
# "Key: value" lines followed by one blank separator line.
RECORD_STRIDE = 6
FIELD_PREFIXES = (
    ('Crop', 'Crop: '),
    ('Location', 'Location: '),
    ('Pest/Disease/etc', 'Pest/Disease/etc: '),
    ('Solution', 'Solution: '),
    ('Rating', 'Rating: '),
)


def _field_value(line, prefix):
    """Return ``line`` without its leading ``prefix`` and surrounding whitespace."""
    # Only a LEADING prefix is removed, so a value that happens to contain
    # the same text further along is left intact.
    if line.startswith(prefix):
        line = line[len(prefix):]
    return line.strip()


with open('/content/drive/My Drive/pest_info.txt', 'r') as file:
    lines = file.readlines()

# Process lines into a list of dictionaries. The range stops early enough
# that all five field lines of a record exist, so a truncated record at the
# end of the file is skipped instead of raising IndexError.
data = []
for i in range(0, len(lines) - len(FIELD_PREFIXES) + 1, RECORD_STRIDE):
    data.append({
        column: _field_value(lines[i + offset], prefix)
        for offset, (column, prefix) in enumerate(FIELD_PREFIXES)
    })

df = pd.DataFrame(data)

# Drop records with no usable Solution or Rating. Missing CSV cells were
# written to the text file as the literal string "nan".
for column in ('Solution', 'Rating'):
    stripped = df[column].str.strip()
    df = df[df[column].notna() & ~stripped.isin(['', 'nan'])]

# Ratings that count as an effective treatment (duplicate entry removed).
desired_ratings = [
    'Good to Excellent, 80-100%',
    'Excellent, 90-100%',
    'Good, 80-90%',
    'Fair to Good, 50-90%',
    'Fair to Excellent, 50-100%',
    'Excellent, more research needed',
    'highly effective for control of indicated disease',
    'effective for control of indicated disease,',
    'very good'
]

# .copy() yields an independent frame, so adding the 'text' column below does
# not assign into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df['Rating'].isin(desired_ratings)].copy()

# Model input text. 'Solution' is the classification target and is therefore
# NOT part of the text: including it lets the model read the answer from its
# own input. 'Rating' is left out as well, so the text has the same three
# lines that predict_from_model builds at inference time.
df['text'] = (
    'Crop: ' + df['Crop']
    + '\nLocation: ' + df['Location']
    + '\nPest/Disease/etc: ' + df['Pest/Disease/etc']
)

print(df.head())
print(df.shape)

                      Crop  Location  Pest/Disease/etc    Solution  \
0  Pine Tree, Tree Nursery  Southern         armyworms  Permethrin   
1  Pine Tree, Tree Nursery  Southern  conifer sawflies    Acephate   
2  Pine Tree, Tree Nursery  Southern  conifer sawflies  Bifenthrin   
3  Pine Tree, Tree Nursery  Southern  conifer sawflies    Carbaryl   
4  Pine Tree, Tree Nursery  Southern  conifer sawflies  Cyfluthrin   

               Rating                                               text  
0  Excellent, 90-100%  Crop: Pine Tree, Tree Nursery\nLocation: South...  
1        Good, 80-90%  Crop: Pine Tree, Tree Nursery\nLocation: South...  
2        Good, 80-90%  Crop: Pine Tree, Tree Nursery\nLocation: South...  
3        Good, 80-90%  Crop: Pine Tree, Tree Nursery\nLocation: South...  
4        Good, 80-90%  Crop: Pine Tree, Tree Nursery\nLocation: South...  
(51988, 6)


# **Encode labels - Train and Test**

In [None]:
sample_size = 5000
sampled_df = df.sample(n=sample_size, random_state=42)

# Spinach has to be represented in the training sample. If the random draw
# missed it, rebuild the sample from up to 500 spinach rows plus enough
# non-spinach rows to reach sample_size.
spinach_in_sample = (sampled_df['Crop'].str.lower() == 'spinach').any()
if not spinach_in_sample:
    crop_is_spinach = df['Crop'].str.lower() == 'spinach'
    spinach_df = df[crop_is_spinach]
    remaining_df = df[~crop_is_spinach]

    spinach_sample_size = min(len(spinach_df), 500)
    remaining_size = sample_size - spinach_sample_size

    sampled_spinach_df = spinach_df.sample(n=spinach_sample_size, random_state=42)
    if remaining_size > 0:
        sampled_remaining_df = remaining_df.sample(n=remaining_size, random_state=42)
    else:
        sampled_remaining_df = pd.DataFrame()

    sampled_df = pd.concat([sampled_spinach_df, sampled_remaining_df])

# Class ids are assigned over every Solution in df (not only the sampled
# rows), in order of first appearance.
solution_names = df['Solution'].unique()
label_encoder = dict(zip(solution_names, range(len(solution_names))))
sampled_df['label'] = sampled_df['Solution'].map(label_encoder)

train_df, test_df = train_test_split(sampled_df, test_size=0.2, random_state=42)

# Convert the DataFrames to Hugging Face Datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [None]:
# Bundle the two splits under the names 'train' and 'test'.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [None]:
# List the distinct crops present in the training split.
unique_plants = train_df['Crop'].unique()

print(unique_plants)


['Alfalfa' 'Chickpea, Lentil, Pea, Dry, Pulse' 'Apple' 'Mint' 'Peanut'
 'Strawberry' 'Pepper' 'Peach' 'Wheat' 'Plum' 'Blueberry' 'Corn, Sweet'
 'Christmas Trees' 'Nectarine' 'Celery' 'Cotton' 'Cherry, Tart'
 'Bean, Snap, Green' 'Cherry' 'Tomato' 'Carrot' 'Grape, Grape, Wine'
 'Grape, Grape, Table' 'Potato' 'Ginseng' 'Pepper, Bell' 'Pear'
 'Watermelon' 'Almond' 'Small Grain' 'Avocado' 'Onion, Onion, Bulb'
 'Banana' 'Pine Tree, Tree Nursery' 'Kiwi' 'Cabbage' 'Cranberry'
 'Sorghum, Grain' 'Asparagus' 'Blackberry' 'Olive' 'Spinach']


# **Tokenizer**

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")


def tokenize_function(examples):
    """Tokenize a batch of examples and attach their integer class labels.

    Reads ``examples['text']`` and ``examples['Solution']``. Texts are padded
    or truncated to exactly 120 tokens, each Solution is looked up in
    ``label_encoder``, and every resulting field is returned as a
    ``torch.Tensor``.
    """
    encoded = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=120)
    encoded['labels'] = [label_encoder[solution] for solution in examples['Solution']]

    # Turn every field (lists of ints) into a PyTorch tensor.
    return {name: torch.tensor(values) for name, values in encoded.items()}

# Build the Hugging Face datasets from the train/test frames (this repeats
# the conversion already done in the label-encoding cell).
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Inspect one tokenized example and the resulting column names.
print(tokenized_train_dataset[0])
print(tokenized_train_dataset.column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'Crop': 'Alfalfa', 'Location': 'Western', 'Pest/Disease/etc': 'alfalfa weevil', 'Solution': 'Post-harvest cultivation', 'Rating': 'Good, 80-90%', 'text': 'Crop: Alfalfa\nLocation: Western\nPest/Disease/etc: alfalfa weevil\nSolution: Post-harvest cultivation\nRating: Good, 80-90%', 'label': 420, '__index_level_0__': 45637, 'input_ids': [101, 10416, 1024, 22989, 10270, 2050, 3295, 1024, 2530, 20739, 1013, 4295, 1013, 4385, 1024, 22989, 10270, 2050, 16776, 14762, 5576, 1024, 2695, 1011, 11203, 13142, 5790, 1024, 2204, 1010, 3770, 1011, 3938, 1003, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# **Initialize pretrained tinyBERT model**

In [None]:
# One output class per distinct Solution: label_encoder is built from
# df['Solution'], so the classes are solutions, not pests or diseases.
num_labels = len(label_encoder)  # Number of unique solutions
model = AutoModelForSequenceClassification.from_pretrained("Intel/dynamic_tinybert", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Intel/dynamic_tinybert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Customize training process with specific optimizer and learning rate scheduler**

In [None]:
from transformers import Trainer
import torch

class CustomTrainer(Trainer):
    """Trainer with a hand-built AdamW optimizer and an explicit cross-entropy loss.

    Optimizer stepping is left entirely to the base ``Trainer`` training
    loop. Calling ``optimizer.step()`` from ``training_step`` as well applies
    every gradient twice and bypasses the loop's gradient accumulation,
    clipping and mixed-precision handling.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Assigned after the base constructor so that self.model exists.
        # NOTE(review): Trainer is expected to create its own optimizer only
        # when self.optimizer is still None — confirm for the installed version.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        # Kept as an attribute for backward compatibility, but no longer
        # stepped once per batch: a single-batch training loss is too noisy
        # for a plateau detector with patience=2.
        # NOTE(review): the learning rate is therefore governed by the
        # schedule the Trainer derives from TrainingArguments. If
        # plateau-based decay on validation metrics is wanted, configure it
        # through TrainingArguments (lr_scheduler_type) — confirm.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='min', patience=2, factor=0.1)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy between the model's logits and ``inputs['labels']``.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; must contain 'labels'.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra keyword arguments passed by newer Trainer
                versions (e.g. ``num_items_in_batch``); accepted and ignored
                so the override works across versions.

        Raises:
            ValueError: If the model output has no logits or the batch has
                no labels.
        """
        outputs = model(**inputs)

        # Extract logits and labels
        logits = outputs.get("logits")
        labels = inputs.get("labels")

        if logits is None:
            raise ValueError("Logits are missing in the output of the model. Check the model configuration.")

        if labels is None:
            raise ValueError("Labels are missing in the input. Ensure your dataset is correctly processed.")

        if isinstance(labels, torch.Tensor):
            labels = labels.to(self.model.device)

        # Calculate loss
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs, *args, **kwargs):
        """Run one forward/backward pass and return the loss.

        Delegates to ``Trainer.training_step`` and forwards any extra
        arguments newer Trainer versions supply. The optimizer is
        deliberately not stepped here; the base training loop does that.
        """
        return super().training_step(model, inputs, *args, **kwargs)


In [None]:
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each example is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# **Training arguments to fit data and train**

In [None]:

# Training configuration: evaluate and checkpoint once per epoch, and restore
# the checkpoint with the highest accuracy at the end.
# NOTE(review): with save_strategy="epoch", save_steps=1000 presumably has no
# effect — confirm and remove if so.
# NOTE(review): fp16=True presumably requires a GPU runtime — confirm.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,  # Regularization
    logging_dir='./logs',
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=True, # Mixed Precision Training
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    save_steps=1000,
    seed=42,
)

# The held-out test split doubles as the per-epoch evaluation set.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final evaluation on the same eval dataset after training.
eval_results = trainer.evaluate()
print(eval_results)

Epoch,Training Loss,Validation Loss,Accuracy
1,6.3042,5.737255,0.025
2,4.944,4.015584,0.376
3,3.57,2.969765,0.568
4,2.748,2.334414,0.72
5,2.2036,1.917354,0.751
6,1.8115,1.611282,0.782
7,1.5187,1.381802,0.821
8,1.2849,1.212362,0.862
9,1.1117,1.077929,0.876
10,0.966,0.981329,0.893


{'eval_loss': 0.6637150645256042, 'eval_accuracy': 0.932, 'eval_runtime': 1.4279, 'eval_samples_per_second': 700.346, 'eval_steps_per_second': 87.543, 'epoch': 20.0}


In [None]:
# Save the model and tokenizer side by side in ./IPM_model so both can be
# reloaded from that one directory.
trainer.save_model("./IPM_model")
tokenizer.save_pretrained("./IPM_model")

('./IPM_model/tokenizer_config.json',
 './IPM_model/special_tokens_map.json',
 './IPM_model/vocab.txt',
 './IPM_model/added_tokens.json',
 './IPM_model/tokenizer.json')

# **Move model to device**

In [None]:
# Reload the fine-tuned model and tokenizer from the directory saved above.
model = AutoModelForSequenceClassification.from_pretrained("./IPM_model")
tokenizer = AutoTokenizer.from_pretrained("./IPM_model")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [None]:
# Lower-cased vocabularies of the crops, pests and locations seen in the
# training split; extract_info searches user input for these terms.
crops = set(train_df['Crop'].dropna().str.lower())
pests = set(train_df['Pest/Disease/etc'].dropna().str.lower())
locations = set(train_df['Location'].dropna().str.lower())

# **Test Model with Custom Input**

# **Regular expressions to extract crop, pest, and location from user input**

In [None]:
import re

def extract_info(text, crop_terms=None, pest_terms=None, location_terms=None):
    """Find a known crop, pest and location mentioned in free-form ``text``.

    Args:
        text: User input to search (case-insensitive).
        crop_terms: Known crop names; defaults to the notebook-level ``crops``.
        pest_terms: Known pest names; defaults to the notebook-level ``pests``.
        location_terms: Known locations; defaults to the notebook-level
            ``locations``.

    Returns:
        A ``(crop, pest, location)`` tuple of lower-cased matches. An element
        is ``'unknown'`` when none of the corresponding terms occurs in
        ``text`` or the vocabulary is empty.
    """
    def _first_known_term(terms):
        # Longest term first so that "pepper, bell" is preferred over
        # "pepper" when both start at the same position. Ties are broken
        # alphabetically because sets have no stable iteration order. Empty
        # terms are dropped: an empty alternative matches everywhere.
        ordered = sorted((term for term in terms if term), key=lambda term: (-len(term), term))
        if not ordered:
            return 'unknown'
        # re.escape keeps characters such as "(", "+" or "." in a name from
        # being interpreted as regex syntax.
        pattern = '|'.join(re.escape(term) for term in ordered)
        match = re.search(pattern, text, re.IGNORECASE)
        return match.group(0).lower() if match else 'unknown'

    crop = _first_known_term(crops if crop_terms is None else crop_terms)
    pest = _first_known_term(pests if pest_terms is None else pest_terms)
    location = _first_known_term(locations if location_terms is None else location_terms)

    return crop, pest, location

# **Function tokenizes the text entry and performs inference using the model and df**

In [None]:
def predict_from_model(crop, location, pest, df, top_n=3):
    """Return the model's ``top_n`` solutions for a crop/location/pest, each with a rating.

    Args:
        crop, location, pest: Values placed into the prompt given to the model.
        df: Reference table with 'Solution', 'Crop', 'Pest/Disease/etc' and
            'Rating' columns, used to look up a rating for each solution.
        top_n: Number of solutions to return (capped at the number of classes).

    Returns:
        ``(solutions, ratings)`` — two lists of equal length, best first. A
        rating is ``'unknown'`` when ``df`` has no row for that solution.
    """
    text_entry = f"Crop: {crop}\nLocation: {location}\nPest/Disease/etc: {pest}"
    tokens = tokenizer(text_entry, padding='max_length', truncation=True, max_length=120, return_tensors='pt')

    # Move tensors to the same device as the model
    tokens = {key: val.to(device) for key, val in tokens.items()}

    with torch.no_grad():
        outputs = model(**tokens)

    logits = outputs.logits
    # Never ask topk for more classes than the model has.
    k = max(0, min(top_n, logits.shape[-1]))
    # Take the single batch row explicitly; .squeeze() would collapse the
    # result to a bare int when k == 1.
    top_class_ids = logits.topk(k, dim=-1).indices[0].tolist()

    # The model's outputs are class ids assigned by label_encoder, NOT row
    # positions in df, so they are mapped back through the inverted encoder.
    id_to_solution = {class_id: solution for solution, class_id in label_encoder.items()}
    solutions = [id_to_solution[class_id] for class_id in top_class_ids]

    crop_key = str(crop).lower()
    pest_key = str(pest).lower()
    ratings = []
    for solution in solutions:
        solution_rows = df[df['Solution'] == solution]
        # Prefer ratings recorded for this exact crop and pest; otherwise
        # fall back to any rating recorded for the solution.
        same_context = solution_rows[
            (solution_rows['Crop'].str.lower() == crop_key)
            & (solution_rows['Pest/Disease/etc'].str.lower() == pest_key)
        ]
        candidates = solution_rows if same_context.empty else same_context
        if candidates.empty:
            ratings.append('unknown')
        else:
            # Most frequently recorded rating among the candidate rows.
            ratings.append(candidates['Rating'].mode().iloc[0])

    return solutions, ratings

# **Top predicted solutions and ratings are printed to the user**

In [None]:
def main():
    """Interactive loop: read a description, extract crop/pest/location and
    print the model's top three solutions with their ratings.

    Typing 'exit' (any letter case) ends the loop. Input from which a crop,
    pest or location cannot be extracted is rejected and the prompt repeats.
    This definition replaces the scraper's ``main`` defined earlier.
    """
    prompt = "Describe your situation (or type 'exit' to quit): "
    while True:
        user_input = input(prompt).strip()
        if user_input.lower() == 'exit':
            return

        crop, pest, location = extract_info(user_input)
        if 'unknown' in (crop, pest, location):
            print("Unable to extract crop, pest, or location. Please try again with clearer information.")
            continue

        # Predict solutions and ratings
        solutions, ratings = predict_from_model(crop, location, pest, df, top_n=3)
        if not solutions:
            print("No solutions found for the given input. Please try again.")
            continue

        for rank, (solution, rating) in enumerate(zip(solutions, ratings), 1):
            print(f"Solution {rank}: {solution}")
            print(f"Rating {rank}: {rating}")

main()

Describe your situation (or type 'exit' to quit): I have a spinach crop with green peach aphids in the Northeastern region
Solution 1: dichloropropene/chloropicrin
Rating 1: Good, 80-90%
Solution 2: Dichloropropene
Rating 2: Good, 80-90%
Solution 3: Permethrin
Rating 3: Good, 80-90%
Describe your situation (or type 'exit' to quit): exit
