In [12]:
import pandas as pd
import json
from collections import Counter
import torch 

In [3]:
df = pd.read_csv('../data/Advisor_data/final_descriptive.csv')
df.head()

Unnamed: 0,Disease ID,Disease Name,Affected Plant Species,Symptom Description,Diagnosis Method,Treatment Options
0,1,Powdery Mildew,Cucumber; Zucchini; Grapes,Powdery mildew is characterized by the appeara...,The disease is primarily diagnosed through vis...,"To control powdery mildew, sulfur-based fungic..."
1,2,Downy Mildew,Cucumbers; Lettuce; Grapes,Downy mildew manifests as yellowish patches on...,Diagnosis is typically done through careful vi...,Copper-based fungicides or organic treatments ...
2,3,Leaf Spot,Tomatoes; Potatoes; Lettuce,"Leaf spot disease causes small, round lesions ...",Diagnosis is typically based on visual inspect...,Use fungicides or bactericides depending on th...
3,4,Root Rot,Tomatoes; Lettuce; Cucumbers,Root rot leads to wilting and yellowing of the...,Diagnosed by examining the roots for signs of ...,Improve soil drainage and avoid over-watering....
4,5,Late Blight,Potatoes; Tomatoes,"Late blight results in dark, water-soaked lesi...",Diagnosis is typically confirmed by observing ...,Use fungicides containing copper or metalaxyl ...


In [4]:
def format_species(species):
    # Split the string into a list of words by ';' and remove extra spaces
    species_list = [s.strip() for s in species.split(';')]
    
    # If there's only one item, return it in singular form
    if len(species_list) == 1:
        return f"It affects the plants such as {species_list[0]}"
    
    # If multiple items, join them with commas and "and" before the last item
    species_str = ', '.join(species_list[:-1]) + f" and {species_list[-1]}"
    return f"It affects the plants such as {species_str}"

In [6]:
json_data = []

for index, row in df.iterrows():
    
    input_text = f"Which plants are affected by {row['Disease Name']}?"
    response = format_species(row['Affected Plant Species'])
    
    json_data.append({
        "input": input_text,
        "response": response
    })

    input_text = f"What are the diagnosis methods for {row['Disease Name']}?"
    response = row['Diagnosis Method']

    json_data.append({
        'input': input_text,
        'response':response
        
    })
    
    input_text = f"What are the symptons of {row['Disease Name']}?"
    response = row['Symptom Description']

    json_data.append({
        'input': input_text,
        'response':response
        
    })
    
    input_text = f"What are the treatment options for {row['Disease Name']}?"
    response = row['Treatment Options']
    
    json_data.append({
        "input": input_text,
        "response": response
    })

print(len(json_data))

df_json = pd.DataFrame(json_data)

df_json.to_csv('descriptive_csv.csv',index=False)

print('CSV has been successfully created')
    

1556
CSV has been successfully created


In [55]:
r_df = pd.read_csv('./descriptive_csv.csv')
r_df.head()

Unnamed: 0,input,response
0,Which plants are affected by Powdery Mildew?,"It affects the plants such as Cucumber, Zucchi..."
1,What are the diagnosis methods for Powdery Mil...,The disease is primarily diagnosed through vis...
2,What are the symptons of Powdery Mildew?,Powdery mildew is characterized by the appeara...
3,What are the treatment options for Powdery Mil...,"To control powdery mildew, sulfur-based fungic..."
4,Which plants are affected by Downy Mildew?,"It affects the plants such as Cucumbers, Lettu..."


In [58]:
r_df['response'][1]

'The disease is primarily diagnosed through visual inspection, where the white, powdery fungal growth is easily identifiable. Microscopic examination can confirm the presence of fungal spores. In some cases, laboratory culturing techniques may be used to isolate and identify the specific pathogen responsible for the infection.'

In [10]:
r_df.isnull().sum()

input       0
response    0
dtype: int64

In [13]:
def tokenize(text):
    return text.lower().split()


r_df['input_tokens'] = r_df['input'].apply(tokenize)
r_df['response_tokens'] = r_df['response'].apply(tokenize)

In [15]:
# Build vocabulary
all_tokens= [token for tokens in r_df['input_tokens'] for token in tokens] + \
            [token for tokens in r_df['response_tokens'] for token in tokens]

vocab = Counter(all_tokens)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)


# Create word-to-word and index-to-word mappings
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [17]:
def tokens_to_indices(tokens, word2idx):
    return [word2idx[token] for token in tokens]

r_df['input_indices'] = r_df['input_tokens'].apply(lambda x: tokens_to_indices(x, word2idx))
r_df['response_indices'] = r_df['response_tokens'].apply(lambda x: tokens_to_indices(x, word2idx))

In [19]:
from torch.nn.utils.rnn import pad_sequence

# Pad sequences
input_sequences = [torch.tensor(seq) for seq in r_df['input_indices']]
response_sequences = [torch.tensor(seq) for seq in r_df['response_indices']]

input_padded = pad_sequence(input_sequences, batch_first=True, padding_value=0)
response_padded = pad_sequence(response_sequences, batch_first=True, padding_value=0)

In [21]:
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Seq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, target_seq=None):
        # Embed input sequence
        embedded = self.embedding(input_seq)

        # Encode input sequence
        _, hidden = self.encoder(embedded)

        # Decode sequence
        if target_seq is not None:
            embedded_target = self.embedding(target_seq)
            output, _ = self.decoder(embedded_target, hidden)
            output = self.fc(output)
            return output
        else:
            # For inference, generate tokens one by one
            output_tokens = []
            current_token = torch.tensor([[word2idx['<start>']]], device=input_seq.device)
            for _ in range(50):  # Max length of response
                embedded_token = self.embedding(current_token)
                output, hidden = self.decoder(embedded_token, hidden)
                output = self.fc(output)
                predicted_token = output.argmax(dim=-1)
                output_tokens.append(predicted_token.item())
                current_token = predicted_token.unsqueeze(0)
                if predicted_token.item() == word2idx['<end>']:
                    break
            return output_tokens

In [22]:
from torch.utils.data import DataLoader, TensorDataset

# Create dataset
dataset = TensorDataset(input_padded, response_padded)

# Create DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [23]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256

# Initialize model
model = Seq2Seq(vocab_size, embedding_dim, hidden_dim)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [24]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for input_batch, target_batch in dataloader:
        optimizer.zero_grad()

        # Forward pass
        output = model(input_batch, target_batch[:, :-1])
        loss = criterion(output.view(-1, vocab_size), target_batch[:, 1:].reshape(-1))

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}")

Epoch 1/10, Loss: 5.399273278761883
Epoch 2/10, Loss: 3.3250484709837
Epoch 3/10, Loss: 2.464319744888617
Epoch 4/10, Loss: 1.9902333872658866
Epoch 5/10, Loss: 1.6919951755173352
Epoch 6/10, Loss: 1.456469097915961
Epoch 7/10, Loss: 1.283072946022968
Epoch 8/10, Loss: 1.1536807563840126
Epoch 9/10, Loss: 1.0410142991007592
Epoch 10/10, Loss: 0.9455777844604181


In [25]:
def generate_response(model, input_text):
    model.eval()
    tokens = tokenize(input_text)
    indices = tokens_to_indices(tokens, word2idx)
    input_seq = torch.tensor([indices], dtype=torch.long)

    with torch.no_grad():
        output_tokens = model(input_seq)

    response = ' '.join([idx2word[idx] for idx in output_tokens])
    return response

# Example usage
input_text = "Which plants are affected by Powdery Mildew?"
response = generate_response(model, input_text)
print(response)

KeyError: '<start>'

# Hugging face 

In [1]:
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from transformers import pipeline, set_seed
from datasets import Dataset
from transformers import BertTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments,Trainer

In [3]:
df = pd.read_csv('./descriptive_csv.csv')
df.head()

Unnamed: 0,input,response
0,Which plants are affected by Powdery Mildew?,"It affects the plants such as Cucumber, Zucchi..."
1,What are the diagnosis methods for Powdery Mil...,The disease is primarily diagnosed through vis...
2,What are the symptons of Powdery Mildew?,Powdery mildew is characterized by the appeara...
3,What are the treatment options for Powdery Mil...,"To control powdery mildew, sulfur-based fungic..."
4,Which plants are affected by Downy Mildew?,"It affects the plants such as Cucumbers, Lettu..."


In [4]:
train_data,test_data = train_test_split(df,test_size=0.2)

In [5]:
# Convert the pandas DataFrame to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_data[['input', 'response']])
test_dataset = Dataset.from_pandas(test_data[['input', 'response']])


In [6]:
train_dataset

Dataset({
    features: ['input', 'response', '__index_level_0__'],
    num_rows: 1244
})

In [39]:
# Load pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'  
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 

In [60]:
# tokenizer.pad_token = tokenizer.eos_token

# def tokenizer_function(examples):
#     return tokenizer(examples['input'], truncation = True, padding='max_length', max_length=512)

# train_dataset = train_dataset.map(tokenizer_function, batched=True)
# test_dataset = test_dataset.map(tokenizer_function, batched=True)
# Tokenize the dataset

def tokenize_function(examples):
    # Combine input and response into a single string for training
    combined_text = [f"{inp} {tokenizer.eos_token} {resp}" for inp, resp in zip(examples['input'], examples['response'])]
    print(combined_text)
    tokenized = tokenizer(combined_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    tokenized["labels"] = tokenized["input_ids"].clone()  # Labels are the same as input_ids for next-token prediction
    return tokenized

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1244 [00:00<?, ? examples/s]

['What are the symptons of Sooty Mold? <|endoftext|> Sooty mold causes black, sooty fungal growth on leaves, often due to insect secretions. The mold does not directly infect the plant but blocks sunlight, reducing photosynthesis. Infected plants may experience reduced growth and vigor.', 'What are the diagnosis methods for Walnut Moth? <|endoftext|> Diagnosis is made through visual inspection of the feeding damage and holes in nuts. Insect traps can be used to monitor and confirm the presence of the moth.', 'What are the diagnosis methods for Bacterial Wilt? <|endoftext|> Diagnosed by observing bacterial ooze and PCR tests.', 'What are the diagnosis methods for Leaf Scald? <|endoftext|> Diagnosed by observing symptoms and bacterial culture.', "What are the symptons of Bacterial Canker? <|endoftext|> Bacterial canker causes wilting and brown streaks on stems. Fruits may develop white spots with dark centers, often referred to as bird's-eye spots. It is caused by *Clavibacter michiganen

Map:   0%|          | 0/312 [00:00<?, ? examples/s]

['Which plants are affected by Watermelon Fruit Fly? <|endoftext|> It affects the plants such as Watermelons', 'Which plants are affected by Phomopsis Cane Blight? <|endoftext|> It affects the plants such as Grapes', 'What are the treatment options for Blossom Blight? <|endoftext|> Apply fungicides such as captan or myclobutanil to control the spread of the disease. Prune and remove infected flowers and fruit to reduce spore load. Improve air circulation around the trees and avoid overhead watering to reduce humidity.', 'Which plants are affected by Apple Powdery Mildew? <|endoftext|> It affects the plants such as Apple', 'Which plants are affected by Cherry Leaf Spot? <|endoftext|> It affects the plants such as Cherries', 'What are the diagnosis methods for Verticillium Wilt? <|endoftext|> Diagnosed through visual symptoms and vascular discoloration. Laboratory culturing and microscopic examination can confirm the presence of *Verticillium* fungi. Soil tests may help identify the path

In [41]:
# # Modify the inputs and labels for language modeling (shift labels for next-token prediction)
# def shift_labels(batch):
#     # Shift the labels by one token
#     batch["labels"] = batch["input_ids"].copy()
#     batch["labels"] = [ids[1:] + [tokenizer.pad_token_id] for ids in batch["labels"]]  # Shift labels
#     return batch

# train_dataset = train_dataset.map(shift_labels, batched=True)
# test_dataset = test_dataset.map(shift_labels, batched=True)

In [42]:
train_dataset

Dataset({
    features: ['input', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1244
})

In [43]:
# train_dataset['labels']

In [44]:
training_args = TrainingArguments(
    output_dir='./gpt2_chat_model/',
    eval_strategy= 'epoch',
    learning_rate=5e-5,
    per_device_eval_batch_size= 4,
    per_gpu_eval_batch_size= 4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub= False
)

In [45]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset= train_dataset,
    eval_dataset= test_dataset
)

In [46]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4664,0.419637
2,0.3826,0.361668
3,0.2829,0.344676


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

TrainOutput(global_step=468, training_loss=0.4826259032273904, metrics={'train_runtime': 111.3998, 'train_samples_per_second': 33.501, 'train_steps_per_second': 4.201, 'total_flos': 243785465856000.0, 'train_loss': 0.4826259032273904, 'epoch': 3.0})

In [47]:
# Save the trained model and tokenizer
trainer.save_model('./gpt2_finetuned_model')  # Save model weights and config
tokenizer.save_pretrained('./gpt2_finetuned_model')  # Save tokenizer

('./gpt2_finetuned_model\\tokenizer_config.json',
 './gpt2_finetuned_model\\special_tokens_map.json',
 './gpt2_finetuned_model\\vocab.json',
 './gpt2_finetuned_model\\merges.txt',
 './gpt2_finetuned_model\\added_tokens.json')

In [48]:
# Evaluate the model on the test dataset
results = trainer.evaluate(test_dataset)
print(f"Evaluation results: {results}")

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Evaluation results: {'eval_loss': 0.34467554092407227, 'eval_runtime': 2.6356, 'eval_samples_per_second': 118.379, 'eval_steps_per_second': 29.595, 'epoch': 3.0}


In [59]:
def chat_with_model(model, tokenizer, input_text, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
    # Append a separator token to the input text
    input_text = f"{input_text} {tokenizer.eos_token}"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt")

    # Ensure the model is on the correct device (GPU or CPU)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate a response
    output = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # Pass attention_mask
        max_length=max_length,                   # Increase max_length for longer responses
        num_return_sequences=1,
        temperature=temperature,  # Adjust temperature for more varied responses
        top_k=top_k,              # Limit sampling to top-k tokens
        top_p=top_p,              # Use nucleus sampling
        do_sample=True,           # Enable sampling (instead of greedy decoding)
        pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to eos_token_id
    )

    # Decode the output text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Remove the input text from the generated response
    generated_text = generated_text[len(input_text):].strip()
    
    return generated_text

# Example usage
input_text = "What are the diagnosis methods for Powdery Mildew?"
response = chat_with_model(model, tokenizer, input_text)
print("Generated Response:")
print(response)

Generated Response:
made through visual inspection of the yellowing and mold growth. Laboratory tests can confirm the presence of the fungal pathogen.
