# Symptom Based Disease Detection

## - For English

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the English disease/pest descriptions and preview the first rows.
df = pd.read_csv('./dataset/diseases_and_descriptions.csv')
df.head()

Unnamed: 0,Disease/Pest,Description
0,Bunchy Top Virus,"Yellowing of the leaf tips and margins, leaf n..."
1,Sigatoka Leaf Spot,"Small, dark, water-soaked spots that enlarge a..."
2,Fusarium Wilt,"Yellowing and wilting of leaves, vascular disc..."
3,Panama Disease,"Yellowing and wilting of the lower leaves, vas..."
4,Black Sigatoka,"Dark brown spots on the leaves, which enlarge ..."


In [3]:
# size of dataset
df.shape

(11, 2)

In [4]:
# summary information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease/Pest  11 non-null     object
 1   Description   11 non-null     object
dtypes: object(2)
memory usage: 304.0+ bytes


In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the WordNet corpus that WordNetLemmatizer relies on.
# NOTE(review): word_tokenize needs NLTK tokenizer data as well, which is not
# downloaded here — confirm it is already installed on the target machine.
nltk.download('wordnet')

def preprocess_text(text, language='english'):
    """
    Cleans a piece of text before tokenization.

    Every character that is neither a word character nor whitespace is
    dropped. English text is additionally lowercased and each run of
    whitespace is collapsed into a single space; Sinhala text only has the
    non-word characters removed.

    NOTE(review): Python's ``\\w`` appears not to match Unicode combining
    marks, so Sinhala vowel signs are probably stripped here too. Confirm
    this is acceptable before relying on the cleaned Sinhala text.

    Parameters:
        text (str): The text to clean.
        language (str): Either 'sinhala' or 'english'.
    Returns:
        str: The cleaned text.
    Raises:
        ValueError: If the language is not supported or the cleaning fails
            for any other reason (the original message is embedded).
    """
    try:
        if language not in ('sinhala', 'english'):
            raise ValueError("Invalid language parameter. Must be 'sinhala' or 'english'.")

        # Strip everything that is not a word character or whitespace
        cleaned = re.sub(r'[^\w\s]', '', text)

        # Only English gets case folding and whitespace normalisation
        if language != 'english':
            return cleaned

        return re.sub(r'\s+', ' ', cleaned.lower())

    except Exception as err:
        raise ValueError("Error preprocessing text: " + str(err))

def tokenize_text(text, language='english'):
    """
    Splits text into word tokens, lemmatizing them when the text is English.

    Tokenization is done with NLTK's word_tokenize for both languages;
    WordNet lemmatization is applied to English tokens only.

    Parameters:
        text (str): The text to split into tokens.
        language (str): Either 'sinhala' or 'english'.
    Returns:
        list: The tokens (lemmatized for English).
    Raises:
        ValueError: If the language is not supported or tokenization fails
            for any other reason (the original message is embedded).
    """
    try:
        if language not in ('sinhala', 'english'):
            raise ValueError("Invalid language parameter. Must be 'sinhala' or 'english'.")

        words = word_tokenize(text)

        # Sinhala tokens are returned as produced by the tokenizer
        if language != 'english':
            return words

        # Reduce each English token to its WordNet lemma
        wordnet_lemmatizer = WordNetLemmatizer()
        return list(map(wordnet_lemmatizer.lemmatize, words))

    except Exception as err:
        raise ValueError("Error tokenizing text: " + str(err))



def calculate_tfidf_scores(df, symptom, language='english'):
    """
    Ranks every disease/pest description by its TF-IDF similarity to a symptom.

    The symptom and all descriptions are preprocessed and tokenized with the
    same ``language`` setting, vectorized together by one TfidfVectorizer, and
    each description is scored by the dot product of its TF-IDF row with the
    symptom's TF-IDF row (with TfidfVectorizer's default L2 normalisation this
    is the cosine similarity).

    Parameters:
        df (pd.DataFrame): Must provide a "Description" column (the texts the
            symptom is compared against) and a "Disease/Pest" column (the
            labels returned with each score).
        symptom (str): The symptom text to score the descriptions against.
        language (str): The language of the text. Valid values are 'sinhala' and 'english'.
    Returns:
        list: (disease/pest, score) tuples sorted by descending score.
    Raises:
        ValueError: If the language is not supported or any processing step
            fails; the underlying exception is chained as the cause.
    """
    try:
        # Validate the language parameter
        if language not in ('sinhala', 'english'):
            raise ValueError("Invalid language parameter. Must be 'sinhala' or 'english'.")

        disease_descriptions = df["Description"].tolist()

        # Preprocess and tokenize the symptom
        symptom_tokens = tokenize_text(preprocess_text(symptom, language), language)

        # Run the descriptions through the *same* language pipeline as the
        # symptom; otherwise non-English descriptions would be lemmatized as
        # English while the symptom is not, and the two would no longer be
        # comparable.
        description_tokens = [
            tokenize_text(preprocess_text(description, language), language)
            for description in disease_descriptions
        ]

        # One document per description, with the symptom as the last document
        documents = [' '.join(tokens) for tokens in description_tokens]
        documents.append(' '.join(symptom_tokens))

        # English stop words only make sense for English text
        if language == 'english':
            tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        else:
            tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        symptom_tfidf = tfidf_matrix[-1]
        description_tfidf = tfidf_matrix[:-1]

        # `@` is matrix multiplication for every scipy.sparse container and
        # `.toarray()` is the explicit spelling of the legacy `.A` shorthand.
        similarities = (symptom_tfidf @ description_tfidf.T).toarray()[0]

        scores = list(zip(df['Disease/Pest'], similarities))

        # Best match first
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    except Exception as e:
        raise ValueError("Error calculating TF-IDF scores: " + str(e)) from e

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def find_top_k_diseases(symptoms, df, k=None, language='english', verbose=True):
    """
    Returns the k diseases/pests whose descriptions best match the symptoms.

    The ranking comes from calculate_tfidf_scores, which yields the entries
    ordered from the highest to the lowest TF-IDF score.

    Parameters:
        symptoms (str): The symptom text to look up.
        df (pd.DataFrame): The dataset of diseases/pests and their descriptions.
        k (int): How many leading entries to return. If None, all entries are returned.
        language (str): The language of the text. Valid values are 'sinhala' and 'english'.
        verbose (bool): When True, prints every disease/pest together with its score.

    Returns:
        A list of (disease/pest name, TF-IDF score) tuples.

    Raises:
        ValueError: If the language is not 'sinhala' or 'english'.
    """
    if language not in ('sinhala', 'english'):
        raise ValueError("Invalid language parameter. Must be 'sinhala' or 'english'.")

    ranked = calculate_tfidf_scores(df, symptoms, language)

    if verbose:
        # Show the full ranking, not just the top k
        for name, value in ranked:
            print(name, value)

    return ranked[:k]

# Example query: a free-text English symptom to match against the dataset
symptom = 'yellowing of the leaf tips and margins'

# Keep only the three best matches; verbose=False suppresses the full score listing
x = find_top_k_diseases(symptoms=symptom, df=df, language='english', k=3, verbose=False)
for i in x:
    print(i)

('Bunchy Top Virus', 0.6499782563451886)
('Fusarium Wilt', 0.1781062920302058)
('Panama Disease', 0.1604645235952053)


## - For Sinhala

In [7]:
sinhala_symptom = 'පහළ පත්‍ර කහ වීම සහ මැලවීම'

# Load the Sinhala disease/pest descriptions and preview the first rows.
df_sinhala = pd.read_csv('./dataset/diseases_and_descriptions_sinhala.csv')
df_sinhala.head()

Unnamed: 0,Disease/Pest,Description
0,පැනමා රෝගය,"""පහළ පත්‍ර කහ වීම සහ මැලවීම රුධිර නාලවල වර්ණය..."


In [13]:
# Rank the Sinhala descriptions against the Sinhala symptom (at most three results)
x = find_top_k_diseases(symptoms=sinhala_symptom, df=df_sinhala, language='sinhala', k=3, verbose=False)
print(x)

[('පැනමා රෝගය', 0.6687318761258385)]
