In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
# from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Text Preprocessing Function

def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text.lower())
    # Removing stopwords and non-alphabetic characters
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)


# Load the dataset

data = pd.read_csv('Symptom2Disease.csv') 

# Displaying the dataset

# print(data)

data.drop(columns=["Unnamed: 0"], inplace=True)

# Concise summary of DataFrame

# print(data.info())

# Check for null values

# print(data.isnull().sum())

# Display column names

# print(data.columns)

# print(data.value_counts())

# Extracting 'label' and 'text' columns from the 'data' DataFrame

labels = data['label']  # Contains the labels or categories associated with the text data
symptoms = data['text']  # Contains the textual data (e.g., symptoms, sentences) for analysis

# Text Preprocessing

stop_words = set(stopwords.words('english'))

# Apply preprocessing to symptoms

preprocessed_symptoms = symptoms.apply(preprocess_text)
# print(preprocessed_symptoms)

# Feature Extraction using TF-IDF

tfidf_vectorizer = TfidfVectorizer(max_features=1500)  # You can adjust max_features based on your dataset size
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_symptoms).toarray()
# print(f'{tfidf_vectorizer}\n\n{tfidf_features}')



# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

# KNN Model Training

knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (k) based on your dataset
knn_classifier.fit(X_train, y_train)




# Predictions

predictions = knn_classifier.predict(X_test)



# Model Evaluation

accuracy = accuracy_score(y_test, predictions)
# print(f'Accuracy: {accuracy:.2f}')
# print(classification_report(y_test, predictions))


# Example Usage
symptom = input("Enter the symptoms separated by comma: ")

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(symptom)

# Transform the preprocessed symptom using the same vectorizer used during training
symptom_tfidf = tfidf_vectorizer.transform([preprocessed_symptom])

# Predict the disease
predicted_disease = knn_classifier.predict(symptom_tfidf)
# print(preprocessed_symptom)
print(f'Predicted Disease: {predicted_disease[0]}')


# print(symptom_tfidf)

data = pd.read_csv('ayurvedic_symptoms_desc.csv')

words = symptom.split(", ")

data['common_words'] = data['English_Symptoms'].apply(lambda x: sum(word.lower() in x.lower() for word in words))


# Filter the data based on text similarity
filtered_data = data[data['common_words'] > 0]

# Sort the DataFrame based on the number of common words
filtered_data = filtered_data.sort_values(by='common_words', ascending=False)

# Drop the 'common_words' column as it's no longer needed
filtered_data = filtered_data.drop(columns=['common_words'])

original_data_same_indices = data.loc[filtered_data.index]

# Print or return the data
print(original_data_same_indices)
############################################################################################################################



import pandas as pd

def get_column_values(df, column_name):
    # Get the column values as a list
    column_values = df[column_name].tolist()

    # Convert the list to a string with space separation
    column_values_str = ' '.join(map(str, column_values))

    return column_values_str

df1 = pd.read_csv('Formulation-Indications.csv')

formulations_lst = list(df1['Name of Medicine'])

original_list = list(df1['Main Indications'])

processed_list = []

for item in original_list:
    # Remove spaces and newline characters, convert to lowercase
    processed_item = ''.join(item.split()).lower()
    processed_list.append(processed_item)

# print(processed_list[:5])

# List of lists of symptoms
list_of_symptoms = processed_list

# Flatten the list of lists and split the symptoms using commas and spaces
flat_symptoms = [symptom.replace(',', ' ').split() for symptoms in list_of_symptoms for symptom in symptoms.split(',')]

# Get unique symptoms as a list
unique_symptoms = list(set(symptom for sublist in flat_symptoms for symptom in sublist))

# Print the unique symptoms
# print(unique_symptoms[:5])

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# Create a DataFrame
df = pd.DataFrame(data)

symptoms = pd.read_csv('ayurvedic_symptoms_desc.csv')

symptoms['Symptom'] = symptoms['Symptom'].str.lower()

def symptoms_desc(symptom_name):
    row = symptoms[symptoms['Symptom'] == symptom_name.lower()]
#     print(row)
    if not row.empty:
        description = row.iloc[0]['Description']
        print(f'Description of "{symptom_name}": {description}')
    else:
        print(f'Symptom "{symptom_name}" not found in the DataFrame.')

def symptoms_lst_desc(user_symptoms):
    for item in user_symptoms:
#         print(item)
        symptoms_desc(item)

import difflib

# Your list of correct words (assuming you have a list called unique_symptoms)
correct_words = unique_symptoms

def correct_symptoms(symptoms):
    corrected_symptoms = []
    for symptom in symptoms:
        corrected_symptom = difflib.get_close_matches(symptom, correct_words, n=1, cutoff=0.6)
        if corrected_symptom:
            corrected_symptoms.append(corrected_symptom[0])
        else:
            corrected_symptoms.append(symptom)
    return corrected_symptoms

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the symptom text data into numerical features
X_tfidf = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Create and train a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_tfidf, df['Formulation'])

# Spelling Correction
user_input = get_column_values(original_data_same_indices, 'Symptom')
print(user_input)
input_symptoms = user_input.split()
new_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(new_symptoms)}")

# Find Symptom Description
symptoms_lst_desc(new_symptoms)

# Predict Formulation 
new_symptoms_tfidf = tfidf_vectorizer.transform(new_symptoms)
predicted_label = clf.predict(new_symptoms_tfidf)
print(f"Predicted Formulation: {predicted_label[0]}")


### Create a boolean mask to filter rows where the second column matches any element in closest_formulations
mask = df1.iloc[:, 0].isin([predicted_label[0]])

# Use the mask to select the rows that match the condition
filtered_df = df1[mask]

# Iterate through the filtered DataFrame and print each row separately
for index, row in filtered_df.iterrows():
    print(row)

Enter the symptoms separated by comma: headache, elevated temperature, weakness, fever, pain in ass
Predicted Disease: Impetigo
              Symptom                                 Description  \
88         Raktapitta  Bleeding disorders related to Pitta dosha.   
120  Raktanishthivana                     Blood vessel disorders.   
86               Daha                          Burning sensation.   
85        Rajayakshma                               Tuberculosis.   
96        Pittajvaram                   Fever due to Pitta dosha.   
..                ...                                         ...   
144      Smritikshaya                             Memory decline.   
146             Gulma                    Abdominal tumor or mass.   
147       Vishmajvara                            Poisonous fever.   
9         Raktadushti                           Blood impurities.   
297  Pittajanetraroga       Urinary disorders due to Pitta dosha.   

                                      Engli

In [14]:
import pandas as pd

def get_column_values(df, column_name):
    # Get the column values as a list
    column_values = df[column_name].tolist()

    # Convert the list to a string with space separation
    column_values_str = ' '.join(map(str, column_values))

    return column_values_str

df1 = pd.read_csv('Formulation-Indications.csv')

formulations_lst = list(df1['Name of Medicine'])

original_list = list(df1['Main Indications'])

processed_list = []

for item in original_list:
    # Remove spaces and newline characters, convert to lowercase
    processed_item = ''.join(item.split()).lower()
    processed_list.append(processed_item)

# print(processed_list[:5])

# List of lists of symptoms
list_of_symptoms = processed_list

# Flatten the list of lists and split the symptoms using commas and spaces
flat_symptoms = [symptom.replace(',', ' ').split() for symptoms in list_of_symptoms for symptom in symptoms.split(',')]

# Get unique symptoms as a list
unique_symptoms = list(set(symptom for sublist in flat_symptoms for symptom in sublist))

# Print the unique symptoms
# print(unique_symptoms[:5])

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# Create a DataFrame
df = pd.DataFrame(data)

symptoms = pd.read_csv('ayurvedic_symptoms_desc.csv')

symptoms['Symptom'] = symptoms['Symptom'].str.lower()

def symptoms_desc(symptom_name):
    row = symptoms[symptoms['Symptom'] == symptom_name.lower()]
#     print(row)
    if not row.empty:
        description = row.iloc[0]['Description']
        print(f'Description of "{symptom_name}": {description}')
    else:
        print(f'Symptom "{symptom_name}" not found in the DataFrame.')

def symptoms_lst_desc(user_symptoms):
    for item in user_symptoms:
#         print(item)
        symptoms_desc(item)

import difflib

# Your list of correct words (assuming you have a list called unique_symptoms)
correct_words = unique_symptoms

def correct_symptoms(symptoms):
    corrected_symptoms = []
    for symptom in symptoms:
        corrected_symptom = difflib.get_close_matches(symptom, correct_words, n=1, cutoff=0.6)
        if corrected_symptom:
            corrected_symptoms.append(corrected_symptom[0])
        else:
            corrected_symptoms.append(symptom)
    return corrected_symptoms

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the symptom text data into numerical features
X_tfidf = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Create and train a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_tfidf, df['Formulation'])

# Spelling Correction
user_input = get_column_values(original_data_same_indices, 'Symptom')
print(user_input)
input_symptoms = user_input.split()
new_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(new_symptoms)}")

# Find Symptom Description
symptoms_lst_desc(new_symptoms)

# Predict Formulation 
new_symptoms_tfidf = tfidf_vectorizer.transform(new_symptoms)
predicted_label = clf.predict(new_symptoms_tfidf)
print(f"Predicted Formulation: {predicted_label[0]}")


### Create a boolean mask to filter rows where the second column matches any element in closest_formulations
mask = df1.iloc[:, 0].isin([predicted_label[0]])

# Use the mask to select the rows that match the condition
filtered_df = df1[mask]

# Iterate through the filtered DataFrame and print each row separately
for index, row in filtered_df.iterrows():
    print(row)

Raktapitta Raktanishthivana Daha Rajayakshma Pittajvaram Ajirna Sarvajvara Krichhrartav Abhighatajavedanaandvatavikara Sarpadamsha Jirnajvara Karshya Shosha Kustha Kshatakshina Drishtidaurbalya Visarpa Balagraha Klaivya Sutikadosha Svarakshaya Kampa Pinasa Jvaratisara Aptantrak Mutrasthila Bahushosha Katigraha Pama Pakshavadha Pangu Pravahika Sutikaroga Balakshaya Vidradhi Pandu Jvara Pradara Gridhrasi Karnasrava Shirogatavata Urahkshata Dhatukshaya Chhardi Balaroga Vranashotha Pittaroga Suryavarta Gandamala Panduduarbalya Agnimandhya Pittajaroga Raktajroga Raktapradara Manodaurbalya Ojakshya Mukhapaka Jirnapravahika Amatisara Urdhvajatrugataroga Panguvata Aksepa Vandhyaroga Smritikshaya Gulma Vishmajvara Raktadushti Pittajanetraroga
Did you mean: raktapitta, raktanishthivana, daha, rajayakshma, pittajvara, ajirna, sarvajvara, krichhrartav, abhighatajavedanaandvatavikara, sarpadamsha, jirnajvara, karshya, shosha, kustha, kshatakshina, drishtidaurbalya, visarpa, balagraha, klaivya, suti

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
# from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Text Preprocessing Function

def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text.lower())
    # Removing stopwords and non-alphabetic characters
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)


# Load the dataset

data = pd.read_csv('Symptom2Disease.csv') 

# Displaying the dataset

# print(data)

data.drop(columns=["Unnamed: 0"], inplace=True)

# Concise summary of DataFrame

# print(data.info())

# Check for null values

# print(data.isnull().sum())

# Display column names

# print(data.columns)

# print(data.value_counts())

# Extracting 'label' and 'text' columns from the 'data' DataFrame

labels = data['label']  # Contains the labels or categories associated with the text data
symptoms = data['text']  # Contains the textual data (e.g., symptoms, sentences) for analysis

# Text Preprocessing

stop_words = set(stopwords.words('english'))

# Apply preprocessing to symptoms

preprocessed_symptoms = symptoms.apply(preprocess_text)
# print(preprocessed_symptoms)

# Feature Extraction using TF-IDF

tfidf_vectorizer = TfidfVectorizer(max_features=1500)  # You can adjust max_features based on your dataset size
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_symptoms).toarray()
# print(f'{tfidf_vectorizer}\n\n{tfidf_features}')



# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

# KNN Model Training

knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (k) based on your dataset
knn_classifier.fit(X_train, y_train)




# Predictions

predictions = knn_classifier.predict(X_test)



# Model Evaluation

accuracy = accuracy_score(y_test, predictions)
# print(f'Accuracy: {accuracy:.2f}')
# print(classification_report(y_test, predictions))


# Example Usage
symptom = input("Enter the symptoms separated by comma: ")

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(symptom)

# Transform the preprocessed symptom using the same vectorizer used during training
symptom_tfidf = tfidf_vectorizer.transform([preprocessed_symptom])

# Predict the disease
predicted_disease = knn_classifier.predict(symptom_tfidf)
# print(preprocessed_symptom)
print(f'Predicted Disease: {predicted_disease[0]}')


# print(symptom_tfidf)

data = pd.read_csv('ayurvedic_symptoms_desc.csv')

words = symptom.split(", ")

# Filter the data based on text similarity
filtered_data = data[data['English_Symptoms'].apply(lambda x: any(word.lower() in x.lower() for word in words))]

original_data_same_indices = data.loc[filtered_data.index]

# Print or return the data
print(original_data_same_indices)



Enter the symptoms separated by comma: print(original_data_same_indices)
Predicted Disease: Bronchial Asthma
Empty DataFrame
Columns: [Symptom, Description, English_Symptoms]
Index: []


In [7]:
df1

Unnamed: 0,Name of Medicine,Reference text,Dispensing Pack Size,Main Indications,Dose,Precaution/ Contraindication,Preferred use (OPD/ IPD),Class
0,Abhayarishta,AFI,200 ml,"Arsha, Agnimandya,\nUdararoga, Vibandha",12 - 24 ml,NS,Both,A
1,Amritarishta,AFI,200 ml,"SarvaJvara, Jirna Jvara",12 - 24 ml,NS,Both,A
2,Aragvadharishta,AH,200ml,"Kandu, Tvak Vikara,\nVibandha",12 - 24 ml,NS,Both,A
3,Aravindasava,AFI,200 ml,"Balaroga, Balakshaya,\nAgnimandya, Aruchi",12 - 24 ml,NS,Both,A
4,Arjunarishta/ Parthadyarishta,AFI,200 ml,"Hridroga, Hriddrava, Hrid- daurbalya, Moha,\nM...",12 - 24 ml,NS,Both,A
...,...,...,...,...,...,...,...,...
197,Tribhuvankirti Rasa,AFI,5 gm,"Jvara, Pratishyaya, Kasa",125-250 mg,"Bradycar dia, arrhythm ias, small children, Vr...",Both,S
198,Vatagajankusha Rasa,BR,5 gm,"Vata Roga, Avabahuka, Urustambha, Pakshaghata,...",250 mg,"Long term use, Vrikka Roga, peri- concepti ona...",IPD,S
199,Vatavidhavansan Rasa,AFI,5 gm,"Vatajashula, Sutika Vata, Grahaniroga",250 mg,NS,IPD,S
200,Navayasa Lauha,AFI,5 gm,"Pandu, Kamala,\nHridroga",250 mg,NS,Both,T


In [6]:
import pandas as pd

def get_column_values(df, column_name):
    # Get the column values as a list
    column_values = df[column_name].tolist()

    # Convert the list to a string with space separation
    column_values_str = ' '.join(map(str, column_values))

    return column_values_str

df1 = pd.read_csv('Formulation-Indications.csv')

formulations_lst = list(df1['Name of Medicine'])

original_list = list(df1['Main Indications'])

processed_list = []

for item in original_list:
    # Remove spaces and newline characters, convert to lowercase
    processed_item = ''.join(item.split()).lower()
    processed_list.append(processed_item)

# print(processed_list[:5])

# List of lists of symptoms
list_of_symptoms = processed_list

# Flatten the list of lists and split the symptoms using commas and spaces
flat_symptoms = [symptom.replace(',', ' ').split() for symptoms in list_of_symptoms for symptom in symptoms.split(',')]

# Get unique symptoms as a list
unique_symptoms = list(set(symptom for sublist in flat_symptoms for symptom in sublist))

# Print the unique symptoms
# print(unique_symptoms[:5])

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# Create a DataFrame
df = pd.DataFrame(data)

symptoms = pd.read_csv('ayurvedic_symptoms_desc.csv')

symptoms['Symptom'] = symptoms['Symptom'].str.lower()

def symptoms_desc(symptom_name):
    row = symptoms[symptoms['Symptom'] == symptom_name.lower()]
#     print(row)
    if not row.empty:
        description = row.iloc[0]['Description']
        print(f'Description of "{symptom_name}": {description}')
    else:
        print(f'Symptom "{symptom_name}" not found in the DataFrame.')

def symptoms_lst_desc(user_symptoms):
    for item in user_symptoms:
#         print(item)
        symptoms_desc(item)

import difflib

# Your list of correct words (assuming you have a list called unique_symptoms)
correct_words = unique_symptoms

def correct_symptoms(symptoms):
    corrected_symptoms = []
    for symptom in symptoms:
        corrected_symptom = difflib.get_close_matches(symptom, correct_words, n=1, cutoff=0.6)
        if corrected_symptom:
            corrected_symptoms.append(corrected_symptom[0])
        else:
            corrected_symptoms.append(symptom)
    return corrected_symptoms

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the symptom text data into numerical features
X_tfidf = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Create and train a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_tfidf, df['Formulation'])

# Spelling Correction
user_input = get_column_values(original_data_same_indices, 'Symptom')
print(user_input)
input_symptoms = user_input.split()
new_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(new_symptoms)}")

# Find Symptom Description
symptoms_lst_desc(new_symptoms)

# Predict Formulation 
new_symptoms_tfidf = tfidf_vectorizer.transform(new_symptoms)
predicted_label = clf.predict(new_symptoms_tfidf)
print(f"Predicted Formulation: {predicted_label[0]}")


Jvara
Did you mean: jvara
Description of "jvara": Fever.
Predicted Formulation: Punarnavadi Kashayam


In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

data = pd.read_csv('Symptom2Disease.csv') 
data.drop(columns=["Unnamed: 0"], inplace=True)
labels = data['label']  
symptoms = data['text']  

stop_words = set(stopwords.words('english'))
preprocessed_symptoms = symptoms.apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_features=1500)  
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_symptoms).toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

knn_classifier = KNeighborsClassifier(n_neighbors=5)  
knn_classifier.fit(X_train, y_train)

predictions = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

symptom = input("Enter the symptoms separated by comma: ")
preprocessed_symptom = preprocess_text(symptom)
symptom_tfidf = tfidf_vectorizer.transform([preprocessed_symptom])
predicted_disease = knn_classifier.predict(symptom_tfidf)
print(f'Predicted Disease: {predicted_disease[0]}')

data = pd.read_csv('ayurvedic_symptoms_desc.csv')
words = symptom.split(", ")
data['common_words'] = data['English_Symptoms'].apply(lambda x: sum(word.lower() in x.lower() for word in words))
filtered_data = data[data['common_words'] > 0]
filtered_data = filtered_data.sort_values(by='common_words', ascending=False)
filtered_data = filtered_data.drop(columns=['common_words'])
original_data_same_indices = data.loc[filtered_data.index]
print(original_data_same_indices)

Enter the symptoms separated by comma: fever, diarrhea, malnutrition, respiratory infections.
Predicted Disease: Typhoid
                   Symptom                                     Description  \
37                Balaroga                             Pediatric diseases.   
291            Jvaratisara                              Febrile dysentery.   
235               Vidradhi                                        Abscess.   
198               Amashula            Abdominal colic due to Ama (toxins).   
212                  Krimi                              Worm infestations.   
214             Jirnajvara                                  Chronic fever.   
217             Sarvajvara                             All types of fever.   
224  Grahaniroga?pravahika         Irritable bowel syndrome with diarrhea.   
234          Vidagdhajirna  Digestive disorders causing burning sensation.   
252                   Pama                                     Filariasis.   
176            Grahan