In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
df1 = pd.read_csv('diseases.csv')
df2 = pd.read_csv('symptom_Description.csv')

In [3]:
df1.head()

Unnamed: 0.1,Unnamed: 0,name,description,symptoms,treatments
0,0,acne,Acne is a common skin condition that occurs wh...,"['Blackheads', 'Whiteheads', 'Pimples', 'Cysts...",['Topical creams and gels containing benzoyl p...
1,1,appendicitis,Appendicitis is a condition where the appendix...,['Pain in the lower right side of the abdomen'...,"['Surgery to remove the appendix', 'Antibiotic..."
2,2,arthritis,Arthritis is a condition where one or more joi...,"['Joint pain and stiffness', 'Swelling and red...","['Pain medication', 'Anti-inflammatory drugs',..."
3,3,asthma,Asthma is a chronic condition where the airway...,"['Wheezing', 'Coughing', 'Shortness of breath'...","['Bronchodilators to open up the airways', 'Co..."
4,4,atherosclerosis,Atherosclerosis is a condition where plaque bu...,"['Chest pain', 'Shortness of breath', 'Weaknes...",['Lifestyle changes such as quitting smoking a...


In [4]:
df1.drop(columns='Unnamed: 0', inplace=True)
df1.rename(columns={'name': 'Disease'}, inplace=True)

In [5]:
df1.shape, df2.shape

((268, 4), (41, 2))

In [6]:
df2.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [7]:
df2['Disease'] = df2['Disease'].str.lower()

In [8]:
merged_df = pd.merge(df1, df2, on='Disease', how='outer')
merged_df.shape

(292, 5)

In [9]:
# Create a new column "Description" and fill it with non-null values from the available columns
merged_df['Description'] = merged_df['Description'].fillna(merged_df['description'])

# Drop unnecessary columns
merged_df.drop(columns=['description'], inplace=True)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292 entries, 0 to 291
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      292 non-null    object
 1   symptoms     268 non-null    object
 2   treatments   268 non-null    object
 3   Description  292 non-null    object
dtypes: object(4)
memory usage: 11.4+ KB


In [10]:
# Remove brackets, single quotes, and any other characters except commas from "Symptoms" and "Treatments" columns
merged_df['Symptoms'] = merged_df['symptoms'].apply(lambda x: x.strip("[]").lower().replace("'", "").replace("/", "") if isinstance(x, str) else np.nan)
merged_df['Treatments'] = merged_df['treatments'].apply(lambda x: x.strip("[]").lower().replace("'", "").replace("/", "") if isinstance(x, str) else np.nan)
merged_df.head()

Unnamed: 0,Disease,symptoms,treatments,Description,Symptoms,Treatments
0,acne,"['Blackheads', 'Whiteheads', 'Pimples', 'Cysts...",['Topical creams and gels containing benzoyl p...,"Acne vulgaris is the formation of comedones, p...","blackheads, whiteheads, pimples, cysts, nodules",topical creams and gels containing benzoyl per...
1,appendicitis,['Pain in the lower right side of the abdomen'...,"['Surgery to remove the appendix', 'Antibiotic...",Appendicitis is a condition where the appendix...,"pain in the lower right side of the abdomen, n...","surgery to remove the appendix, antibiotics to..."
2,arthritis,"['Joint pain and stiffness', 'Swelling and red...","['Pain medication', 'Anti-inflammatory drugs',...",Arthritis is the swelling and tenderness of on...,"joint pain and stiffness, swelling and redness...","pain medication, anti-inflammatory drugs, phys..."
3,asthma,"['Wheezing', 'Coughing', 'Shortness of breath'...","['Bronchodilators to open up the airways', 'Co...",Asthma is a chronic condition where the airway...,"wheezing, coughing, shortness of breath, chest...","bronchodilators to open up the airways, cortic..."
4,atherosclerosis,"['Chest pain', 'Shortness of breath', 'Weaknes...",['Lifestyle changes such as quitting smoking a...,Atherosclerosis is a condition where plaque bu...,"chest pain, shortness of breath, weakness or n...",lifestyle changes such as quitting smoking and...


In [11]:
disease_df = merged_df.drop(columns=['symptoms', 'treatments'])
disease_df.head()

Unnamed: 0,Disease,Description,Symptoms,Treatments
0,acne,"Acne vulgaris is the formation of comedones, p...","blackheads, whiteheads, pimples, cysts, nodules",topical creams and gels containing benzoyl per...
1,appendicitis,Appendicitis is a condition where the appendix...,"pain in the lower right side of the abdomen, n...","surgery to remove the appendix, antibiotics to..."
2,arthritis,Arthritis is the swelling and tenderness of on...,"joint pain and stiffness, swelling and redness...","pain medication, anti-inflammatory drugs, phys..."
3,asthma,Asthma is a chronic condition where the airway...,"wheezing, coughing, shortness of breath, chest...","bronchodilators to open up the airways, cortic..."
4,atherosclerosis,Atherosclerosis is a condition where plaque bu...,"chest pain, shortness of breath, weakness or n...",lifestyle changes such as quitting smoking and...


In [12]:
corona_descr = 'COVID-19 is the disease caused by a new coronavirus called SARS-CoV-2. It is a kind of virus that causes an infection in your nose, sinuses, or upper throat. The most recently discovered coronavirus causes coronavirus disease COVID-19.'
corona_sympt = '''Fever
Dry cough
Fatigue
Loss of taste or smell
Nasal congestion
Conjunctivitis (also known as red eyes)
Sore throat
Headache
Muscle or joint pain
Different types of skin rash
Nausea or vomiting
Diarrhea
Chills or dizziness'''
corona_symptoms = ', '.join([item.lower() for item in corona_sympt.split('\n')])
corona_treatment = '''Getting enough rest, staying well hydrated, and taking medications to relieve fever and aches and pains.
Communication with a treating physician and promptly report in case of any worsening.
FDA approved medicines like Remdesivir, Paxlovid, Molnupiravir, Tocilizumab'''
corona_treatment = ', '.join([item.lower() for item in corona_treatment.split('\n')])

In [13]:
# Create a dictionary with the data for the new row
new_row = {
    'Disease': 'covid-19',
    'Symptoms': corona_symptoms,
    'Treatments': corona_treatment,
    'Description': corona_descr
}

# Convert the dictionary to a DataFrame
new_row_df = pd.DataFrame(new_row, index=[0])

# Append the new row to the original DataFrame
disease_df = disease_df.append(new_row_df, ignore_index=True)
disease_df[disease_df['Disease'] == 'covid-19']

  disease_df = disease_df.append(new_row_df, ignore_index=True)


Unnamed: 0,Disease,Description,Symptoms,Treatments
292,covid-19,COVID-19 is the disease caused by a new corona...,"fever, dry cough, fatigue, loss of taste or sm...","getting enough rest, staying well hydrated, an..."


In [14]:
disease_df['Disease'] = disease_df['Disease'].str.strip()

In [15]:
df3= pd.read_csv('disease_symptoms.csv')
df3['Disease'] = df3['Disease'].str.lower().str.strip()
df3.drop_duplicates(subset='Disease', inplace=True)
df3.shape

(41, 18)

In [16]:
for col in df3.columns[1:]:
  df3[col] = df3[col].str.replace('_', ' ')

df3['Symptoms'] = df3[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 
                      'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 
                      'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']].apply(lambda x: ', '.join(x.dropna()), axis=1)

df3 = df3[['Disease', 'Symptoms']]
df3.head()                    

Unnamed: 0,Disease,Symptoms
0,fungal infection,"itching, skin rash, nodal skin eruptions, d..."
10,allergy,"continuous sneezing, shivering, chills, wa..."
20,gerd,"stomach pain, acidity, ulcers on tongue, v..."
30,chronic cholestasis,"itching, vomiting, yellowish skin, nausea, ..."
40,drug reaction,"itching, skin rash, stomach pain, burning m..."


In [17]:
df_precaution = pd.read_csv('symptom_precaution.csv')
for col in df_precaution.columns[1:]:
  df_precaution[col] = df_precaution[col].str.lower()

In [18]:
df_precaution['Disease'] = df_precaution['Disease'].str.lower()
df_precaution['Treatments'] = df_precaution[['Precaution_1', 'Precaution_2', 'Precaution_3',
                                             'Precaution_4']].apply(lambda x: ', '.join(x.dropna()), axis=1)
df_precaution = df_precaution[['Disease', 'Treatments']]                                    

In [19]:
df_precaution.drop_duplicates(subset='Disease', inplace=True)

In [20]:
df_precaution['Disease'] = df_precaution['Disease'].str.strip()

In [21]:
df_precaution.shape, df3.shape

((41, 2), (41, 2))

In [22]:
new_merge = pd.merge(df_precaution, df3, on='Disease', how='outer')
new_merge.drop_duplicates(subset='Disease', inplace=True)
new_merge.shape

(41, 3)

In [23]:
disease_df.shape, new_merge.shape

((293, 4), (41, 3))

In [24]:
big_merge = pd.merge(disease_df, new_merge, on='Disease', how='outer')
big_merge.shape

(294, 6)

In [25]:
big_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294 entries, 0 to 293
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       294 non-null    object
 1   Description   293 non-null    object
 2   Symptoms_x    269 non-null    object
 3   Treatments_x  269 non-null    object
 4   Treatments_y  41 non-null     object
 5   Symptoms_y    41 non-null     object
dtypes: object(6)
memory usage: 16.1+ KB


In [26]:
big_merge['Symptoms'] = big_merge['Symptoms_x'].fillna(big_merge['Symptoms_y'])
big_merge['Treatments'] = big_merge['Treatments_x'].fillna(big_merge['Treatments_y'])
diseases_db = big_merge[['Disease', 'Description', 'Symptoms', 'Treatments']]
diseases_db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      294 non-null    object
 1   Description  293 non-null    object
 2   Symptoms     293 non-null    object
 3   Treatments   293 non-null    object
dtypes: object(4)
memory usage: 11.5+ KB


In [27]:
diseases_db[diseases_db.isna().any(axis=1)]

Unnamed: 0,Disease,Description,Symptoms,Treatments
278,dimorphic hemorrhoids(piles),"Hemorrhoids, also spelled haemorrhoids, are va...",,
293,dimorphic hemmorhoids(piles),,"constipation, pain during bowel movements, ...","avoid fatty spicy food, consume witch hazel, w..."


In [28]:
diseases_db.loc[293, 'Description'] = 'Hemorrhoids, also spelled haemorrhoids, are vascular structures in the anal canal.'
diseases_db.loc[293, 'Disease'] = 'dimorphic hemorrhoids (piles)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_db.loc[293, 'Description'] = 'Hemorrhoids, also spelled haemorrhoids, are vascular structures in the anal canal.'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_db.loc[293, 'Disease'] = 'dimorphic hemorrhoids (piles)'


In [29]:
diseases_db.dropna(inplace=True)
diseases_db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293 entries, 0 to 293
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      293 non-null    object
 1   Description  293 non-null    object
 2   Symptoms     293 non-null    object
 3   Treatments   293 non-null    object
dtypes: object(4)
memory usage: 11.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_db.dropna(inplace=True)


In [30]:
diseases_db.loc[273, 'Disease'] = 'paroymsal positional vertigo (vertigo)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_db.loc[273, 'Disease'] = 'paroymsal positional vertigo (vertigo)'


In [31]:
diseases_db = diseases_db.sort_values('Disease')
diseases_db

Unnamed: 0,Disease,Description,Symptoms,Treatments
0,acne,"Acne vulgaris is the formation of comedones, p...","blackheads, whiteheads, pimples, cysts, nodules",topical creams and gels containing benzoyl per...
238,adenoiditis,Adenoiditis is an inflammation of the adenoids...,"difficulty breathing through the nose, snoring...","antibiotics, nasal decongestants, surgery to r..."
141,aids,Acquired immunodeficiency syndrome (AIDS) is a...,"fever, fatigue, swollen lymph nodes, rapid wei...","antiretroviral therapy, prophylaxis for opport..."
287,alcoholic hepatitis,"Alcoholic hepatitis is a diseased, inflammator...","vomiting, yellowish skin, abdominal pain, ...","stop alcohol consumption, consult doctor, medi..."
269,allergy,An allergy is an immune system response to a f...,"continuous sneezing, shivering, chills, wa...","apply calamine, cover area with bandage, use i..."
...,...,...,...,...
237,west nile virus infection,West Nile virus (WNV) is a viral infection tha...,"fever, headache, body aches, fatigue, back pai...",there is no specific treatment for wnv infecti...
264,whiplash,Whiplash is a neck injury that occurs when the...,"neck pain and stiffness, headache, shoulder pa...","pain medication, physical therapy, chiropracti..."
265,wilson's disease,Wilson's disease is a rare genetic disorder th...,"fatigue, abdominal pain, jaundice, tremors, di...","chelation therapy, zinc therapy, liver transpl..."
266,yeast infection,A yeast infection is a common fungal infection...,"vaginal itching, burning sensation during urin...","antifungal creams, ointments, or suppositories..."


In [33]:
diseases_db.to_csv('diseases_db.csv', index=False)

In [34]:
disease_list = diseases_db['Disease'].tolist()

In [37]:
import yaml

data = {
    "examples": disease_list,
    "lookup": "disease",
    "nlu": ""
}

with open('diseases.yml', 'w') as file:
  yaml.dump(data, file)