## Scraping Phytochemical Names and Links to Their Details

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# OSADHI page that lists the phytochemicals.
url = "https://neist.res.in/osadhi/chemo.php"

# Browser-like request headers; get headers from "https://www.whatismybrowser.com/"
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/130.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

In [3]:
# Download and parse the listing page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were data.
webpage = requests.get(url, headers=HEADERS, timeout=30)
webpage.raise_for_status()
soup = BeautifulSoup(webpage.content, "html.parser")

In [11]:
# All <a> tags that carry an href attribute. The first 12 and the last 9 of
# them are dropped purely by position (unwanted items), which leaves the
# phytochemical links.
anchors_with_href = soup.find_all("a", href=True)
chemo_name_lst = anchors_with_href[12:-9]
chemo_name_lst

[<a href="chemodetail.php?phyto=%09auricularic+acid">	auricularic acid</a>,
 <a href="chemodetail.php?phyto=%09d-glucose">	d-glucose</a>,
 <a href="chemodetail.php?phyto=%09EUDESMANE">	EUDESMANE</a>,
 <a href="chemodetail.php?phyto=%09orientin">	orientin</a>,
 <a href="chemodetail.php?phyto=%09oxalic+acid">	oxalic acid</a>,
 <a href="chemodetail.php?phyto=%09vitexin">	vitexin</a>,
 <a href="chemodetail.php?phyto=%0A15-O-acetylbruceolide">
 15-O-acetylbruceolide</a>,
 <a href="chemodetail.php?phyto=%0Aamataine">
 amataine</a>,
 <a href="chemodetail.php?phyto=%0AClauraila+B">
 Clauraila B</a>,
 <a href="chemodetail.php?phyto=%0Acleistanthin+A">
 cleistanthin A</a>,
 <a href="chemodetail.php?phyto=%0Adelavirdine">
 delavirdine</a>,
 <a href="chemodetail.php?phyto=%0Adicerandrol+C">
 dicerandrol C</a>,
 <a href="chemodetail.php?phyto=%0Agaboroquinone+A">
 gaboroquinone A</a>,
 <a href="chemodetail.php?phyto=%0Agigantetronenin">
 gigantetronenin</a>,
 <a href="chemodetail.php?phyto=%0AGirin

In [14]:
# Number of anchors kept after the positional slice.
len(chemo_name_lst)

21434

In [15]:
# Column-oriented accumulator: one list per output column.
chem_names_chemo_dict = dict(Phytochemical=[], Link=[])

In [16]:
# Record the display name and the absolute detail-page URL of every
# phytochemical anchor. Both values are worked out before either list is
# touched, so the two columns can never end up with different lengths.
for chemo_name in chemo_name_lst:
    href = chemo_name.get('href')
    if href is None:
        continue  # no link to record: skip the entry in both columns
    phytochemical = chemo_name.text.strip()  # anchor text holds the phytochemical name
    final_link = "https://neist.res.in/osadhi/" + href  # hrefs on the page are relative
    chem_names_chemo_dict['Phytochemical'].append(phytochemical)
    chem_names_chemo_dict['Link'].append(final_link)

In [17]:
# Turn the two parallel lists into a two-column table and preview it.
chem_names_chemo_df = pd.DataFrame.from_dict(chem_names_chemo_dict)
chem_names_chemo_df.head()

Unnamed: 0,Phytochemical,Link
0,auricularic acid,https://neist.res.in/osadhi/chemodetail.php?ph...
1,d-glucose,https://neist.res.in/osadhi/chemodetail.php?ph...
2,EUDESMANE,https://neist.res.in/osadhi/chemodetail.php?ph...
3,orientin,https://neist.res.in/osadhi/chemodetail.php?ph...
4,oxalic acid,https://neist.res.in/osadhi/chemodetail.php?ph...


In [18]:
# Save the name/link table; index=False keeps the row index out of the file.
chem_names_chemo_df.to_csv("OSADHI_Chemo_ChemName_Links.csv", index=False)

# Scraping each plant's specific details

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from tqdm import tqdm
import json

In [3]:
# Browser-like request headers sent with every page download.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/130.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

In [4]:
# Download and parse one phytochemical detail page to explore its layout.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() fails loudly on an HTTP error instead of letting an
# error page be explored as if it were the real one.
url2 = "https://neist.res.in/osadhi/chemodetail.php?phyto=%0Adicerandrol+C"
webpage2 = requests.get(url2, headers=HEADERS, timeout=30)
webpage2.raise_for_status()
soup2 = BeautifulSoup(webpage2.content, "html.parser")

In [5]:
# The divs with class "card-header" hold the section titles of the detail
# page; print each title on its own line.
main_dropdowns = soup2.find_all("div", class_='card-header')

for header_div in main_dropdowns:
    section_title = header_div.text.strip()
    print(section_title)

Physiochemical Properties
Medicinal Chemistry Properties
Drug Likeliness
Absorption
Distribution
Metabolism
Excretion
Toxicity
Antiviral Prediction


In [6]:
# Inspect the first div with class "card-body" (find() returns only the first match).
soup2.find("div", class_='card-body')

<div class="card-body">
<table class="table table-striped">
<tr>
<td width="25%">Molecular Weight</td>
<td width="25%">750.706</td>
<td width="25%">nRot</td>
<td width="25%">7</td>
</tr>
<tr>
<td>Heavy Atom Molecular Weight</td>
<td>712.402</td>
<td>nRig</td>
<td>40</td>
</tr>
<tr>
<td>Exact Molecular Weight</td>
<td>750.216</td>
<td>nRing</td>
<td>6</td>
</tr>
<tr>
<td>Solubility: LogS</td>
<td>-4.399</td>
<td>nHRing</td>
<td>2</td>
</tr>
<tr>
<td>Solubility: LogP</td>
<td>3.08</td>
<td>No. of Aliphatic Rings</td>
<td>4</td>
</tr>
<tr>
<td>Acid Count</td>
<td>0</td>
<td>No. of Aromatic Rings</td>
<td>2</td>
</tr>
<tr>
<td>Base Count</td>
<td>0</td>
<td>No. of Aliphatic Carbocycles Rings</td>
<td>2</td>
</tr>
<tr>
<td>Atoms Count</td>
<td>92</td>
<td>No. of Aliphatic Hetero Cycles</td>
<td>2</td>
</tr>
<tr>
<td>No. of Heavy Atom</td>
<td>54</td>
<td>No. of Aromatic Carbocycles</td>
<td>2</td>
</tr>
<tr>
<td>nHetero</td>
<td>16</td>
<td>No. of Aromatic Hetero Cycles</td>
<td>0</td>
</tr

In [14]:
# Every <td> cell on the page; in the output the cells alternate label, value.
soup2.find_all('td')

[<td width="25%">Molecular Weight</td>,
 <td width="25%">750.706</td>,
 <td width="25%">nRot</td>,
 <td width="25%">7</td>,
 <td>Heavy Atom Molecular Weight</td>,
 <td>712.402</td>,
 <td>nRig</td>,
 <td>40</td>,
 <td>Exact Molecular Weight</td>,
 <td>750.216</td>,
 <td>nRing</td>,
 <td>6</td>,
 <td>Solubility: LogS</td>,
 <td>-4.399</td>,
 <td>nHRing</td>,
 <td>2</td>,
 <td>Solubility: LogP</td>,
 <td>3.08</td>,
 <td>No. of Aliphatic Rings</td>,
 <td>4</td>,
 <td>Acid Count</td>,
 <td>0</td>,
 <td>No. of Aromatic Rings</td>,
 <td>2</td>,
 <td>Base Count</td>,
 <td>0</td>,
 <td>No. of Aliphatic Carbocycles Rings</td>,
 <td>2</td>,
 <td>Atoms Count</td>,
 <td>92</td>,
 <td>No. of Aliphatic Hetero Cycles</td>,
 <td>2</td>,
 <td>No. of Heavy Atom</td>,
 <td>54</td>,
 <td>No. of Aromatic Carbocycles</td>,
 <td>2</td>,
 <td>nHetero</td>,
 <td>16</td>,
 <td>No. of Aromatic Hetero Cycles</td>,
 <td>0</td>,
 <td>nBridge Head</td>,
 <td>0</td>,
 <td>No. Saturated Carbocycles</td>,
 <td>2</td>,
 

In [32]:
# Accumulator for the value cells. The next cell appends to it, so re-run
# this cell first when repeating the extraction to avoid duplicated values.
chemo_values = []

In [33]:
inside_info = soup2.find_all('td')

# Cells come as label, value, label, value, ...; the values sit at the odd
# positions, so take every second cell starting from index 1.
chemo_values.extend(cell.text.strip() for cell in inside_info[1::2])

In [34]:
# Prints the integers 0 through 34, one per line.
# NOTE(review): the output stored under this cell is a list of strings, which
# this code cannot produce - the cell was probably edited after it last ran.
for i in range(35):
    print(i)

['750.706',
 '7',
 '712.402',
 '40',
 '750.216',
 '6',
 '-4.399',
 '2',
 '3.08',
 '4',
 '0',
 '2',
 '0',
 '2',
 '92',
 '2',
 '54',
 '2',
 '16',
 '0',
 '0',
 '2',
 '38',
 '0',
 '38',
 '2',
 '0',
 '12',
 '16',
 '12',
 '16',
 '101.63',
 '4',
 '53.7459',
 '0.235',
 '5.581',
 '1.063',
 '0.511',
 'Rejected',
 'Accepted',
 'Rejected',
 'Rejected',
 '0.972',
 '0.869',
 '0.889',
 '-5.293',
 '0.0000537',
 '0.003',
 '0.821678',
 '0.405',
 '0.181937',
 '0.02',
 '0.036',
 '0.035',
 '0.061',
 '0.12',
 '0.24',
 '0.187',
 '0.096',
 '0.576',
 '0.286',
 '1.18',
 '0.024',
 '0.012',
 '0.021',
 '0.999',
 '0.023',
 '0.028',
 '0.017',
 '0.043',
 '0.098',
 'Yes',
 '0.864983']

In [27]:
# Template for one phytochemical detail page: each section title maps to the
# property labels listed under it, every value starting as an empty string.
# Note: a later cell rebinds chemo_items_dict to a dict of empty lists.
chemo_items_dict = {
'Physiochemical Properties' : {
    'Molecular Weight':'',
    'nRot':'',
    'Heavy Atom Molecular Weight':'',
    'nRig':'',
    'Exact Molecular Weight':'',
    'nRing':'',
    'Solubility: LogS':'',
    'nHRing':'',
    'Solubility: LogP':'',
    'No. of Aliphatic Rings':'',
    'Acid Count':'',
    'No. of Aromatic Rings':'',
    'Base Count':'',
    'No. of Aliphatic Carbocycles Rings':'',
    'Atoms Count':'',
    'No. of Aliphatic Hetero Cycles':'',
    'No. of Heavy Atom':'',
    'No. of Aromatic Carbocycles':'',
    'nHetero':'',
    'No. of Aromatic Hetero Cycles':'',
    'nBridge Head':'',
    'No. Saturated Carbocycles':'',
    'No. of Hydrogen atom':'',
    'No. of Saturated Hetero Cycles':'',
    'No. of Carbon atom':'',
    'No. of Saturated Rings':'',
    'No. of Nitrogen atom':'',
    'No. of Arom Atom':'',
    'No. of Oxygen atom':'',
    'No. of Arom Bond':'',
    'nHA':'',
    'APOL':'',
    'nHD':'',
    'BPOL':''
    },

'Medicinal Chemistry Properties' : {
    'QED':'',
    'Synth':'',
    'NaturalProduct Likeliness':'',
    'NR-PPAR-gamma':''
    },

'Drug Likeliness' : {
    'Lipinski':'',
    'Pfizer':'',
    'GSK':'',
    'Golden Triangle':''
    },

'Absorption' : {
    'Pgp-inh':'',
    'Pgp-sub':'',
    'HIA':'',
    'CACO-2':''
    },

'Distribution' : {
    'MDCK':'',
    'BBB':'',
    'PPB':'',
    'VDSS':''
    },

'Metabolism' : {
    'FU':'',
    'CYP1A2-inh':'',
    'CYP1A2-sub':'',
    'CYP2c19-inh':'',
    'CYP2c19-sub':'',
    'CYP2c9-inh':'',
    'CYP2c9-sub':'',
    'CYP2d6-inh':'',
    'CYP2d6-sub':'',
    'CYP3a4-inh':'',
    'CYP3a4-sub':''
    },

'Excretion' : {
    'CL':'',
    'T12':''
    },

'Toxicity' : {
    'hERG':'',
    'Ames':'',
    'ROA':'',
    'SkinSen':'',
    'Carcinogencity':'',
    'EI':'',
    'Respiratory':'',
    'NR-Aromatase':''
    },

'Antiviral Prediction' : {
    'Antiviral':'',
    'Prediction':''
    }
}

In [None]:
# Scratch check with a placeholder value: index by section title first, then by property label.
chemo_items_dict['Antiviral Prediction']['Antiviral'] = 'testt' # adding item to nested dict

In [None]:
# Flat layout: every section title maps to its own, initially empty, list.
# Rebinding the name replaces whatever chemo_items_dict held before.
chemo_items_dict = {
    section_title: []
    for section_title in (
        'Physiochemical Properties',
        'Medicinal Chemistry Properties',
        'Drug Likeliness',
        'Absorption',
        'Distribution',
        'Metabolism',
        'Excretion',
        'Toxicity',
        'Antiviral Prediction',
    )
}

In [None]:
# Main result store: the scraping loop adds one entry per plant, keyed by
# the plant's scientific name.
all_plant_info_dict = {}

def get_plant_website_html(plant_link, timeout=30):
    """Download a plant detail page and return it parsed.

    Args:
        plant_link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a long scraping run forever.

    Returns:
        BeautifulSoup tree of the response body, parsed with "html.parser".

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    plant_webpage = requests.get(plant_link, headers=HEADERS, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page as data.
    plant_webpage.raise_for_status()
    return BeautifulSoup(plant_webpage.content, "html.parser")

def add_per_plant_details(soup, per_plant_details_dict=None):
    """Collect the detail sections of a plant page into a dict of lists.

    The <ul> elements inside the page's first <table> are read in document
    order and assigned, by position, to the six section names below. Every
    child node of a <ul> contributes its ``.text`` to that section's list.

    Args:
        soup: Parsed plant page; ``soup.table.find_all('ul')`` is called on it.
        per_plant_details_dict: Unused. The function always builds a fresh
            dict; the parameter is kept (now optional) so that existing calls
            passing a second argument keep working.

    Returns:
        dict mapping each section name to a list of strings. A section whose
        <ul> is missing from the page is left as an empty list.

    Raises:
        AttributeError: If the page has no <table> (``soup.table`` is None).
    """
    section_names = (
        'Summary',
        'Statewise availability',
        'Phytochemicals',
        'Ailments cured',
        'Plant parts and method of its use',
        'Vernacular name',
    )
    details = {name: [] for name in section_names}

    # Query the table once instead of once per section.
    section_lists = soup.table.find_all('ul')

    # zip() stops at the shorter sequence: lists beyond the sixth are ignored
    # (as before) and a page with fewer lists leaves the remaining sections
    # empty instead of raising IndexError.
    for name, ul in zip(section_names, section_lists):
        details[name].extend(child.text for child in ul)

    return details

In [None]:
# Table of plants to scrape; the 'Scientific Name' and 'Link' columns are read by later cells.
# NOTE(review): no cell visible in this notebook writes this CSV - confirm where it comes from.
df = pd.read_csv("OSADHI_Ethno_Scientific_Name_Links.csv")
df.head(3)

In [None]:
# Row position to start from; earlier rows are not scraped by this cell.
# NOTE(review): presumably those rows were covered by an earlier run - confirm.
START_INDEX = 1494
PAUSE_EVERY = 50    # number of pages fetched between pauses
PAUSE_SECONDS = 2   # length of each pause, to go easy on the server

plant_names = df['Scientific Name'].iloc[START_INDEX:]
plant_links = df['Link'].iloc[START_INDEX:]

# scientific name -> error description, for pages that could not be scraped
failed_plants = {}

progress = tqdm(zip(plant_names, plant_links), total=len(plant_names), desc="Processing")
for request_count, (plant_name, plant_link) in enumerate(progress, start=1):
    try:
        plant_website_html = get_plant_website_html(plant_link) # getting HTML of plant details website
        per_plant_details = add_per_plant_details(plant_website_html, all_plant_info_dict) # collecting plant info to a dict
    except (requests.RequestException, AttributeError, IndexError) as err:
        # Network failure, or a page without the expected table/lists.
        # Record it and carry on so one bad page does not abort the whole run.
        failed_plants[plant_name] = repr(err)
    else:
        all_plant_info_dict[plant_name] = per_plant_details # adding collected plant info to a main dict with plant sci name as key

    if request_count % PAUSE_EVERY == 0:
        time.sleep(PAUSE_SECONDS) # after continuous 50 runs, long break for server

if failed_plants:
    print(f"{len(failed_plants)} plant page(s) could not be scraped; see failed_plants")

In [None]:
# Write everything collected in all_plant_info_dict to one JSON file.
output_path = "OSADHI_all_plant_details_json_file_NEW.json"
with open(output_path, mode="w") as json_out:
    json.dump(all_plant_info_dict, json_out)

# Merging JSONs

In [None]:
# Combine the two partial result files into one. dict.update() means an
# entry from the NEW file wins whenever both files contain the same key.
with open('OSADHI_all_plant_details_json_file_1.json') as first_part:
    data1 = json.load(first_part)

with open('OSADHI_all_plant_details_json_file_NEW.json') as second_part:
    data2 = json.load(second_part)

data1.update(data2)

with open('OSADHI_all_plant_details_json_file.json', 'w') as merged_out:
    json.dump(data1, merged_out)

In [None]:
# Read the merged file back and show how many entries it holds.
with open('OSADHI_all_plant_details_json_file.json') as merged_in:
    item_dict = json.load(merged_in)

len(item_dict)

In [None]:
# Plain Python list of every scientific name in the link table.
sci_name_lst = df['Scientific Name'].to_list()

In [None]:
# Keep only the names that have no entry in item_dict, i.e. the plants that
# are still missing. Filtering drops every occurrence of a scraped name
# (list.remove() would drop just the first one, leaving duplicates listed as
# missing) and avoids a full list scan per key. The slice assignment updates
# the existing list in place.
sci_name_lst[:] = [name for name in sci_name_lst if name not in item_dict]

In [None]:
# Names from the link table that have no entry in item_dict.
sci_name_lst

In [None]:
# Displays the entire merged dict; the output can be very long.
item_dict

# Converting JSON to CSV

In [None]:
import json
import pandas as pd

In [None]:
# Load the scraped plant details (scientific name -> dict of section lists).
# NOTE(review): this file name differs from the merged file written earlier
# in the notebook - confirm it is the intended input.
with open('OSADHI_all_plants.json') as plants_json:
    item_dict = json.load(plants_json)

In [None]:
# item_dict's SUMMARY column DataFrame (df)

# Column names are the text before the first ':' of each Summary item of the
# first plant. An empty item_dict yields no headers instead of an IndexError.
first_entry = next(iter(item_dict.values()), {})
headers = [item.partition(':')[0] for item in first_entry.get('Summary', [])]  # use first entry to get headers
# headers expected output ['Scientific Name', 'Genus', 'Species', 'Family', 'Synonym']

rows = []

for plant_name, plant_data in item_dict.items():
    summary = plant_data.get('Summary', []) # extracting Summary values
    # Summary items are matched to headers by position. partition(':') keeps
    # everything after the FIRST colon, so a value that itself contains ':'
    # is not truncated, and an item without any ':' gives '' instead of
    # raising IndexError. Plants with fewer items get '' for the rest.
    row = {
        header: summary[idx].partition(':')[2] if idx < len(summary) else ''
        for idx, header in enumerate(headers)
    }
    rows.append(row)

df = pd.DataFrame(rows, columns=headers)
df.head()

Unnamed: 0,Scientific Name,Genus,Species,Family,Synonym
0,Abelmoschus crinitus,Abelmoschus,crinitus,Malvaceae,
1,Abelmoschus esculentus,Abelmoschus,esculentus,Malvaceae,
2,Abelmoschus ficulneus,Abelmoschus,ficulneus,Malvaceae,"Hibiscus ficulneus, Abelmoschus ficulneus,etc."
3,Abelmoschus manihot,Abelmoschus,manihot,Malvaceae,
4,Abelmoschus moschatus,Abelmoschus,moschatus,Malvaceae,


In [None]:
# item_dict's remaining column DataFrame (df1)

df1 = pd.DataFrame.from_dict(item_dict, orient='index')
df1 = df1.map(lambda x: ', '.join(x) if isinstance(x, list) else x)
df1.drop('Summary', axis=1, inplace=True)
df1.head()

Unnamed: 0,Statewise availability,Phytochemicals,Ailments cured,Plant parts and method of its use,Vernacular name
Abelmoschus crinitus,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...",,"Dysentery,","Whole Plant\r\n : Extract Taken Orally,",
Abelmoschus esculentus,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","4-O-BETA-D-GALACTOPYRANOSYL-D-GALACTOSE, 9-HEX...","Antispasmodic, Demulcent\r\n, Diaphoretic\r\n,...","Leaves\r\n : Cooked Taken Orally,","Andhra Pradesh : Benda\r, Karnataka : Bende\r,..."
Abelmoschus ficulneus,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...","Malvalic acid, Sterculic acid, Epoxyoleic acid,","Asthma\r\n, Wound Healing\r\n, Wound And Injur...","Whole Plant\r\n : Extract Taken Orally, Seeds\...","Assam : Dhopattita\r, Maharashtra : etari\r, T..."
Abelmoschus manihot,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","PROTEIN, GLYCOLIPIDS,","Cardioprotective\r\n,","Leaves\r\n : Cooked Taken Orally,",
Abelmoschus moschatus,"Andhra Pradesh, Chattisgarh, Jharkhand, Karnat...","PROTEIN, MALVALIC-ACID, STERCULIC-ACID, TRANS-...","Oxidant\r\n,","Leaves\r\n : Extract Taken Orally, Roots\r\n :...","Jharkhand : Musk okra\r, Karnataka : Latha\r, ..."


In [None]:
# merging both dfs, df and df1

merged_df = pd.concat([df.reset_index(drop=True), df1.reset_index(drop=True)], axis=1)
merged_df.head()

Unnamed: 0,Scientific Name,Genus,Species,Family,Synonym,Statewise availability,Phytochemicals,Ailments cured,Plant parts and method of its use,Vernacular name
0,Abelmoschus crinitus,Abelmoschus,crinitus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...",,"Dysentery,","Whole Plant\r\n : Extract Taken Orally,",
1,Abelmoschus esculentus,Abelmoschus,esculentus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","4-O-BETA-D-GALACTOPYRANOSYL-D-GALACTOSE, 9-HEX...","Antispasmodic, Demulcent\r\n, Diaphoretic\r\n,...","Leaves\r\n : Cooked Taken Orally,","Andhra Pradesh : Benda\r, Karnataka : Bende\r,..."
2,Abelmoschus ficulneus,Abelmoschus,ficulneus,Malvaceae,"Hibiscus ficulneus, Abelmoschus ficulneus,etc.","Andhra Pradesh, Chattisgarh, Maharastra, Odish...","Malvalic acid, Sterculic acid, Epoxyoleic acid,","Asthma\r\n, Wound Healing\r\n, Wound And Injur...","Whole Plant\r\n : Extract Taken Orally, Seeds\...","Assam : Dhopattita\r, Maharashtra : etari\r, T..."
3,Abelmoschus manihot,Abelmoschus,manihot,Malvaceae,,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","PROTEIN, GLYCOLIPIDS,","Cardioprotective\r\n,","Leaves\r\n : Cooked Taken Orally,",
4,Abelmoschus moschatus,Abelmoschus,moschatus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Jharkhand, Karnat...","PROTEIN, MALVALIC-ACID, STERCULIC-ACID, TRANS-...","Oxidant\r\n,","Leaves\r\n : Extract Taken Orally, Roots\r\n :...","Jharkhand : Musk okra\r, Karnataka : Latha\r, ..."


In [None]:
# Save the combined table; index=False keeps the row index out of the file.
merged_df.to_csv("OSADHI_ethno_all_plant_details.csv", index=False)