In [None]:
!pip install pandas
!pip install lxml
!pip install tqdm

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Scraping each plant's scientific name and the link to its details page

In [4]:
# Page whose plant links are scraped in the cells below
url = "https://neist.res.in/osadhi/ethno.php"

# Browser-like request headers; a User-Agent string can be looked up at
# "https://www.whatismybrowser.com/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

In [5]:
# A timeout keeps the cell from hanging forever if the server stops responding;
# raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of letting an
# error page be parsed as if it were the plant listing.
webpage = requests.get(url, headers=HEADERS, timeout=30)
webpage.raise_for_status()

In [6]:
# Parse the downloaded page body with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=webpage.content, features="html.parser")

In [7]:
# Collect every <a> tag that carries an href, then drop the first 12 and the
# last 9 matches, which are unwanted items.
# NOTE(review): the 12/-9 offsets depend on the page's current layout — re-check if the site changes.
sci_name_lst = soup.find_all("a", href=True)[12:-9]
sci_name_lst

[<a href="detail.php?name=Abelmoschus+crinitus"><i>Abelmoschus crinitus</i></a>,
 <a href="detail.php?name=Abelmoschus+esculentus"><i>Abelmoschus esculentus</i></a>,
 <a href="detail.php?name=Abelmoschus+ficulneus"><i>Abelmoschus ficulneus</i></a>,
 <a href="detail.php?name=Abelmoschus+manihot"><i>Abelmoschus manihot</i></a>,
 <a href="detail.php?name=Abelmoschus+moschatus"><i>Abelmoschus moschatus</i></a>,
 <a href="detail.php?name=Abies+alba"><i>Abies alba</i></a>,
 <a href="detail.php?name=Abies+balsamea"><i>Abies balsamea</i></a>,
 <a href="detail.php?name=Abies+canadensis"><i>Abies canadensis</i></a>,
 <a href="detail.php?name=Abies+densa"><i>Abies densa</i></a>,
 <a href="detail.php?name=Abies+grandis"><i>Abies grandis</i></a>,
 <a href="detail.php?name=Abies+nephrolepis"><i>Abies nephrolepis</i></a>,
 <a href="detail.php?name=Abies+nigra"><i>Abies nigra</i></a>,
 <a href="detail.php?name=Abies+pindrow"><i>Abies pindrow</i></a>,
 <a href="detail.php?name=Abies+pinsapo"><i>Abies p

In [8]:
# Column-oriented accumulator: one independent, initially empty list per output column.
sci_names_ethno_dict = {column: [] for column in ('Scientific Name', 'Link')}

In [9]:
# Build the name and link columns in lockstep: both values are computed before
# either list is touched, so a failure on one anchor can never leave the two
# lists with different lengths (which would break building a DataFrame from them).
for sci_name in sci_name_lst:
    try:
        plant_sci_name = sci_name.text
        final_link = "https://neist.res.in/osadhi/" + sci_name['href']
    except TypeError:
        # Best-effort: an item that cannot be read as an anchor is skipped as a whole.
        continue
    sci_names_ethno_dict['Scientific Name'].append(plant_sci_name)  # scientific name of the plant
    sci_names_ethno_dict['Link'].append(final_link)  # absolute link for the corresponding plant

In [10]:
# Turn the two parallel lists into a two-column table and preview the first rows.
sci_names_ethno_df = pd.DataFrame(data=sci_names_ethno_dict)
sci_names_ethno_df.head(n=5)

TypeError: Cannot convert numpy.ndarray to numpy.ndarray

In [None]:
# Persist the name/link table without the index column; a later cell reads this file back.
sci_names_ethno_df.to_csv(path_or_buf="OSADHI_Ethno_Scientific_Name_Links.csv", index=False)

# Scraping each plant's specific details

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from tqdm import tqdm
import json

In [None]:
# Browser-like request headers sent with every plant detail page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

In [None]:
# Main accumulator: scientific name -> per-plant details dict (filled by the scraping loop).
all_plant_info_dict = dict()

def get_plant_website_html(plant_link, timeout=30):
    """Download a plant's detail page and return it parsed as BeautifulSoup.

    Args:
        plant_link: URL of the plant detail page.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        A BeautifulSoup object built from the response body with "html.parser".

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    plant_webpage = requests.get(plant_link, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error response rather than parsing an error page as plant data.
    plant_webpage.raise_for_status()
    return BeautifulSoup(plant_webpage.content, "html.parser")

def add_per_plant_details(soup, per_plant_details_dict=None):
    """Extract the six detail sections of a plant page into a fresh dict.

    The first six ``<ul>`` elements found inside ``soup.table`` are mapped, in
    order, to the sections 'Summary', 'Statewise availability',
    'Phytochemicals', 'Ailments cured', 'Plant parts and method of its use'
    and 'Vernacular name'. Each section holds the ``.text`` of every child of
    its ``<ul>``.

    Args:
        soup: Parsed plant detail page (must expose ``.table``).
        per_plant_details_dict: Ignored; kept only so existing two-argument
            calls keep working. A new dict is always built and returned.

    Returns:
        dict mapping each section name to a list of strings. A section whose
        ``<ul>`` is missing (or every section, when the page has no table)
        maps to an empty list instead of raising.
    """
    section_names = (
        'Summary',
        'Statewise availability',
        'Phytochemicals',
        'Ailments cured',
        'Plant parts and method of its use',
        'Vernacular name',
    )

    # Look the lists up once instead of re-scanning the table for every section.
    table = soup.table
    ul_lists = table.find_all('ul') if table is not None else []

    details = {}
    for position, section in enumerate(section_names):
        if position < len(ul_lists):
            details[section] = [child.text for child in ul_lists[position]]
        else:
            # Page has fewer lists than expected: keep the key with no entries.
            details[section] = []

    return details

In [None]:
# Reload the name/link table saved earlier and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="OSADHI_Ethno_Scientific_Name_Links.csv")
df.head(n=3)

In [None]:
# NOTE(review): 1494 looks like a resume point from an earlier partial run — confirm;
# rows before that index are not scraped by this cell.
loop_range = range(1494, len(df['Scientific Name']))
big_pause = 0

for i in tqdm(loop_range, desc="Processing"):
    plant_name = df.at[i, 'Scientific Name']
    plant_link = df.at[i, 'Link']

    # Fetch the detail page, extract its sections, and store them in the main
    # dict under the plant's scientific name.
    all_plant_info_dict[plant_name] = add_per_plant_details(
        get_plant_website_html(plant_link), all_plant_info_dict
    )

    # After every 50 consecutive requests, give the server a longer break.
    big_pause += 1
    if not big_pause % 50:
        time.sleep(2)

In [None]:
# Serialise everything scraped in this run to a JSON file.
with open("OSADHI_all_plant_details_json_file_NEW.json", "w") as out_file:
    out_file.write(json.dumps(all_plant_info_dict))

# Merging JSONs

In [None]:
# Load the two partial scrape results.
with open('OSADHI_all_plant_details_json_file_1.json') as first_part:
    data1 = json.load(first_part)

with open('OSADHI_all_plant_details_json_file_NEW.json') as second_part:
    data2 = json.load(second_part)

# Fold the second file into the first; for a key present in both, data2's value wins.
data1.update(data2)

# Write the combined result to a single file.
with open('OSADHI_all_plant_details_json_file.json', 'w') as merged_out:
    json.dump(data1, merged_out)

In [None]:
# Read the merged file back and report how many plants it contains.
with open('OSADHI_all_plant_details_json_file.json') as merged_file:
    item_dict = json.load(merged_file)

len(item_dict)

In [None]:
# Every scientific name listed in the CSV, as a plain Python list.
sci_name_lst = df['Scientific Name'].to_list()

In [None]:
# Keep only the names that have no entry in item_dict (i.e. not scraped yet).
# A dict membership test is O(1), whereas list.remove() rescans the list for
# every key and deletes only the first match, so a name that appears twice in
# the CSV would wrongly stay in the "remaining" list.
# Slice assignment updates the existing list object in place, as remove() did.
sci_name_lst[:] = [name for name in sci_name_lst if name not in item_dict]

In [None]:
# Display the scientific names still in the list after the removal step above.
sci_name_lst

In [None]:
# Display the whole plant-details dictionary loaded from the merged JSON file.
item_dict

# Converting JSON to CSV

In [1]:
import json
import pandas as pd

In [2]:
# NOTE(review): no visible cell in this notebook writes 'OSADHI_all_plants.json' —
# presumably a renamed copy of the merged details file; confirm.
with open('OSADHI_all_plants.json') as plants_file:
    item_dict = json.load(plants_file)

In [3]:
# item_dict's SUMMARY column DataFrame (df)
# Each Summary item is split at its first ':' into a header part and a value part.

# next(iter(...), {}) avoids an IndexError when item_dict is empty.
first_entry = next(iter(item_dict.values()), {})
# use first entry to get headers
# headers expected output ['Scientific Name', 'Genus', 'Species', 'Family', 'Synonym']
headers = [item.split(':', 1)[0] for item in first_entry.get('Summary', [])]

rows = []

for plant_name, plant_data in item_dict.items():
    summary = plant_data.get('Summary', [])  # extracting Summary values
    row = {}
    for idx, header in enumerate(headers):
        # Match summary items with corresponding headers by position.
        # partition(':') splits on the FIRST colon only, so a value that itself
        # contains ':' is kept whole, and an item without any ':' yields ''
        # instead of raising IndexError.
        row[header] = summary[idx].partition(':')[2] if idx < len(summary) else ''

    rows.append(row)

df = pd.DataFrame(rows, columns=headers)
df.head()

Unnamed: 0,Scientific Name,Genus,Species,Family,Synonym
0,Abelmoschus crinitus,Abelmoschus,crinitus,Malvaceae,
1,Abelmoschus esculentus,Abelmoschus,esculentus,Malvaceae,
2,Abelmoschus ficulneus,Abelmoschus,ficulneus,Malvaceae,"Hibiscus ficulneus, Abelmoschus ficulneus,etc."
3,Abelmoschus manihot,Abelmoschus,manihot,Malvaceae,
4,Abelmoschus moschatus,Abelmoschus,moschatus,Malvaceae,


In [4]:
# DataFrame (df1) holding item_dict's remaining (non-Summary) columns.
# One row per plant (dict keys become the index); list cells are flattened
# into a single ', '-separated string, then the Summary column is dropped.

df1 = (
    pd.DataFrame.from_dict(item_dict, orient='index')
    .map(lambda cell: ', '.join(cell) if isinstance(cell, list) else cell)
    .drop(columns='Summary')
)
df1.head()

Unnamed: 0,Statewise availability,Phytochemicals,Ailments cured,Plant parts and method of its use,Vernacular name
Abelmoschus crinitus,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...",,"Dysentery,","Whole Plant\r\n : Extract Taken Orally,",
Abelmoschus esculentus,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","4-O-BETA-D-GALACTOPYRANOSYL-D-GALACTOSE, 9-HEX...","Antispasmodic, Demulcent\r\n, Diaphoretic\r\n,...","Leaves\r\n : Cooked Taken Orally,","Andhra Pradesh : Benda\r, Karnataka : Bende\r,..."
Abelmoschus ficulneus,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...","Malvalic acid, Sterculic acid, Epoxyoleic acid,","Asthma\r\n, Wound Healing\r\n, Wound And Injur...","Whole Plant\r\n : Extract Taken Orally, Seeds\...","Assam : Dhopattita\r, Maharashtra : etari\r, T..."
Abelmoschus manihot,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","PROTEIN, GLYCOLIPIDS,","Cardioprotective\r\n,","Leaves\r\n : Cooked Taken Orally,",
Abelmoschus moschatus,"Andhra Pradesh, Chattisgarh, Jharkhand, Karnat...","PROTEIN, MALVALIC-ACID, STERCULIC-ACID, TRANS-...","Oxidant\r\n,","Leaves\r\n : Extract Taken Orally, Roots\r\n :...","Jharkhand : Musk okra\r, Karnataka : Latha\r, ..."


In [7]:
# Put the Summary-derived columns (df) and the remaining columns (df1) side by side.
# Both indexes are reset first, so rows are paired by position rather than by label.

merged_df = pd.concat(
    objs=[df.reset_index(drop=True), df1.reset_index(drop=True)],
    axis="columns",
)
merged_df.head()

Unnamed: 0,Scientific Name,Genus,Species,Family,Synonym,Statewise availability,Phytochemicals,Ailments cured,Plant parts and method of its use,Vernacular name
0,Abelmoschus crinitus,Abelmoschus,crinitus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Maharastra, Odish...",,"Dysentery,","Whole Plant\r\n : Extract Taken Orally,",
1,Abelmoschus esculentus,Abelmoschus,esculentus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","4-O-BETA-D-GALACTOPYRANOSYL-D-GALACTOSE, 9-HEX...","Antispasmodic, Demulcent\r\n, Diaphoretic\r\n,...","Leaves\r\n : Cooked Taken Orally,","Andhra Pradesh : Benda\r, Karnataka : Bende\r,..."
2,Abelmoschus ficulneus,Abelmoschus,ficulneus,Malvaceae,"Hibiscus ficulneus, Abelmoschus ficulneus,etc.","Andhra Pradesh, Chattisgarh, Maharastra, Odish...","Malvalic acid, Sterculic acid, Epoxyoleic acid,","Asthma\r\n, Wound Healing\r\n, Wound And Injur...","Whole Plant\r\n : Extract Taken Orally, Seeds\...","Assam : Dhopattita\r, Maharashtra : etari\r, T..."
3,Abelmoschus manihot,Abelmoschus,manihot,Malvaceae,,"Andhra Pradesh, Chattisgarh, Karnataka, Kerala...","PROTEIN, GLYCOLIPIDS,","Cardioprotective\r\n,","Leaves\r\n : Cooked Taken Orally,",
4,Abelmoschus moschatus,Abelmoschus,moschatus,Malvaceae,,"Andhra Pradesh, Chattisgarh, Jharkhand, Karnat...","PROTEIN, MALVALIC-ACID, STERCULIC-ACID, TRANS-...","Oxidant\r\n,","Leaves\r\n : Extract Taken Orally, Roots\r\n :...","Jharkhand : Musk okra\r, Karnataka : Latha\r, ..."


In [13]:
# Write the combined table to CSV without the index column.
merged_df.to_csv(path_or_buf="OSADHI_ethno_all_plant_details.csv", index=False)