## Scraping

In [1]:
from constanst import get_ISA_raw_HTML

In [2]:
# Raw HTML that the next cell parses for project links.
# NOTE(review): get_ISA_raw_HTML is a project helper; presumably it returns the
# project listing page markup as text/bytes — confirm against its definition.
html_doc=get_ISA_raw_HTML()

## Get project links

In [3]:
from urllib.parse import urljoin
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

# Find all <td> tags with class="actions"
action_tds = soup.find_all('td', class_='actions')

# Links inside those cells are resolved against the site root.
base_url = 'https://demoplants21.best-research.eu'

# Take the first <a> carrying an href from each cell. Cells without such an
# anchor are skipped instead of failing on `None['href']` / a missing attribute.
anchors = (td.find('a', href=True) for td in action_tds)
links = [urljoin(base_url, anchor['href']) for anchor in anchors if anchor is not None]

print(links)

['https://demoplants21.best-research.eu/projects/info/3641/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3726/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3717/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3886/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3717/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3774/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3844/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3061/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3701/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3970/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3969/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3065/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3904/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3405/8JBaZy', 'https://demoplants21.best-research.eu/projects/info/3256/8JB

In [4]:
# Sanity check: number of links collected (the list is not de-duplicated)
len(links)

750

## Extract info from Links

In [7]:
import requests
from bs4 import BeautifulSoup
import json

def _rows_to_dict(table):
    """Map first-cell text to second-cell text for every two-cell row of ``table``."""
    pairs = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly two <td> cells are treated as key/value pairs.
        if len(cells) == 2:
            pairs[cells[0].text.strip()] = cells[1].text.strip()
    return pairs


def extract_table_data(url, count):
    """Download one project page and collect its tables into a dictionary.

    Args:
        url: Address of the project page to fetch.
        count: Running number of the page; only used in the progress message.

    Returns:
        dict with these keys:
          * "Project_Info": key/value pairs from the table inside the second
            ``<div class="well">`` (empty dict when that div or table is absent).
          * "Geodata", "Production", "Additional Information": key/value pairs
            from the table that follows the matching ``<legend>``.
          * "Contact Information": stripped text of the first cell of its table.
          * "Files": absolute URL of the first image in its table, or the
            string "No file available".
        A legend key is omitted when the legend, its sibling ``<div>``, or the
        table (or, for contact information, the first cell) is not on the page.

    Raises:
        requests.RequestException: The request failed, timed out, or returned
            an HTTP error status.
        ValueError: The page has no ``viewproject_content`` modal body.
    """
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import urljoin

    print(f"urls {count} : {url}" )
    # The timeout keeps one unresponsive page from hanging the whole crawl;
    # raise_for_status stops an HTTP error page from being parsed as a project.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    base_url = "https://demoplants21.best-research.eu"
    soup = BeautifulSoup(response.content, 'html.parser')

    # Everything of interest lives in the div with class "modal-body" and
    # id "viewproject_content".
    div_content = soup.find('div', class_='modal-body', id='viewproject_content')
    if div_content is None:
        raise ValueError(f"No project content found at {url}")

    data = {}

    # Project information sits in the second <div class="well">.
    wells = div_content.find_all('div', class_='well')
    project_table = wells[1].find('table') if len(wells) > 1 else None
    data["Project_Info"] = _rows_to_dict(project_table) if project_table is not None else {}

    # Section names are fixed; each is looked up by the exact legend text.
    legends = ['Geodata', 'Production', 'Additional Information', 'Contact Information', 'Files']

    for legend in legends:
        legend_element = div_content.find('legend', string=legend)
        if legend_element is None:
            continue

        # The section's table is inside the next <div> sibling of the legend.
        container = legend_element.find_next_sibling('div')
        table = container.find('table') if container is not None else None
        if table is None:
            continue

        if legend == 'Contact Information':
            # Contact information is kept as one block of text.
            cell = table.find('td')
            if cell is not None:
                data[legend] = cell.text.strip()
        elif legend == 'Files':
            image = table.find('img')
            source = image.get('src') if image is not None else None
            # urljoin copes with root-relative, relative and absolute src values.
            data[legend] = urljoin(base_url, source) if source else "No file available"
        else:
            data[legend] = _rows_to_dict(table)

    return data

# Number of detail pages fetched in this trial run.
MAX_LINKS = 5

all_data = {}
# Single-URL list kept for manual experiments; the loop below does not use it.
test_links = ['https://demoplants21.best-research.eu/projects/info/3701/8JBaZy']

# Results are keyed by URL, so a URL that occurs twice among the first
# MAX_LINKS entries of `links` ends up as a single entry.
for count, link in enumerate(links[:MAX_LINKS], start=1):
    all_data[link] = extract_table_data(link, count)

# Save all data to a file
with open("test_data.json", "w", encoding="utf-8") as file:
    json.dump(all_data, file, indent=4)

print("All data saved to test_data.json")


urls 1 : https://demoplants21.best-research.eu/projects/info/3641/8JBaZy
urls 2 : https://demoplants21.best-research.eu/projects/info/3726/8JBaZy
urls 3 : https://demoplants21.best-research.eu/projects/info/3717/8JBaZy
urls 4 : https://demoplants21.best-research.eu/projects/info/3886/8JBaZy
urls 5 : https://demoplants21.best-research.eu/projects/info/3717/8JBaZy
All data saved to test_data.json


## Convert to table

In [21]:
import json

# Read the scraped records back from test_data.json (path is relative to the
# working directory)
with open('test_data.json') as saved_records:
    data = json.load(saved_records)

# `data` is a plain dictionary keyed by project URL
print(data)

{'https://demoplants21.best-research.eu/projects/info/3641/8JBaZy': {'Project_Info': {'Project Owner...': 'Aanevoima Oy', 'Project name': 'Aanekoski power plant', 'Status': 'operational'}, 'Geodata': {'Country': 'Finland', 'City': 'Äänekoski'}, 'Production': {'Type': 'TRL 9 Commercial', 'Technology': 'Bubbling Fluidized Bed', 'Raw Material': 'biomass / biomass coal blends', 'Input 1': 'biomass', 'Input 2': 'peat, sludge, HFO', 'Output 1': 'power (electricity) (38 MWel )', 'Output 2': 'heat (230 MWth)'}, 'Additional Information': {'Technology Brief': '3 boilers; BFB and 2 oil burners, 173 MW fuel input', 'Additional Information': 'http://www.ieabcc.nl/database/info/cofiring/info.php?id=195'}, 'Contact Information': 'n.a.'}, 'https://demoplants21.best-research.eu/projects/info/3726/8JBaZy': {'Project_Info': {'Project Owner...': 'Aarhus University', 'Project name': 'Center for Biorefining Technologies', 'Status': 'operational', 'Startup': '2015'}, 'Geodata': {'Country': 'Denmark', 'City':

In [22]:
import pandas as pd

# orient='index' makes each top-level key of `data` (a project URL) a row and
# each second-level key (a page section) a column.
df = pd.DataFrame.from_dict(data, orient='index')

In [23]:
# Display the frame built from the scraped JSON
df

Unnamed: 0,Project_Info,Geodata,Production,Additional Information,Contact Information
https://demoplants21.best-research.eu/projects/info/3641/8JBaZy,"{'Project Owner...': 'Aanevoima Oy', 'Project ...","{'Country': 'Finland', 'City': 'Äänekoski'}","{'Type': 'TRL 9 Commercial', 'Technology': 'Bu...",{'Technology Brief': '3 boilers; BFB and 2 oil...,n.a.
https://demoplants21.best-research.eu/projects/info/3726/8JBaZy,"{'Project Owner...': 'Aarhus University', 'Pro...","{'Country': 'Denmark', 'City': 'Foulum'}","{'Type': 'TRL 4-5 Pilot', 'Technology': 'PVC3:...",,Patrick Biller \r\npbiller@eng.au.dk
https://demoplants21.best-research.eu/projects/info/3717/8JBaZy,{'Project Owner...': 'Advanced Biofuels Soluti...,"{'Country': 'United Kingdom', 'City': 'Swindon'}","{'Type': 'TRL 8 First-of-a-kind commercial', '...","{'Total Investment': 'GBP 30,000,000', 'Fundin...","Tel: 01793 832 860\r\nUnit A4, Marston Gate\r\..."
https://demoplants21.best-research.eu/projects/info/3886/8JBaZy,{'Project Owner...': 'Advanced Biofuels Soluti...,"{'Country': 'United Kingdom', 'City': 'Swindon'}","{'Type': 'TRL 6-7 Demonstration', 'Technology'...",{'Technology Brief': 'Follow on from GoGreenGa...,Andy Cornell info@absl.tech +44 1793 832860


In [24]:

# Helper that pulls 'Country', 'City', 'ZIP', and 'State' out of one Geodata entry
def extract_geodata(geodata):
    """Return a four-element Series: country, city, ZIP and state.

    Each value is looked up with ``geodata.get``; keys that are absent give
    ``None``. If ``geodata`` has no ``get`` method (i.e. it is not dict-like),
    all four values are ``None``.
    """
    fields = ('Country', 'City', 'ZIP', 'State')
    try:
        values = [geodata.get(field, None) for field in fields]
    except AttributeError:
        # 'Geodata' was not a dictionary for this row
        values = [None] * len(fields)
    return pd.Series(values)

# Expand every 'Geodata' entry into four values and store them as new columns
geo_columns = ['Country', 'City', 'ZIP', 'State']
geo_values = df['Geodata'].apply(extract_geodata)
df[geo_columns] = geo_values

df


Unnamed: 0,Project_Info,Geodata,Production,Additional Information,Contact Information,Country,City,ZIP,State
https://demoplants21.best-research.eu/projects/info/3641/8JBaZy,"{'Project Owner...': 'Aanevoima Oy', 'Project ...","{'Country': 'Finland', 'City': 'Äänekoski'}","{'Type': 'TRL 9 Commercial', 'Technology': 'Bu...",{'Technology Brief': '3 boilers; BFB and 2 oil...,n.a.,Finland,Äänekoski,,
https://demoplants21.best-research.eu/projects/info/3726/8JBaZy,"{'Project Owner...': 'Aarhus University', 'Pro...","{'Country': 'Denmark', 'City': 'Foulum'}","{'Type': 'TRL 4-5 Pilot', 'Technology': 'PVC3:...",,Patrick Biller \r\npbiller@eng.au.dk,Denmark,Foulum,,
https://demoplants21.best-research.eu/projects/info/3717/8JBaZy,{'Project Owner...': 'Advanced Biofuels Soluti...,"{'Country': 'United Kingdom', 'City': 'Swindon'}","{'Type': 'TRL 8 First-of-a-kind commercial', '...","{'Total Investment': 'GBP 30,000,000', 'Fundin...","Tel: 01793 832 860\r\nUnit A4, Marston Gate\r\...",United Kingdom,Swindon,,
https://demoplants21.best-research.eu/projects/info/3886/8JBaZy,{'Project Owner...': 'Advanced Biofuels Soluti...,"{'Country': 'United Kingdom', 'City': 'Swindon'}","{'Type': 'TRL 6-7 Demonstration', 'Technology'...",{'Technology Brief': 'Follow on from GoGreenGa...,Andy Cornell info@absl.tech +44 1793 832860,United Kingdom,Swindon,,


In [25]:
# Fourth row by position. The index holds URL strings, so plain `[3]` would
# rely on the deprecated positional fallback of Series.__getitem__; .iloc is explicit.
df['Project_Info'].iloc[3]

  df['Project_Info'][3]


{'Project Owner...': 'Advanced Biofuels Solutions Ltd',
 'Project name': 'ABSL bio-SNG demonstrator',
 'Status': 'under construction',
 'Startup': '2020'}