In [None]:
import os
import re
import shutil
import time
from collections import Counter
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from selenium import webdriver

# Start a Chrome session through Selenium (Chrome must be installed locally).
driver = webdriver.Chrome()

# ScienceDirect article pages to scrape, visited in this order.
urls = [
    "https://www.sciencedirect.com/science/article/pii/S026382310800013X?via%3Dihub",
    "https://www.sciencedirect.com/science/article/pii/S0141029617304996#t0025",
    "https://www.sciencedirect.com/science/article/pii/S026382311730215X#t0010",
    "https://www.sciencedirect.com/science/article/pii/S026382311930196X#tbl2",
    # Disabled entry, kept for reference:
    # "https://www.sciencedirect.com/science/article/pii/S0263823114003139#t0015",
]

# Directory that is scanned for files whose names start with
# `file_name_prefix` so they can be moved into the scrape folder.
# The working directory is used instead of a machine-specific absolute path
# so the scan looks where relative-path output is actually written.
directory = os.getcwd()

# Prefix used for downloaded image file names and matched by the scan above.
file_name_prefix = "Experiments on cold-formed steel columns with holes"

# Destination folder (relative to the working directory); created if missing.
folder_path = "scrape"
os.makedirs(folder_path, exist_ok=True)

# Seconds to pause after the first page opens so the user can log in manually.
LOGIN_WAIT_SECONDS = 40
# Seconds to pause after each navigation / click so the page can finish loading.
PAGE_LOAD_WAIT_SECONDS = 5
# Upper bound for one image request so a stalled download cannot hang the run.
DOWNLOAD_TIMEOUT_SECONDS = 30


def _wrap_table_html(html_snippet):
    """Return the snippet with label spans and an enclosing <div> forced to black text."""
    modified_html = html_snippet.replace('<span class="label">', '<span class="label" style="color: black;">')
    return f"<div style='color:black;'>{modified_html}</div>"


def _first_header_text(table_html):
    """Return the stripped text of the first <th> of the first <table>, or None if absent."""
    table = BeautifulSoup(table_html, 'html.parser').find('table')
    if table is None:
        return None
    first_header = table.find('th')
    return None if first_header is None else first_header.text.strip()


def _download_figures(image_urls, article_number):
    """Download each image URL into the scrape folder.

    Args:
        image_urls: Image URLs, in page order.
        article_number: 1-based position of the article in `urls`; it is part
            of every file name so that figures of different articles do not
            overwrite each other.
    """
    for i, image_url in enumerate(image_urls, start=1):
        file_name = f"{file_name_prefix}_article{article_number}_{i}.jpg"
        try:
            response = requests.get(image_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
            if response.status_code == 200:
                # Write straight into the scrape folder instead of relying on
                # a later move out of the working directory.
                with open(os.path.join(folder_path, file_name), "wb") as file:
                    file.write(response.content)
                print(f"Image {i} downloaded successfully!")
            else:
                print(f"Failed to download Image {i}.")
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading Image {i}: {e}")


def _move_prefixed_files():
    """Move files in `directory` whose names start with the prefix into the scrape folder."""
    if not os.path.isdir(directory):
        # Nothing to sweep; a missing directory must not abort the whole run.
        return
    for filename in os.listdir(directory):
        if filename.startswith(file_name_prefix):
            shutil.move(os.path.join(directory, filename), os.path.join(folder_path, filename))
            print(f"File '{filename}' moved to 'scrape' folder.")


def _merge_tables(table_htmls):
    """Merge the tables of one article side by side, aligned on row position.

    The "key" column name is the most frequent first-header text across the
    tables. Tables without a column containing that name (case-insensitive)
    are skipped. Within each table only the first column containing the name
    is kept, every column gets a ``_<table number>`` suffix, and the tables
    are outer-merged on their index. Finally the per-table key columns are
    concatenated row-wise into one column named after the key.

    Args:
        table_htmls: Wrapped HTML snippets, one per table.

    Returns:
        The merged DataFrame, or None when no table could be used.
    """
    header_names = [name for name in map(_first_header_text, table_htmls) if name is not None]
    if not header_names:
        return None
    key_name = Counter(header_names).most_common(1)[0][0]

    merged = None
    for i, table_html in enumerate(table_htmls, start=1):
        table = BeautifulSoup(table_html, 'html.parser').find('table')
        if table is None:
            print(f"Skipping Table {i}: no <table> element found")
            continue
        try:
            # Wrap in StringIO: passing literal HTML text is deprecated in newer pandas.
            df = pd.read_html(StringIO(str(table)))[0]
        except ValueError as e:
            print(f"Skipping Table {i}: {e}")
            continue

        # Flatten multi-row headers to their top level up front so that every
        # later name check works on plain labels, also for a single table.
        df.columns = df.columns.get_level_values(0)
        labels = [str(col) for col in df.columns]

        if not any(key_name.lower() in label.lower() for label in labels):
            print(f"Skipping Table {i} due to missing first column '{key_name}'")
            continue

        # Keep only the first column containing the key name. Selection is by
        # position because flattened headers may repeat the same label.
        key_positions = [pos for pos, label in enumerate(labels) if key_name in label]
        surplus = set(key_positions[1:])
        df = df.iloc[:, [pos for pos in range(len(labels)) if pos not in surplus]]

        # Suffix the columns with the table number to avoid name clashes.
        df = df.add_suffix(f"_{i}")

        if merged is None:
            merged = df
        else:
            print(merged.shape)
            print(df.shape)
            merged = pd.merge(merged, df, left_index=True, right_index=True, how='outer')

    if merged is None:
        return None

    key_columns = [col for col in merged.columns if key_name in str(col)]
    if key_columns:
        # Concatenate the key cells of each row, skipping missing values;
        # str() keeps non-string cells from breaking the join.
        merged[key_name] = [
            ''.join(str(value) for value in row if pd.notna(value))
            for row in merged[key_columns].itertuples(index=False, name=None)
        ]
        # As before, the first per-table key column is kept next to the combined one.
        merged = merged.drop(columns=key_columns[1:])
    return merged


# Iterate over the URLs; cnt is the 1-based article number used in output names.
for cnt, url in enumerate(urls, start=1):
    # Open the webpage
    driver.get(url)
    if cnt == 1:
        # Extra pause on the first page only, for manual login
        time.sleep(LOGIN_WAIT_SECONDS)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)

    # Expand the figure list when the page offers a "Show more figures" button
    button = driver.find_elements("xpath", "//span[contains(@class, 'button-link-text') and contains(text(), 'Show more figures')]")
    if button:
        button[0].click()
        time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Wait for the figures to load
    else:
        print("The 'Show more figures' button was not found.")

    # Retrieve the HTML content
    html_content = driver.page_source

    # Collect every table container and wrap it once for display, saving and parsing
    possible_htmls = re.findall(r'<div class="tables[^>]*>.*?</div>', html_content, flags=re.DOTALL)
    table_htmls = [_wrap_table_html(html_snippet) for html_snippet in possible_htmls]

    for i, table_html in enumerate(table_htmls, start=1):
        display(HTML(table_html))

        # The article number keeps later articles from overwriting these files
        with open(f"article_{cnt}_table_{i}_original.html", "w", encoding="utf-8") as file:
            file.write(table_html)

        print(f"Table {i} saved as text and original HTML files.")

    # Download and save the images
    figure_links = re.findall(r'<img alt=".*?" class="u-display-block" height=".*?" src="(.*?)" width=".*?">', html_content)
    print('Number of figures:', len(figure_links))
    figure_links = [link.replace('.sml', '.jpg') for link in figure_links]
    _download_figures(figure_links, cnt)

    # Also collect prefixed files that are still lying in `directory`
    _move_prefixed_files()

    # Merge the tables of this article; a page without usable tables is
    # reported and skipped instead of aborting the remaining URLs.
    merged_table = _merge_tables(table_htmls)
    if merged_table is None:
        print(f"No usable tables found for article {cnt}; nothing to merge.")
        continue

    # Print the merged DataFrame
    print('')
    print('')
    print('Merged Table of Article ', cnt)
    display(merged_table)
    print('')
    print('')
    filename = f"Merged Table of article {cnt}.csv"
    merged_table.to_csv(filename, index=False)
    print(f'file{cnt}_saved!')
    print('')
    print('')

# NOTE: the browser session is left open, as before; call driver.quit() once
# it is no longer needed.
