Animal Web Scraper 2

In [1]:
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [2]:
# Scrape every page listed in AnimalLinkList.csv and write one CSV row per link.
#
# Fixed fields (name, type, description, distribution paragraph) are stored in
# plain lists. Fields whose labels are discovered on the page itself
# (attributes, distribution categories, population threats / stats) are stored
# in dicts mapping label -> list of per-row values. After every page, every
# list is exactly len(animal_names) long, so a value always ends up on the row
# of the animal it was scraped from.

# Initialize lists to hold the data
animal_names = []
animal_types = []
animal_descriptions = []
animal_population_threats = {}
animal_population_stats = {}
animal_attributes = {}
animal_distributions = {}
distributions_p_list = []

# Set to an int to stop after that many links (useful while testing);
# None scrapes every link in the CSV.
max_scrapes = None


def _record_pairs(columns, pairs, row_count):
    """Store (label, value) pairs for the row currently being scraped.

    columns   -- dict mapping label -> list of per-row values (mutated in place)
    pairs     -- iterable of (label, value) strings found on the current page
    row_count -- number of rows so far, including the current one
    """
    for label, value in pairs:
        column = columns.setdefault(label, [])
        # Back-fill earlier rows that never had this label so the value lands
        # on the current row rather than on a previous animal's row.
        column.extend([""] * (row_count - 1 - len(column)))
        if len(column) < row_count:
            column.append(value)
        else:
            # Same label seen twice on one page: keep both values in one cell
            # instead of growing the column past the row count.
            column[-1] = f"{column[-1]}; {value}" if column[-1] else value


def _pad_columns(columns, row_count):
    """Right-pad every list in `columns` with "" up to `row_count` entries."""
    for column in columns.values():
        column.extend([""] * (row_count - len(column)))


# Read the CSV file containing links. This happens before the browser is
# launched so that a missing file does not leave a Chrome process behind.
link_table = pd.read_csv('AnimalLinkList.csv')
links = link_table['Links to Scrape']
if max_scrapes is not None:
    links = links.head(max_scrapes)

# Path to your chromedriver executable
service = Service(executable_path="chromedriver.exe")
driver = webdriver.Chrome(service=service)

keyed_columns = (
    animal_attributes,
    animal_distributions,
    animal_population_threats,
    animal_population_stats,
)

try:
    for url in links:
        print(f"Scraping: {url}")

        # Try to load the page, with error handling for TimeoutException
        try:
            driver.get(url)
        except TimeoutException:
            print(f"Failed to load page: {url}")
            # Record an all-blank row and continue to the next link
            animal_names.append("")
            animal_types.append("")
            animal_descriptions.append("")
            distributions_p_list.append("")
            for columns in keyed_columns:
                _pad_columns(columns, len(animal_names))
            continue

        # Each field below is read inside its own try block and appended
        # exactly once afterwards, so a failure in one field can never leave
        # the lists with different lengths. WebDriverException is the base
        # class of TimeoutException and NoSuchElementException and also covers
        # stale or non-interactable elements.

        # Animal name
        try:
            input_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//span[@class='breadcrumb-link breadcrumb-active']"))
            )
            name_text = input_element.text
        except WebDriverException:
            name_text = ""
        animal_names.append(name_text)
        row_count = len(animal_names)

        # Animal type
        try:
            type_text = driver.find_element(By.XPATH, "//p[@class='s-char-heading__name']").text
        except WebDriverException:
            type_text = ""
        animal_types.append(type_text)

        # Description
        try:
            # Check if the "show more" button exists
            show_more_button = driver.find_elements(By.XPATH, "//div[@class='s-char-text']//a[@class='show-more read-more-show']")
            if show_more_button:
                try:
                    # Click it to reveal the full description
                    show_more_button[0].click()
                    time.sleep(3)  # Wait for the content to load after clicking "show more"
                except WebDriverException as exc:
                    # Fall back to whatever text is visible without expanding
                    print(f"Could not expand description: {type(exc).__name__}")

            # Extract the description (whether or not "show more" was clicked)
            description_text = driver.find_element(By.XPATH, "//div[@class='s-char-text']").text
            print(description_text)
        except WebDriverException:
            description_text = ""
        animal_descriptions.append(description_text)

        # Distribution paragraph
        try:
            distribution_text = driver.find_element(By.XPATH, "//div[@class='s-distr-content']//p").text
        except WebDriverException:
            distribution_text = ""
        distributions_p_list.append(distribution_text)

        # Attributes (label -> value)
        try:
            attributes = driver.find_elements(By.XPATH, "//div[@class='s-char-kinds__attr']")
            names = driver.find_elements(By.XPATH, "//a[@class='s-char-kinds__name']")
            _record_pairs(
                animal_attributes,
                [(attr.text, name.text) for attr, name in zip(attributes, names)],
                row_count,
            )
        except WebDriverException:
            pass  # blanks for this row are filled in by the padding step below

        # Distribution categories (category -> place)
        try:
            distributions_cat = driver.find_elements(By.XPATH, "//div[@class='s-distr-content']//div[@class='s-distr-geography__slug']")
            distributions_name = driver.find_elements(By.XPATH, "//div[@class='s-distr-content']//a[@class='s-distr-geography__link ']")
            _record_pairs(
                animal_distributions,
                [(cat.text, name.text) for cat, name in zip(distributions_cat, distributions_name)],
                row_count,
            )
        except WebDriverException:
            pass  # blanks for this row are filled in by the padding step below

        # Population threats and statistics
        try:
            popu_threats = driver.find_elements(By.XPATH, "//div[@class='s-population-content']//h3[@class='a-h3']")
            popu_p = driver.find_elements(By.XPATH, "//div[@class='s-population-content']//p")
            popu_trend = driver.find_elements(By.XPATH, "//div[@class='s-population-link']//div//div//div")
            popu_stats = driver.find_elements(By.XPATH, "//div[@class='s-population-link']//a")
            _record_pairs(
                animal_population_threats,
                [(threat.text, para.text) for threat, para in zip(popu_threats, popu_p)],
                row_count,
            )
            _record_pairs(
                animal_population_stats,
                [(trends.text, stats.text) for trends, stats in zip(popu_trend, popu_stats)],
                row_count,
            )
        except WebDriverException:
            pass  # blanks for this row are filled in by the padding step below

        # Any label this page did not provide gets a blank for this row, which
        # keeps every column aligned with animal_names.
        for columns in keyed_columns:
            _pad_columns(columns, row_count)
finally:
    # Close the browser even if a page raised an unexpected error
    driver.quit()

# Ensure all lists have the same length before building the DataFrame
row_count = len(animal_names)
for columns in keyed_columns:
    _pad_columns(columns, row_count)
assert all(
    len(values) == row_count
    for values in (animal_types, animal_descriptions, distributions_p_list)
), "fixed columns are out of sync with animal_names"
assert all(
    len(values) == row_count
    for columns in keyed_columns
    for values in columns.values()
), "a label-keyed column is out of sync with animal_names"

# Create a DataFrame
data = {
    "Animal Name": animal_names,
    "Animal Type": animal_types,
    "Description": animal_descriptions,
}

# Add dynamic attribute columns
data.update(animal_attributes)
data.update(animal_distributions)
data["Distribution Info"] = distributions_p_list
data.update(animal_population_threats)
data.update(animal_population_stats)

df = pd.DataFrame(data)

# Save DataFrame to CSV
df.to_csv("animal1.csv", index=False)


# If there is no "show more" button, the description is read (and printed) directly.
# Without that check, a page lacking the button would have its description skipped.

Scraping: https://www.animalia.bio/philippine-flying-lemur?endemic=14
The Philippine flying lemur or Philippine colugo (Cynocephalus volans ), known locally as kagwang, is one of two species of colugo or "flying lemurs." It is monotypic of its genus. Although called a flying lemur, it cannot fly and is not a lemur. Instead, it glides as it leaps among trees.
The kagwang belongs to the order Dermoptera that contains only two species, one of which is found in the Philippines, while the other, the Sunda flying lemur, is found in Indonesia, Thailand, Malaysia, and Singapore. Recent research from genetic analysis suggests two other species, the Bornean flying lemur and the Javan flying lemur, may exist, as well, but they have yet to be officially classified.
Both species of Dermoptera are classified under the grandorder Euarchonta, which includes treeshrews and primates, as well as an extinct order of mammals, the Plesiadapiformes.
SHOW LESS
Scraping: https://www.animalia.bio/philippine-tar

In [3]:
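# NOTE(review): the commented-out snippet below does not match the scraper cell above
# (e.g. it uses `destributions_cat` and calls `.append` on what are now dicts) —
# it appears to be left over from an earlier version; confirm before reusing.
# #print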
        # print(input_element.text +" - "+ heading.text)#print the animal name
        # # Append data to lists
        # animal_names.append(input_element.text)
        # animal_types.append(heading.text)
        # #print the animal attributes 
        # for attr, name in zip(attributes, names):
        #     print(attr.text+" - "+name.text)
        #     animal_attributes.append(attr.text + " - " + name.text)
        # #print the animal description
        # print(description.text)
        # animal_descriptions.append(description.text)
        # #print the animal distribution
        # for cat, name in zip(destributions_cat, destributions_name):
        #     print(cat.text +" - "+ name.text)
        #     animal_distributions.append(cat.text + " - " + name.text)
        # print(destributions_p.text)
        # animal_distributions.append(destributions_p.text)
        # #population
        # for threat, para in zip(popu_threats, popu_p):
        #     print(threat.text +" - "+ para.text)
        #     animal_population_threats.append(threat.text + " - " + para.text)
        # for trends, stats in zip(popu_trend, popu_stats):
        #     print(trends.text +" - "+ stats.text)
        #     animal_population_stats.append(trends.text + " - " + stats.text)