Animal Web Scraper 2

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time 
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import pandas as pd


In [34]:
# Result containers, reset on every run of this cell.
# The plain lists hold one entry per scraped animal.
animal_names = []
animal_types = []
animal_descriptions = []
distributions_p_list = []
# The dicts map a label scraped from the page to a list of values, one list
# per label; each label later becomes its own DataFrame column.
animal_population_threats = {}
animal_population_stats = {}
animal_attributes = {}
animal_distributions = {}

# Path to your chromedriver executable
service = Service(executable_path="chromedriver.exe")
driver = webdriver.Chrome(service=service)

try:
    # Open the list page
    driver.get("https://animalia.bio/endemic-lists/country/endemic-animals-of-philippines")

    # Wait until the first link inside the 'rgt-block' div can be clicked
    link_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@class='rgt-block']//a"))
    )
    initial_url = driver.current_url
    link_element.click()

    # Fixed pause kept from the original flow.
    # NOTE(review): presumably gives the target page time to render; confirm
    # whether the explicit wait below is sufficient on its own.
    time.sleep(10)

    # Block (up to 10 s) until the browser has left the list page
    WebDriverWait(driver, 10).until(
        EC.url_changes(initial_url)
    )

    new_url = driver.current_url
except Exception:
    # Navigation failed (for example one of the waits timed out). Close the
    # browser before re-raising, otherwise every failed run of this cell
    # leaves a Chrome/chromedriver process behind.
    driver.quit()
    raise

print(new_url)  # will print the URL

def _store_cell(table, column, value, row):
    """Write ``value`` into ``table[column]`` at index ``row``.

    ``table`` maps a column label to a list with one entry per scraped
    animal. Rows the column has no entry for yet are filled with "" so the
    value always lands on the row of the current animal. When the same label
    occurs more than once for one animal, the values are joined with "; "
    instead of being appended as an extra entry, which would make this
    column longer than the others.
    """
    cells = table.setdefault(column, [])
    missing = row - len(cells)
    if missing > 0:
        cells.extend([""] * missing)
    if len(cells) == row:
        cells.append(value)
    elif cells[row]:
        cells[row] = f"{cells[row]}; {value}"
    else:
        cells[row] = value


if new_url != initial_url:
    try:
        # Animal name: the active breadcrumb entry (waits up to 10 s for it)
        input_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[@class='breadcrumb-link breadcrumb-active']"))
        )
        # Animal type
        heading = driver.find_element(By.XPATH, "//p[@class='s-char-heading__name']")
        # Attribute labels and their values, paired by position further down
        attributes = driver.find_elements(By.XPATH, "//div[@class='s-char-kinds__attr']")
        names = driver.find_elements(By.XPATH, "//a[@class='s-char-kinds__name']")

        # Expand the description when a "read more" link is present. Using
        # find_elements means a page without the link is still scraped
        # instead of being dropped by a lookup error.
        read_more_links = driver.find_elements(By.XPATH, "//div[@class='s-char-text']//a[@class='show-more read-more-show']")
        if read_more_links:
            read_more_links[0].click()
            time.sleep(3)  # fixed pause after the click, kept from the original
        description = driver.find_element(By.XPATH, "//div[@class='s-char-text']")

        # distribution
        distributions_cat = driver.find_elements(By.XPATH, "//div[@class='s-distr-content']//div[@class='s-distr-geography__slug']")
        distributions_name = driver.find_elements(By.XPATH, "//div[@class='s-distr-content']//a[@class='s-distr-geography__link ']")
        distributions_p = driver.find_element(By.XPATH, "//div[@class='s-distr-content']//p")
        # population
        popu_threats = driver.find_elements(By.XPATH, "//div[@class='s-population-content']//h3[@class='a-h3']")
        popu_p = driver.find_elements(By.XPATH, "//div[@class='s-population-content']//p")
        popu_trend = driver.find_elements(By.XPATH, "//div[@class='s-population-link']//div//div//div")
        popu_stats = driver.find_elements(By.XPATH, "//div[@class='s-population-link']//a")

        # Read every text BEFORE touching the result containers, so an error
        # raised while reading cannot leave a half-written row behind.
        name_text = input_element.text
        type_text = heading.text
        description_text = description.text
        distribution_text = distributions_p.text
        attribute_pairs = [(attr.text, name.text) for attr, name in zip(attributes, names)]
        distribution_pairs = [(cat.text, name.text) for cat, name in zip(distributions_cat, distributions_name)]
        threat_pairs = [(threat.text, para.text) for threat, para in zip(popu_threats, popu_p)]
        stat_pairs = [(trends.text, stats.text) for trends, stats in zip(popu_trend, popu_stats)]

        # Index of the row this animal occupies in every column
        row = len(animal_names)
        animal_names.append(name_text)
        animal_types.append(type_text)
        animal_descriptions.append(description_text)
        distributions_p_list.append(distribution_text)

        # One column per scraped label, aligned to this animal's row
        for table, pairs in (
            (animal_attributes, attribute_pairs),
            (animal_distributions, distribution_pairs),
            (animal_population_threats, threat_pairs),
            (animal_population_stats, stat_pairs),
        ):
            for column, value in pairs:
                _store_cell(table, column, value, row)

    except Exception as e:
        # Best effort: report the failure and let the rest of the cell run
        print(f"Element not found: {e}")

# Target row count: the longest of the three core columns
max_len = max(map(len, (animal_names, animal_types, animal_descriptions)))

# Every list that becomes a DataFrame column: the four flat lists plus each
# per-label list stored in the four dictionaries.
columns_to_pad = [animal_names, animal_types, animal_descriptions, distributions_p_list]
for table in (animal_attributes, animal_distributions, animal_population_threats, animal_population_stats):
    columns_to_pad.extend(table.values())

# Fill short columns with "" up to max_len. Columns already at or above that
# length are left untouched (a non-positive multiplier yields an empty list).
for column in columns_to_pad:
    column.extend([""] * (max_len - len(column)))

# Debug aid: report the length of every column about to go into the DataFrame
for label, column in (
    ("animal_names", animal_names),
    ("animal_types", animal_types),
    ("animal_descriptions", animal_descriptions),
    ("distributions_p_list", distributions_p_list),
):
    print(f"Length of {label}: {len(column)}")

# Same report for each per-label column, one dictionary after the other
for table in (animal_attributes, animal_distributions, animal_population_threats, animal_population_stats):
    for key, value in table.items():
        print(f"Length of {key}: {len(value)}")

# Assemble the column mapping in output order: the three fixed columns, the
# per-label attribute and distribution columns, the free-text distribution
# paragraph, then the per-label population columns. If two sources share a
# label, the later value wins and the column keeps its first position.
data = {
    "Animal Name": animal_names,
    "Animal Type": animal_types,
    "Description": animal_descriptions,
    **animal_attributes,
    **animal_distributions,
    "Distribution Info": distributions_p_list,
    **animal_population_threats,
    **animal_population_stats,
}

df = pd.DataFrame(data)

# Write the table to CSV without the index column
df.to_csv("animal_data.csv", index=False)

# Shut the browser down
driver.quit()


https://www.animalia.bio/philippine-flying-lemur?endemic=14
Length of animal_names: 1
Length of animal_types: 1
Length of animal_descriptions: 1
Length of distributions_p_list: 1
Length of KINGDOM: 1
Length of PHYLUM: 1
Length of SUBPHYLUM: 1
Length of CLASS: 1
Length of ORDER: 1
Length of FAMILY: 1
Length of GENUS: 1
Length of CONTINENTS: 1
Length of SUBCONTINENTS: 1
Length of COUNTRIES: 1
Length of BIOGEOGRAPHICAL REALMS: 1
Length of WWF BIOMES: 1
Length of Population threats: 1
Length of Population number: 1
Length of Ecological niche: 1
Length of POPULATION TREND: 1
Length of POPULATION STATUS: 1


In [None]:
# #print
        # print(input_element.text +" - "+ heading.text)#print the animal name
        # # Append data to lists
        # animal_names.append(input_element.text)
        # animal_types.append(heading.text)
        # #print the animal attributes 
        # for attr, name in zip(attributes, names):
        #     print(attr.text+" - "+name.text)
        #     animal_attributes.append(attr.text + " - " + name.text)
        # #print the animal description
        # print(description.text)
        # animal_descriptions.append(description.text)
        # #print the animal distribution
        # for cat, name in zip(destributions_cat, destributions_name):
        #     print(cat.text +" - "+ name.text)
        #     animal_distributions.append(cat.text + " - " + name.text)
        # print(destributions_p.text)
        # animal_distributions.append(destributions_p.text)
        # #population
        # for threat, para in zip(popu_threats, popu_p):
        #     print(threat.text +" - "+ para.text)
        #     animal_population_threats.append(threat.text + " - " + para.text)
        # for trends, stats in zip(popu_trend, popu_stats):
        #     print(trends.text +" - "+ stats.text)
        #     animal_population_stats.append(trends.text + " - " + stats.text)