## Scraping using Selenium

*Prepared by:*
**Chelsea Fernandez**  
DATA102 Student

### Installation of Dependencies

In [5]:
%pip install selenium
%pip install webdriver_manager
%pip install --upgrade webdriver_manager

Note: you may need to restart the kernel to use updated packages.


### Selenium: Set up browser automation

In [27]:
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

In [35]:
# Start a Chrome session. ChromeDriverManager().install() supplies the
# chromedriver location that the Service object needs.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Blog post whose images are extracted in the cells that follow.
url = "https://www.un-fancy.com/daily-outfits/the-34-dress-youll-wear-all-summer/"
driver.get(url)

# Keep the full page HTML as the browser currently has it; later cells parse it.
html_content = driver.page_source

# Print only a short preview: the complete document is very large and would
# flood the cell output. The full text stays available in `html_content`.
print(html_content[:1000])

<html lang="en-US"><head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>The $34 dress you’ll wear all summer</title>
<meta name="robots" content="max-image-preview:large">
<link rel="dns-prefetch" href="//stats.wp.com">
<link rel="dns-prefetch" href="//maxcdn.bootstrapcdn.com">
<link rel="dns-prefetch" href="//fonts.googleapis.com">
<link rel="dns-prefetch" href="//i0.wp.com">
<link rel="alternate" type="application/rss+xml" title="Un-Fancy » Feed" href="https://www.un-fancy.com/feed/">
<link rel="alternate" type="application/rss+xml" title="Un-Fancy » Comments Feed" href="https://www.un-fancy.com/comments/feed/">
<link rel="alternate" type="application/rss+xml" title="Un-Fancy » The $34 dress you’ll wear all summer Comments Feed" href="https://www.un-fancy.com/daily-outfits/the-34-dress-youll-wear-all-summer/feed/">
<link rel="canonical" href="https://www.un-fancy.com/daily-outfits/the-34-dress-youll-wear-all-summer/">
<script type=

### Getting the Images

In [37]:
# Turn the captured page source into a BeautifulSoup tree that the
# following cells can query, using the "html.parser" backend.
soup = BeautifulSoup(markup=html_content, features="html.parser")

#### Bigger images, located with a CSS selector

In [40]:
# Locate the <p> that is the first child of div.entry-content inside the article.
# NOTE: this path mirrors the page's current markup and will stop matching
# if the site layout changes.
p_element = soup.select_one('body > div.site-inner > div > div > main > article > div.entry-content > p:nth-child(1)')

# Start from empty results so both names are always defined after this cell,
# even when the selector matches nothing.
images = []
image_sources = []

if p_element is not None:
    # src=True keeps only <img> tags that actually have a src attribute,
    # so the img['src'] lookup below cannot raise KeyError.
    images = p_element.find_all('img', src=True)
    image_sources = [img['src'] for img in images]
    # Print one image URL per line
    for source in image_sources:
        print(source)
else:
    print("Element not found.")

https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-216.jpg?resize=1080%2C1440
https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-221.jpg?resize=1080%2C1440
https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-220.jpg?resize=1080%2C1440
https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-219.jpg?resize=1080%2C1440
https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-218.jpg?resize=1080%2C1440
https://i0.wp.com/www.un-fancy.com/wp-content/uploads/2024/04/spring-2024-capsule-wardrobe-outfit-ideas-217.jpg?resize=1080%2C1440


#### Smaller images (in the carousel gallery), located by class name

In [33]:
# Find the first element with class="shopthepost-widget" (the product carousel)
container = soup.find(class_="shopthepost-widget")

# Start from empty results so both names are always defined after this cell.
images = []
image_sources = []

if container is not None:
    # src=True keeps only <img> tags that actually have a src attribute,
    # so the img['src'] lookup below cannot raise KeyError.
    images = container.find_all('img', src=True)
    image_sources = [img['src'] for img in images]
    # Print one image URL per line
    for source in image_sources:
        print(source)
else:
    # soup.find returns None when no element has the class; report it
    # instead of failing with AttributeError on container.find_all.
    print("Element not found.")

https://product-images-cdn.liketoknow.it/dxZyQW2vXQEoAtLB6F0B1oGi_lrUG5vteZvHHNcmJjXq75oI.o.oa1LCp9MxMUIRZjb26yVRCLd_ENPmf5F.bz6.ma7p.YkWYWxB3zceEp8KDwxcwi4-?v=2&max&height=0&width=100
https://product-images-cdn.liketoknow.it/Ac8ZTr7NrwPOXRc_5PusQV0_gC7u_OiyW2EYPPRzdRmhWyCN.XOGlhsCQyYmxSvw11lcErSj_kf0nGY4xurIPSPb9bnXhs8ub0p3PefAjEfN42bWrAw-?v=2&max&height=0&width=100
https://product-images-cdn.liketoknow.it/bZrBEel1fZj_2nb4KxBQyuFkgYL5JqHy57ItSzyXEAcP9Wix3BwJR94PS9VfU.9LzzWM4avReX6Pbj1TNHcTW3HXXrHI9eHdYOa7zYDv3tVMg6GQ2ZaABdLAlDfix1Y3O6DHVl85wNQcjWhxjvnHpWZNMsAuA_UhR49H8UVAlVuDf3_qj4jd4EE1e.drqYCOS2gfoS6WUfTx2bw7IAZAcZfv7bRe.4I04QqN?v=2&max&height=0&width=100
https://product-images-cdn.liketoknow.it/vjNVimmdwD8V0tS7d7KD0G6SCidEvU8w_IwiQSq86pgkpBTnTPjnGxus8k7noO_edWuMY0nkOMAr8GZWiYQZf9sIQCX_C.kj22uMo.Ehu1Ze3mH0FmozHUDl26RJ7eMAUVvQJfulWmzs_JvnhyzPXwrojtEShf5eYGY3u0qRXUT3MAM9HWD5Lgwb_2yZ2wXbDFfr0Fo0cS9chiwEML_R5o3HnW9EofucF_9sVQ--?v=2&max&height=0&width=100
https://product-images-cdn.liket

In [41]:
# End the Selenium session created by webdriver.Chrome(...) earlier in the
# notebook so the browser and its driver process do not keep running.
# The parsed `soup` is built from the saved page source and remains usable.
driver.quit()