In [12]:
import time
import requests
import pandas as pd
from tqdm import tqdm

from bs4 import BeautifulSoup
from selenium import webdriver

# Page whose image cards are scraped in the cells below.
PAGE_URL = 'https://stockmages.netlify.app'

# Start a Chrome WebDriver session and load the page in it.
browser = webdriver.Chrome()
browser.get(PAGE_URL)

### 1. Auto scroll to reach the bottom of the Page

In [13]:
def auto_scroll(browser, start, end, step, delay):
    """Scroll the page vertically in fixed increments, pausing after each one.

    Args:
        browser: Object exposing ``execute_script(script)``; the notebook
            passes a Selenium WebDriver.
        start: First vertical offset passed to ``window.scrollTo``.
        end: Exclusive upper bound for the offsets (``range`` semantics).
        step: Increment between consecutive offsets.
        delay: Seconds to sleep after every scroll command.

    Returns:
        None. A tqdm progress bar is shown while scrolling.
    """
    offsets = range(start, end, step)
    for offset in tqdm(offsets):
        # Horizontal position is pinned at 0; only the vertical offset moves.
        browser.execute_script(f"window.scrollTo(0,{offset})")
        time.sleep(delay)

In [14]:
# 3,000 scroll commands (offsets 0 .. 2,999,000 in steps of 1,000) with a
# 0.1 s pause after each; the recorded run below took about 7 minutes.
auto_scroll(browser, start=0, end=3_000_000, step=1000, delay=0.1)

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [07:11<00:00,  6.95it/s]


### 2. Scraping the Page Source

In [15]:
# Take the HTML the browser currently holds and parse it with Python's
# built-in 'html.parser' backend.
page_html = browser.page_source
soup = BeautifulSoup(page_html, 'html.parser')

### 3. Checking the number of images that have to be scraped

In [16]:
# Count the <div class="container"> elements; the scraping loop further down
# reads one image record from each of them.
containers = soup.find_all('div', class_ = 'container')
len(containers)

9104

### 4. Scraping the Image Details

In [17]:
# Build one [img_link, tags, likes, comments] row per <div class="container">.
data = []

for container in tqdm(soup.find_all('div', class_ = 'container')):

    img_link = container.find('img').get('src')

    # [7:] drops the first 7 characters of the span text — presumably a fixed
    # label in front of the tag list; confirm against the page markup.
    tags     = container.find('span', class_ = 'tag-color').text[7:].strip()

    # Fetch the likes/comments spans once instead of repeating the same
    # find()/find_all() lookup for each counter.
    counters = container.find('div', class_ = 'likes-comments').find_all('span')

    # [:-6] / [:-9] strip a fixed-width trailing suffix so that only the number
    # remains — presumably " Likes" and " Comments"; confirm against the markup.
    likes    = int(counters[0].text.strip()[:-6])
    comments = int(counters[1].text.strip()[:-9])

    data.append([img_link, tags, likes, comments])

100%|████████████████████████████████████████████████████████████████████████████| 9104/9104 [00:01<00:00, 4699.68it/s]


### 5. Saving data in a DataFrame

In [18]:
# Column order matches the order of the values in each scraped row.
column_names = ['img_link', 'tags', 'likes', 'comments']
df = pd.DataFrame(data, columns = column_names)

# Preview the first five rows.
df.head()

Unnamed: 0,img_link,tags,likes,comments
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15


### 6. Checking Null Values

In [19]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

img_link    0
tags        0
likes       0
comments    0
dtype: int64

### 7. Checking Duplicate Values

In [20]:
# A `unique` value lower than `count` in this summary means some image links
# occur more than once.
img_links = df['img_link']
img_links.describe()

count                                                  9104
unique                                                 9088
top       https://cdn.pixabay.com/photo/2022/03/06/05/30...
freq                                                      2
Name: img_link, dtype: object

### 8. Removing Duplicate Values

In [21]:
# The duplicate check in the previous step is keyed on `img_link`, so
# de-duplicate on that column explicitly. Without `subset`, only rows that are
# identical in every column are dropped, and a repeated link whose other
# fields differ would be kept. The first occurrence of each link is retained.
df = df.drop_duplicates(subset = 'img_link')

### 9. Saving the DataFrame as CSV

In [22]:
# Write the table to the working directory; index=False leaves the DataFrame's
# row index out of the file.
output_path = 'images.csv'
df.to_csv(output_path, index = False)

### 10. Closing the browser

In [23]:
# Shut down the Chrome WebDriver session opened at the top of the notebook.
browser.quit()