# **All the necessary Imports**

The purpose of this notebook is to download images from a website for academic purposes.

In [None]:
from bs4 import BeautifulSoup

import requests 
import urllib.request
import urllib
import json
import pandas as pd
import time
import unidecode as ud
import os
import pandas as pd

# **Downloading Script Function**

In [None]:
def _fetch_paintings(url):
    """Request one listing page and return its painting records as a list (possibly empty)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # The endpoint is requested with json=2 and answers with JSON, so parse the
    # body directly. Round-tripping it through an HTML parser treats '<' and '&'
    # inside titles/URLs as markup and re-escapes them on output.
    payload = json.loads(response.text)
    # 'Paintings' may be absent or null; treat both as "no paintings on this page".
    return payload.get('Paintings') or []


def _quote_url_path(link):
    """Return ``link`` with only its path component percent-encoded."""
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(link)
    # '%' is kept safe so a path that is already percent-encoded is not encoded twice.
    path = urllib.parse.quote(path, safe='/%')
    return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))


def download_imgs(styles, dataset_path, end_range, sub_url, delay_seconds=5):
    """Download every painting image listed on a range of paginated listing pages.

    For each page number ``x`` in ``range(1, end_range)`` the URL
    ``sub_url + str(x) + '&resultType=masonry'`` is requested, and every entry
    under the ``'Paintings'`` key of the JSON response is saved to
    ``dataset_path`` unless a file with the same name already exists there.

    Args:
        styles: Style label embedded in each file name and stored in the metadata.
        dataset_path: Directory the images are written to (created if missing).
        end_range: Exclusive upper bound of the page numbers, exactly as in
            ``range(1, end_range)``; pass ``last_page + 1`` to include the last page.
        sub_url: Listing URL up to and including the ``page=`` query parameter.
        delay_seconds: Pause after each page, to avoid being blocked by the server.

    Returns:
        A list with one metadata dict per painting encountered (both newly
        downloaded and already present), with the keys ``id``, ``title``,
        ``year``, ``artist_name``, ``image``, ``style`` and ``reference_name``.

    Raises:
        requests.HTTPError: If a listing page answers with an HTTP error status.
        KeyError: If a painting record lacks one of the expected fields.
    """
    image_no = 0
    metadata_imgs = []

    os.makedirs(dataset_path, exist_ok=True)

    for x in range(1, end_range):
        print(f'Loop: {x}')
        # Link from XHR Network Tab
        url = sub_url + str(x) + '&resultType=masonry'

        for painting in _fetch_paintings(url):
            img_id = painting['id']
            title = painting['title'].replace('"', '')
            year = painting['year']
            artist_name = painting['artistName'].replace('"', '')
            link = _quote_url_path(painting['image'])

            # Transliterate the title to ASCII and drop path separators.
            title = ud.unidecode(title).replace("/", "_")

            # The running counter keeps names unique across identical titles.
            image_no += 1
            # A '/' in any component (e.g. the artist name) would otherwise be
            # interpreted as a sub-directory when the file is written.
            img_name = f'{title}_{styles}_{artist_name}_{image_no}.jpg'.replace('/', '_')

            img_metadata = {
                "id": img_id,
                "title": title,
                "year": year,
                "artist_name": artist_name,
                "image": link,
                "style": styles,
                "reference_name": img_name,
            }

            full_name = os.path.join(dataset_path, img_name)  # Destination file for the image

            # Skip files that are already on disk so re-runs do not download duplicates.
            if os.path.isfile(full_name):
                print('______________________________________________________________________')
                print(f'{img_name} is already Saved')
                print('______________________________________________________________________')
            else:
                print(f'{img_name} : {title} to save!')
                urllib.request.urlretrieve(link, full_name)
                print(f'{img_name} : {title} Saved!')
                print('-------------------------------------')

            metadata_imgs.append(img_metadata)

        # Prevent the server from blocking the IP (About 100K Images to be downloaded)
        time.sleep(delay_seconds)

    # Reached only after every page has been processed.
    print('Downloading done')
    return metadata_imgs

# **Testing using WikiArt API**

Please note that WikiArt has a dedicated read-only JSON API.
For more information on the API, follow this link: https://www.wikiart.org/en/App/GetApi

In [None]:
# Pagination: the page number is appended after `second_link`, so the two
# pieces below bracket the style slug in the listing URL.
first_link, second_link = (
    'https://www.wikiart.org/en/paintings-by-style/',
    '?select=featured&json=2&layout=new&page=',
)

# **Baroque**

Download Baroque Dataset

In [None]:
# Run configuration for the Baroque style.
styles = 'Baroque'      # label passed to download_imgs as `styles`
style_name = 'baroque'  # slug inserted into the listing URL
dataset_path = '/content/drive/My Drive/Model_Train/Art_Dataset/'
end_range = 5  # pagination bound passed to download_imgs

# Listing URL up to the page number, which download_imgs appends itself.
sub_url = f'{first_link}{style_name}{second_link}'

action_dict = download_imgs(styles, dataset_path, end_range, sub_url)

Loop: 1
St. Agatha, from the episode Holy Women_Baroque_Agostino Carracci_1.jpg : St. Agatha, from the episode Holy Women to save!
St. Agatha, from the episode Holy Women_Baroque_Agostino Carracci_1.jpg : St. Agatha, from the episode Holy Women Saved!
-------------------------------------
St. Catherine, from the episode Holy Women_Baroque_Agostino Carracci_2.jpg : St. Catherine, from the episode Holy Women to save!
St. Catherine, from the episode Holy Women_Baroque_Agostino Carracci_2.jpg : St. Catherine, from the episode Holy Women Saved!
-------------------------------------
St. Lucia, from the episode Holy Women_Baroque_Agostino Carracci_3.jpg : St. Lucia, from the episode Holy Women to save!
St. Lucia, from the episode Holy Women_Baroque_Agostino Carracci_3.jpg : St. Lucia, from the episode Holy Women Saved!
-------------------------------------
St. Margaret, from the episode Holy Women_Baroque_Agostino Carracci_4.jpg : St. Margaret, from the episode Holy Women to save!
St. Margare