In [None]:
import itertools
import os

import pandas as pd
import requests
from neo4j import GraphDatabase
from tqdm import tqdm

In [None]:
# Register tqdm with pandas so the `progress_apply` calls used in the cells below are available.
tqdm.pandas()

In [None]:
# Neo4j connection settings are read from the environment so that credentials do
# not have to be committed with the notebook. The fallbacks reproduce the values
# that were previously hard-coded, so an unconfigured environment behaves as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4j")

driver = GraphDatabase.driver(uri=NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# NEED TO DO A $4$ STAGE DATA RETRIEVAL STUDY:
<ol>
    <li>use url to check if the artwork is present or not within ArtGraph;</li>
    <li>For those artworks which are still not matched, use the <code>artist name</code>: recursively search for the given artwork in the artist's library;</li>
    <li>Not all the artworks will be retrieved this way: in some cases, the artist name is not correct. In that case, use the name that comes from the column <code>Artist</code> in the base info <code>DataFrame</code>;</li>
    <li>Last chance: use the value in the column <code>ID</code> to see whether or not the artwork is matched. The API used here is WikiArt version 2.</li>
</ol>

Get artworks information

In [None]:
# Load the WikiArt-Emotions base information table (a tab-separated file).
artwork_info = pd.read_csv('WikiArt-Emotions/WikiArt-info.tsv', sep='\t')

The name "andrea-del-verro**c**chio" contains a typo.
The correct one is "andrea-del-verrochio" (with just **1** c).

In [None]:
# Replace the last path segment of every artist URL that contains the misspelled
# slug with the corrected slug; all other URLs are left untouched.
artwork_info['Artist Info URL'] = artwork_info['Artist Info URL'].progress_apply(
    lambda url: '/'.join([*url.split('/')[:-1], 'andrea-del-verrochio'])
    if 'andrea-del-verrocchio' in url
    else url)
artwork_info

In [None]:
def mod_painting_info(link):
    """Return `link` with its second-to-last '/'-separated segment set to the corrected artist slug.

    The replacement is unconditional: whatever the second-to-last segment is,
    it becomes 'andrea-del-verrochio'. Callers are expected to filter the links first.
    """
    segments = link.split('/')
    # The artist slug sits right before the final path segment.
    segments[-2] = 'andrea-del-verrochio'
    return '/'.join(segments)

In [None]:
def mod_painting_url(link):
    """Return `link` with its second-to-last '/'-separated segment set to 'andrea-del-verrochio'.

    NOTE(review): this has the same body as `mod_painting_info` and is not called
    in the visible cells - confirm whether it is still needed.
    """
    parts = link.split('/')
    parts[-2] = 'andrea-del-verrochio'
    return '/'.join(parts)

Once the artist info link has been modified, we also need to modify the painting info link and the painting image link.

In [None]:
# Apply the same slug correction to both painting-level URL columns: only the
# links that contain the misspelled slug are rewritten.
for url_column in ('Painting Info URL', 'Image URL'):
    artwork_info[url_column] = artwork_info[url_column].progress_apply(
        lambda url: mod_painting_info(url) if 'andrea-del-verrocchio' in url else url)
artwork_info

Build the artwork name used to attempt a mapping with ArtGraph

In [None]:
# The name is "<second-to-last URL segment>_<last URL segment>" of the image URL.
artwork_info['name'] = artwork_info['Image URL'].map(
    lambda image_url: '_'.join(image_url.rsplit('/', 2)[-2:]))

Function that checks if a painting is present in ArtGraph

In [None]:
def is_in_artgraph(param, value, driver):
    """Count the distinct `Artwork` nodes whose property `param` equals `value`.

    Parameters
    ----------
    param : str
        Name of the `Artwork` property to compare (e.g. 'name'). It must be a
        plain identifier because Cypher cannot take a property name as a query
        parameter, so it is embedded in the query text.
    value :
        Value to look for. It is sent as a query parameter rather than being
        interpolated into the query, so quotes or backslashes inside it can
        neither break nor alter the query.
    driver :
        Object exposing `session(database=...)`, as a neo4j driver does.

    Returns
    -------
    The `num` field of the first result row, i.e. the number of matching artworks.

    Raises
    ------
    ValueError
        If `param` is not a valid identifier.
    """
    if not str(param).isidentifier():
        raise ValueError(f'invalid Artwork property name: {param!r}')
    query = f'match (a:Artwork) where a.{param} = $value return count(distinct a) as num'
    with driver.session(database='neo4j') as session:
        rows = session.run(query, value=value).data()
    return rows[0]['num']

In [None]:
# One ArtGraph lookup per artwork: store how many Artwork nodes carry this name.
artwork_info['name_in_artgraph'] = artwork_info['name'].progress_apply(
    lambda artwork_name: is_in_artgraph('name', artwork_name, driver))

There are two possible artist names:
<ul>
<li> the artist included in the <code>name</code> of the painting;</li>
<li> the name of the artist, present in the <code>Artist</code> column.</li>
</ul>

In [None]:
# `artist`: the artist slug, i.e. the second-to-last segment of the image URL.
# It is read from `Image URL` rather than from `name`, because a later cell
# overwrites `name`; reading the URL keeps this cell correct when it is re-run
# and is not affected by underscores inside the slug or the file name.
artwork_info['artist'] = artwork_info['Image URL'].map(lambda image_url: image_url.split('/')[-2])
# `artist_1`: slug built from the `Artist` column (lower-cased, spaces turned into dashes).
artwork_info['artist_1'] = artwork_info['Artist'].map(lambda full_name: full_name.lower().replace(' ', '-'))

In [None]:
# Keep only the image file name (last segment of the image URL) in `name`.
# Reading it from `Image URL` instead of splitting the previous `name` on '_'
# keeps file names that themselves contain an underscore intact, and makes the
# cell safe to re-run (splitting an already shortened name would fail).
artwork_info['name'] = artwork_info['Image URL'].map(lambda image_url: image_url.split('/')[-1])

Taking artworks that are not present in artgraph

In [None]:
# Artworks with no match in ArtGraph. An explicit copy is taken because this
# frame is modified in later cells (columns dropped, a column added); working on
# a copy keeps those changes away from `artwork_info` and avoids pandas'
# SettingWithCopyWarning.
unretrieved = artwork_info[artwork_info['name_in_artgraph'] == 0].copy()

Drop **useless** columns

In [None]:
# Drop the columns that are not needed for the retrieval stages. The result is
# re-bound instead of using `inplace=True`, so a filtered slice of
# `artwork_info` is never mutated in place.
unretrieved = unretrieved.drop(
    columns=['Category', 'Artist', 'Year', 'Painting Info URL', 'Artist Info URL',
             'name_in_artgraph', 'Image URL'])

In [None]:
def get_paintings_by_artist(artist_name, timeout=30):
    """Fetch all the paintings of a specific artist from the WikiArt `PaintingsByArtist` endpoint.

    Parameters
    ----------
    artist_name : str
        Artist slug, sent as the `artistUrl` query argument.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole notebook run.

    Returns
    -------
    The decoded JSON body of the response. The callers in this notebook iterate
    over it and read the 'contentId', 'title' and 'image' keys of each element.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    """
    base_query = 'https://www.wikiart.org/en/App/Painting/PaintingsByArtist?artistUrl={artist}&json=2'
    return requests.get(base_query.format(artist=artist_name), timeout=timeout).json()

def get_painting_info(painting_id: int, timeout=30):
    """Fetch the information of a single artwork from the WikiArt `ImageJson` endpoint.

    Parameters
    ----------
    painting_id : int
        Identifier placed at the end of the endpoint URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response. The caller in this notebook reads its 'url' key.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    """
    base_query = 'https://www.wikiart.org/en/App/Painting/ImageJson/{painting}'
    return requests.get(base_query.format(painting=painting_id), timeout=timeout).json()

def retrieve_artworks_by_url(artist_name, metadata):
    """Return the `name` values of `metadata` that match the artist's paintings by URL.

    Every painting listed for `artist_name` is looked up individually (one
    extra request per painting); its 'url' field with '.jpg' appended is
    compared with the `name` column of `metadata`.

    Parameters
    ----------
    artist_name : str
        Artist slug passed to `get_paintings_by_artist`.
    metadata : pandas.DataFrame
        Frame with (at least) a `name` column.

    Returns
    -------
    list
        The matching entries of `metadata['name']`.
    """
    file_names = [
        f"{get_painting_info(entry['contentId'])['url']}.jpg"
        for entry in get_paintings_by_artist(artist_name)
    ]
    matched = metadata[metadata['name'].isin(file_names)]
    return matched['name'].tolist()

def retrieve_artworks(artist_name, metadata):
    """Return the `name` values of `metadata` found among the artist's paintings.

    A row of `metadata` matches when its `Title` equals the 'title' of one of
    the paintings returned for `artist_name`, or when its `name` equals a file
    name derived from one of the paintings' 'image' URLs.

    Parameters
    ----------
    artist_name : str
        Artist slug passed to `get_paintings_by_artist`.
    metadata : pandas.DataFrame
        Frame with (at least) the columns `Title` and `name`.

    Returns
    -------
    list
        The matching entries of `metadata['name']`.
    """
    paintings = get_paintings_by_artist(artist_name)
    titles = [painting['title'] for painting in paintings]
    # The last 10 characters of the image file name are discarded - presumably a
    # size suffix such as '!Large.jpg'; TODO confirm against the API response.
    file_names = [painting['image'].rsplit('/', 1)[-1][:-10] for painting in paintings]
    same_title = metadata['Title'].isin(titles)
    same_file = metadata['name'].isin(file_names)
    return metadata[same_title | same_file]['name'].tolist()

Group paintings by artist

In [None]:
# One row per (artist, artist_1) pair; `metadata` holds that pair's
# [Title, name] rows as a list of lists.
grouped = (
    unretrieved
    .groupby(['artist', 'artist_1'])[['Title', 'name']]
    .apply(lambda rows: rows.values.tolist())
    .reset_index(name='metadata')
)

# STAGE $1$
## RETRIEVE THE ARTWORKS BASED ON ARTIST NAME (POINT $1$)

In [None]:
# Stage 1: for every group, search the paintings listed for the first artist
# slug (`artist`) and collect the names that are found.
retrieved = grouped.progress_apply(
    lambda group: retrieve_artworks(
        group['artist'],
        pd.DataFrame(group['metadata'], columns=['Title', 'name'])),
    axis=1)

In [None]:
# Per (artist, artist_1) pair, the list of artwork names that still have to be retrieved.
total = (
    unretrieved
    .groupby(['artist', 'artist_1'])['name']
    .apply(list)
    .reset_index(name='names')
)

Placing the first retrieved artworks

In [None]:
# Attach the names retrieved for each group. The assignment aligns `retrieved`
# with `total` on the index labels.
total['ret'] = retrieved

Compare the two lists and highlight the artworks that are still unretrieved

In [None]:
# Names requested for the group minus the names that were actually retrieved.
total['difference'] = total.apply(
    lambda group: list(set(group['names']) - set(group['ret'])),
    axis=1)
total

In [None]:
# Flatten the per-group leftovers into one list of still-unretrieved artwork names.
paintings_still_unretrieved = list(itertools.chain.from_iterable(total['difference']))

# New column in the base dataframe: 1 when the artwork was not in ArtGraph but
# is no longer in the unretrieved list (i.e. stage 1 found it), 0 otherwise.
# Vectorised `isin` replaces a row-wise `apply` that scanned the list once per
# row; the resulting 0/1 values are the same.
artwork_info['api_v1_artist'] = (
    ~artwork_info['name'].isin(paintings_still_unretrieved)
    & artwork_info['name_in_artgraph'].eq(0)
).astype('int64')

# STAGE $2$
## USING THE $2^{nd}$ ARTIST NAME

Filter and go on with next strategy

In [None]:
# Keep only the artworks that stage 1 could not retrieve.
still_missing = unretrieved['name'].isin(paintings_still_unretrieved)
unretrieved = unretrieved[still_missing]

In [None]:
# Rebuild the per-artist groups from what is left ...
grouped = (
    unretrieved
    .groupby(['artist', 'artist_1'])[['Title', 'name']]
    .apply(lambda rows: rows.values.tolist())
    .reset_index(name='metadata')
)
# ... and search again, this time with the second artist slug (`artist_1`).
retrieved = grouped.progress_apply(
    lambda group: retrieve_artworks(
        group['artist_1'],
        pd.DataFrame(group['metadata'], columns=['Title', 'name'])),
    axis=1)

In [None]:
# Requested names per group, the names retrieved in this stage, and what is still missing.
total = (
    unretrieved
    .groupby(['artist', 'artist_1'])['name']
    .apply(list)
    .reset_index(name='names')
)
total['ret'] = retrieved
total['difference'] = total.apply(
    lambda group: list(set(group['names']) - set(group['ret'])),
    axis=1)

In [None]:
# Display the per-group summary (requested names, retrieved names, difference).
total

In [None]:
# Show only the groups that still have at least one unretrieved artwork.
total.loc[total['difference'].map(len) != 0]

# STAGE $3$
## RECURSIVE QUERY BASED ON URL

In [None]:
# Flatten the per-group leftovers: every painting that is still not retrieved.
paintings_still_unretrieved = list(itertools.chain.from_iterable(total['difference'].values.tolist()))


In [None]:
# New column marking the artworks retrieved in stage 2: 1 when the artwork is
# no longer in the unretrieved list and was found neither in ArtGraph nor in
# stage 1, 0 otherwise. Vectorised `isin` replaces a row-wise `apply` that
# scanned the list once per row; the resulting 0/1 values are the same.
artwork_info['api_v1_artist_1'] = (
    ~artwork_info['name'].isin(paintings_still_unretrieved)
    & artwork_info['api_v1_artist'].eq(0)
    & artwork_info['name_in_artgraph'].eq(0)
).astype('int64')

In [None]:
# Filter down to what stage 2 left unretrieved, ready for the next stage.
still_missing = unretrieved['name'].isin(paintings_still_unretrieved)
unretrieved = unretrieved[still_missing]

In [None]:
# Rebuild the per-artist groups from what is left ...
grouped = (
    unretrieved
    .groupby(['artist', 'artist_1'])[['Title', 'name']]
    .apply(lambda rows: rows.values.tolist())
    .reset_index(name='metadata')
)
# ... and match by painting URL, using the first artist slug (`artist`).
retrieved = grouped.progress_apply(
    lambda group: retrieve_artworks_by_url(
        group['artist'],
        pd.DataFrame(group['metadata'], columns=['Title', 'name'])),
    axis=1)

In [None]:
# Requested names per group, the names retrieved in this stage, and what is still missing.
total = (
    unretrieved
    .groupby(['artist', 'artist_1'])['name']
    .apply(list)
    .reset_index(name='names')
)
total['ret'] = retrieved
total['difference'] = total.apply(
    lambda group: list(set(group['names']) - set(group['ret'])),
    axis=1)

In [None]:
# Show only the groups that still have at least one unretrieved artwork.
total.loc[total['difference'].map(len) != 0]

# STAGE $4$
## FIND ARTWORKS BY ID, USING WIKIART API V2

In [None]:
# Flatten the per-group leftovers of the URL-based stage.
paintings_still_unretrieved = list(itertools.chain.from_iterable(total['difference']))

# New column marking the artworks retrieved by the URL-based stage: 1 when the
# artwork is no longer in the unretrieved list and was not found by any earlier
# step, 0 otherwise. Vectorised `isin` replaces a row-wise `apply` that scanned
# the list once per row; the resulting 0/1 values are the same.
artwork_info['api_v1_url'] = (
    ~artwork_info['name'].isin(paintings_still_unretrieved)
    & artwork_info['api_v1_artist_1'].eq(0)
    & artwork_info['api_v1_artist'].eq(0)
    & artwork_info['name_in_artgraph'].eq(0)
).astype('int64')

In [None]:
# Keep only what the URL-based stage left unretrieved. An explicit copy is
# taken because a new column is added to this frame in a later cell; assigning
# into a filtered slice would trigger pandas' SettingWithCopyWarning.
unretrieved = unretrieved[unretrieved['name'].isin(paintings_still_unretrieved)].copy()

In [None]:
def get_painting(id: str, timeout=30):
    """Fetch a painting by identifier from the WikiArt API v2 `Painting` endpoint.

    Parameters
    ----------
    id : str
        Painting identifier, sent as the `id` query argument.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response. The notebook treats a response that
    contains a 'title' key as a successful match.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    """
    base = 'https://www.wikiart.org/en/api/2/Painting?id={id}'
    return requests.get(base.format(id=id), timeout=timeout).json()

In [None]:
# Stage 4: query WikiArt API v2 by ID; an artwork counts as found when the
# response contains a 'title' key. `assign` builds a new frame instead of
# writing a column into a filtered slice, which avoids pandas'
# SettingWithCopyWarning.
unretrieved = unretrieved.assign(
    api_v2=unretrieved['ID'].progress_apply(
        lambda painting_id: 'title' in get_painting(painting_id)))
unretrieved

In [None]:
# Names that API v2 could not find either. This is computed once here; it was
# previously rebuilt inside the lambda for every row of `artwork_info`.
names_not_found_by_id = unretrieved.loc[unretrieved['api_v2'].eq(False), 'name']

# New column marking the artworks retrieved through API v2: 1 when the artwork
# is not among the names above and was not found by any earlier step, 0 otherwise.
artwork_info['api_v2'] = (
    ~artwork_info['name'].isin(names_not_found_by_id)
    & artwork_info['api_v1_url'].eq(0)
    & artwork_info['api_v1_artist_1'].eq(0)
    & artwork_info['api_v1_artist'].eq(0)
    & artwork_info['name_in_artgraph'].eq(0)
).astype('int64')

In [None]:
# Artworks that could not be found through API v2 either.
unretrieved.loc[unretrieved['api_v2'] == False]

Save a dataframe that includes only artworks that are not retrieved in any way

In [None]:
# Persist the artworks that no stage managed to retrieve.
unretrieved.loc[unretrieved['api_v2'] == False].to_csv('still_unretrieved.csv')

In [None]:
# Display the base dataframe, now including the per-stage source flags
# (name_in_artgraph, api_v1_artist, api_v1_artist_1, api_v1_url, api_v2).
artwork_info

Save a version of the artwork info dataframe, which includes the right data source in which all the metadata are stored

In [None]:
# Write the annotated dataframe to disk. `to_csv` is called with its default
# options, so the DataFrame index is written as the first column.
artwork_info.to_csv('artwork_info_sources.csv')