In [1]:
# Determine how many linked creators have a Wikipedia article (for summaries) or an image available, and under which license those images are released.

import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of the cell found at `column` in `row`. '''

    # pydash.get falls back to None when the cell has no 'value' entry.
    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    '''
    Send sparql request, and formulate results into a dataframe.

    query   -- SPARQL query text, sent as the `query` request parameter.
    service -- URL of the SPARQL endpoint.

    Returns a dataframe with one column per selected variable and one row
    per result binding; each cell holds the binding's 'value', or None when
    the variable is unbound for that row.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # fail loudly on an error response instead of trying to parse an error page as json.
    response.raise_for_status()
    payload = response.json()

    # head.vars lists every selected variable, including ones that are never
    # bound (e.g. inside an optional block), so the dataframe always carries
    # the expected columns even when the bindings omit them or are empty.
    variables = pydash.get(payload, 'head.vars') or []
    bindings = pydash.get(payload, 'results.bindings') or []

    data_frame = pandas.DataFrame(bindings, columns=variables or None)

    # row-wise apply is skipped for an empty frame, where it would hand back
    # a dataframe rather than a column.
    if not data_frame.empty:
        for column in data_frame.columns:
            data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# fetch every wikidata entity carrying a P7003 statement, bound here to ?acmi_id.
wikidata_query = ''' select ?wikidata_id ?acmi_id where { ?wikidata_id wdt:P7003 ?acmi_id } '''
wikidata_creators = sparql_query(wikidata_query, 'https://query.wikidata.org/sparql')

# keep only identifiers containing 'creator'. the explicit copy makes the filtered
# frame independent of the full result, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
wikidata_creators = wikidata_creators.loc[wikidata_creators.acmi_id.str.contains('creator', na=False)].copy()

# reduce the entity uri to its last path segment (the bare Q identifier).
wikidata_creators['wikidata_id'] = wikidata_creators['wikidata_id'].str.split('/').str[-1]

print(len(wikidata_creators))
wikidata_creators.head()

4900


Unnamed: 0,wikidata_id,acmi_id
0,Q6255888,creators/21267
1,Q63069,creators/32979
2,Q2158449,creators/80769
3,Q3195038,creators/15007
4,Q3672555,creators/22745


In [2]:

# collect one result frame per chunk and concatenate once at the end, rather
# than re-copying a growing dataframe on every iteration.
frames = []

for chunk in tqdm.tqdm(numpy.array_split(wikidata_creators.wikidata_id.unique(), 20)):

    # pause before each request (presumably to stay within the query service's limits).
    time.sleep(4)

    chunk_creator = ' '.join(['wd:'+x for x in chunk])

    # for each creator in the chunk, optionally fetch the P18 image and look up
    # its License metadata value through the commons mediawiki api.
    query = '''
        select ?wikidata_creator ?image ?imageLicense  
        where {
            values ?wikidata_creator {'''+chunk_creator+'''}
            optional { ?wikidata_creator wdt:P18 ?image .
                bind(strafter(wikibase:decodeUri(str(?image)), "http://commons.wikimedia.org/wiki/Special:FilePath/") AS ?fileTitle)
                service wikibase:mwapi {
                    bd:serviceParam wikibase:endpoint "commons.wikimedia.org";
                    wikibase:api "Generator";
                    wikibase:limit "once";
                    mwapi:generator "allpages";
                    mwapi:gapfrom ?fileTitle;
                    mwapi:gapnamespace 6; # NS_FILE
                    mwapi:gaplimit 1;
                    mwapi:prop "imageinfo";
                    mwapi:iiprop "extmetadata" .
                    ?imageLicense wikibase:apiOutput "imageinfo/ii/extmetadata/License/@value".
                }
            }
        } '''

    frames.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

dataframe = pandas.concat(frames)

# rows with an image, total rows, and the share of rows with an image.
image_available = len(dataframe.loc[dataframe.image.str.contains('wiki', na=False)])
print(image_available, len(dataframe), round(image_available/len(dataframe), 2))

# only rows with every column populated are kept for the license breakdown.
dataframe = dataframe.dropna()
print('----')
for x in dataframe.imageLicense.unique():
    section = dataframe.loc[dataframe.imageLicense.isin([x])]
    print(x, '---', round(len(section)/len(dataframe), 2))

dataframe.head()

100%|██████████| 20/20 [03:35<00:00, 10.76s/it]

1895 4880 0.39
4
pd --- 0.32
cc-by-sa-3.0 --- 0.21
cc-by-3.0 --- 0.06
cc-by-sa-2.0 --- 0.1
cc-by-2.0 --- 0.11
cc-by-sa-4.0 --- 0.13
cc-by-2.5 --- 0.01
cc-by-sa-1.0 --- 0.0
cc-by-4.0 --- 0.03
cc0 --- 0.02
cc-by-sa-3.0-de --- 0.0
cc-by-sa-2.5 --- 0.01
cc-by-3.0-us --- 0.0
cc-by-sa-3.0-nl --- 0.01
cc-by-sa-2.0-fr --- 0.0
cc-by-2.5-dk --- 0.0
cc-by-3.0-br --- 0.0
cc-by-1.0 --- 0.0
cc-by-sa-3.0-rs --- 0.0
cc-by-sa-2.0-de --- 0.0
cc-by-3.0-au --- 0.0
cc-by-2.5-pl --- 0.0
cc-by-3.0-de --- 0.0





Unnamed: 0,wikidata_creator,image,imageLicense
0,http://www.wikidata.org/entity/Q72229,http://commons.wikimedia.org/wiki/Special:File...,pd
1,http://www.wikidata.org/entity/Q72291,http://commons.wikimedia.org/wiki/Special:File...,pd
2,http://www.wikidata.org/entity/Q72579,http://commons.wikimedia.org/wiki/Special:File...,pd
3,http://www.wikidata.org/entity/Q73089,http://commons.wikimedia.org/wiki/Special:File...,pd
4,http://www.wikidata.org/entity/Q73136,http://commons.wikimedia.org/wiki/Special:File...,pd


In [3]:
# Find the English Wikipedia article, where one exists, for each linked creator.

# collect one result frame per chunk and concatenate once at the end, rather
# than re-copying a growing dataframe on every iteration.
frames = []
for chunk in tqdm.tqdm(numpy.array_split(wikidata_creators.wikidata_id.unique(), 20)):
    # pause before each request (presumably to stay within the query service's limits).
    time.sleep(4)
    chunk_creator = ' '.join(['wd:'+x for x in chunk])
    # the optional block keeps creators without an en.wikipedia.org article in the results.
    query = '''
        select ?wikidata_creator ?article   
        where {
            values ?wikidata_creator {'''+chunk_creator+'''}
                optional { ?article schema:about ?wikidata_creator .
            ?article schema:isPartOf <https://en.wikipedia.org/> } .

        } '''
    frames.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

dataframe = pandas.concat(frames)

# rows with an article, total rows, and the share of rows with an article.
wiki_available = len(dataframe.loc[dataframe.article.str.contains('wiki', na=False)])
print(wiki_available, len(dataframe), round(wiki_available/len(dataframe), 2))

print(len(dataframe))
dataframe.head()

100%|██████████| 20/20 [01:53<00:00,  5.67s/it]

3912 4821 0.81
4821





Unnamed: 0,wikidata_creator,article
0,http://www.wikidata.org/entity/Q73089,https://en.wikipedia.org/wiki/Gene_Kelly
1,http://www.wikidata.org/entity/Q2001,https://en.wikipedia.org/wiki/Stanley_Kubrick
2,http://www.wikidata.org/entity/Q72229,https://en.wikipedia.org/wiki/Edward_Dmytryk
3,http://www.wikidata.org/entity/Q72291,https://en.wikipedia.org/wiki/David_O._Selznick
4,http://www.wikidata.org/entity/Q72579,https://en.wikipedia.org/wiki/Samuel_Goldwyn
