In [1]:
#!/usr/bin/python
# -*- coding: latin-1 -*-

import os, sys, inspect

# Resolve the directory that contains the currently executing code and its
# parent directory, then make sure both are importable.
current_folder = os.path.realpath(
    os.path.abspath(
        os.path.dirname(inspect.getfile(inspect.currentframe()))
    )
)
folder_parts = current_folder.split(os.sep)
# Parent directory: every path component except the last one.
pywikibot_folder = os.sep.join(folder_parts[:-1])

# Prepend in this order so that pywikibot_folder ends up ahead of
# current_folder when both are newly added.
for extra_path in (current_folder, pywikibot_folder):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

import pywikibot as pb
from pywikibot import pagegenerators, textlib
from StringIO import StringIO
import mwparserfromhell as mwh
import pandas as pd

In [2]:
# Commons category names used throughout the notebook.
WLE_CATEGORY = u"Category:Images from Wiki Loves Earth 2017 in Spain"
WLE_UQ_CATEGORY_1 = (u"Category:Unqualified images from Wiki Loves Earth 2017 in Spain "
                     u"(not from a site of community importance)")
# The three maintenance categories share the main category name as a prefix.
NO_TEMPLATE_CATEGORY = WLE_CATEGORY + u" without valid template"
NO_ID_CATEGORY = WLE_CATEGORY + u" without code"
WRONG_ID_CATEGORY = WLE_CATEGORY + u" with a wrong code"

# Wikitext snippets (newline + category link) built from the names above.
NO_TEMPLATE_CATEGORY_STRING = u"\n[[{0}]]".format(NO_TEMPLATE_CATEGORY)
NO_ID_CATEGORY_STRING = u"\n[[{0}]]".format(NO_ID_CATEGORY)
WRONG_ID_CATEGORY_STRING = u"\n[[{0}]]".format(WRONG_ID_CATEGORY)

# Site object shared by every page/category lookup in this notebook.
commons_site = pb.Site(code="commons", fam="commons")

In [3]:
BASE_WLE2016_NAME = u"Commons:Wiki Loves Earth 2016 in Spain"
SCI_DB_PAGE = BASE_WLE2016_NAME + u"/SCI DB"

# Column names assigned to the semicolon-separated fields of the SCI database.
SCI_DB_COLUMNS = [
    "name", "code", "magrama_url", "community",
    "bio_region", "continent", "min_altitude",
    "max_altitude", "avg_altitude", "longitude",
    "latitude", "area", "marine_percentage",
    "marine_area", "image", "commons_cat", "wikidata_id",
]

# Read the already created SCI database page and turn it into a dataframe.
sci_list_page = pb.Page(commons_site, SCI_DB_PAGE)
sci_db_raw_text = sci_list_page.text
# Keep only what lies between the first and the last newline, i.e. drop the
# first and last lines of the page (presumably wrapper markup -- verify).
sci_db_start = sci_db_raw_text.find('\n') + 1
sci_db_end = sci_db_raw_text.rfind('\n')
sci_list_text = StringIO(sci_db_raw_text[sci_db_start:sci_db_end])

sci_df = pd.read_csv(
    sci_list_text,
    sep=";",
    index_col=False,
    names=SCI_DB_COLUMNS,
)
# Plain Python list of the values in the "code" column.
codes = sci_df["code"].tolist()

In [9]:
# Collect the distinct users who made the oldest revision of each file page
# in the main WLE category.
cat_wle = pb.Category(commons_site, WLE_CATEGORY)
gen_wle = pagegenerators.CategorizedPageGenerator(cat_wle)

authors_wle = set()
for page in gen_wle:
    if page.isImage():
        authors_wle.add(page.oldest_revision["user"])

In [4]:
# Titles (with namespace) of the file pages already filed under the
# "unqualified" category named by WLE_UQ_CATEGORY_1.
pb.output('Retrieving --> WLE 2017 images from category')
cat_uq_wle_1 = pb.Category(commons_site, WLE_UQ_CATEGORY_1)
gen_uq_wle_1 = pagegenerators.CategorizedPageGenerator(cat_uq_wle_1)

images_notvalid_1 = []
for page in gen_uq_wle_1:
    # Skip anything in the category that is not a file page.
    if page.isImage():
        images_notvalid_1.append(page.title(withNamespace=True))
len(images_notvalid_1)

Retrieving --> WLE 2017 images from category


51

In [5]:
# Titles (with namespace) of the file pages already filed under the
# "without valid template" category (NO_TEMPLATE_CATEGORY).
cat_uq_wle_2 = pb.Category(commons_site, NO_TEMPLATE_CATEGORY)
gen_uq_wle_2 = pagegenerators.CategorizedPageGenerator(cat_uq_wle_2)

images_notvalid_2 = []
for page in gen_uq_wle_2:
    # Skip anything in the category that is not a file page.
    if page.isImage():
        images_notvalid_2.append(page.title(withNamespace=True))
len(images_notvalid_2)

37

In [6]:
# Titles (with namespace) of the file pages already filed under the
# "without code" category (NO_ID_CATEGORY).
cat_uq_wle_3 = pb.Category(commons_site, NO_ID_CATEGORY)
gen_uq_wle_3 = pagegenerators.CategorizedPageGenerator(cat_uq_wle_3)

images_notvalid_3 = []
for page in gen_uq_wle_3:
    # Skip anything in the category that is not a file page.
    if page.isImage():
        images_notvalid_3.append(page.title(withNamespace=True))
len(images_notvalid_3)

109

In [7]:
# Titles (with namespace) of the file pages already filed under the
# "with a wrong code" category (WRONG_ID_CATEGORY).
cat_uq_wle_4 = pb.Category(commons_site, WRONG_ID_CATEGORY)
gen_uq_wle_4 = pagegenerators.CategorizedPageGenerator(cat_uq_wle_4)

images_notvalid_4 = []
for page in gen_uq_wle_4:
    # Skip anything in the category that is not a file page.
    if page.isImage():
        images_notvalid_4.append(page.title(withNamespace=True))
len(images_notvalid_4)

24

In [8]:
# Retrieving valid images: every file page in the main WLE category whose
# title is not already listed in one of the four "not valid" lists.
cat_wle = pb.Category(commons_site, WLE_CATEGORY)
gen_wle = pagegenerators.CategorizedPageGenerator(cat_wle)

# Merge the four exclusion lists into one set so each page needs a single
# constant-time lookup instead of four linear scans.
images_notvalid = set(images_notvalid_1)
images_notvalid.update(images_notvalid_2, images_notvalid_3, images_notvalid_4)

images_wle = []
for page in gen_wle:
    if not page.isImage():
        continue
    # Compute the title once per page rather than once per comparison.
    title = page.title(withNamespace=True)
    if title not in images_notvalid:
        images_wle.append(title)
len(images_wle)

2153

In [18]:
# Inspect every remaining image page and, when its {{LIC}} template is
# missing or carries an unusable code, append the matching maintenance
# category to the page text and save the page.

# Hoisted out of the loop: set membership is constant-time, whereas `codes`
# is a list that would be scanned for every image.
valid_codes = set(codes)

for image in images_wle:
    page = pb.Page(commons_site, image)
    text = page.text
    wikicode = mwh.parse(text)
    templates = wikicode.filter_templates()
    WLE_template_found = False
    WLE_identifier = u''
    for template in templates:
        # Template names are compared case-insensitively, ignoring padding.
        if template.name.lower().strip() == u"lic":
            WLE_template_found = True
            # A bare {{LIC}} has no first positional parameter; calling
            # template.get(1) on it raises ValueError and would abort the
            # whole run. Leave the identifier empty instead so the page is
            # handled by the "no identifier" branch below.
            if template.has(1):
                WLE_identifier = template.get(1).value.strip()
            # Only the first LIC template on the page is considered.
            break

    if not WLE_template_found:
        print ("--> Not found")
        page.text = text + NO_TEMPLATE_CATEGORY_STRING
        page.save("WLE Spain 2017: No WLE template found")
    elif not WLE_identifier.startswith('ES'):
        # Identifier missing, or not starting with the expected 'ES' prefix.
        print ("--> Not found")
        page.text = text + NO_ID_CATEGORY_STRING
        page.save("WLE Spain 2017: No WLE identifier found")
    elif WLE_identifier not in valid_codes:
        # Identifier has the expected prefix but is not in the SCI code list.
        print ("--> Wrong")
        page.text = text + WRONG_ID_CATEGORY_STRING
        page.save("WLE Spain 2017: Wrong WLE identifier found")

Sleeping for 63.0 seconds, 2017-06-01 23:13:17
