In [1]:
#!/usr/bin/python
# -*- coding: latin-1 -*-
"""
This notebook adds the WLE per contributor, per site and per autonomous 
community categories to all the files in the WLE 2017 log. It does not 
include the {{see also}} templates.
"""
import os, sys, inspect

# Resolve the folder containing the running code and its parent folder, then
# make both importable (the parent is where pywikibot is expected to live).
current_folder = os.path.realpath(
    os.path.abspath(os.path.dirname(inspect.getfile(inspect.currentframe())))
)
folder_parts = current_folder.split(os.sep)
pywikibot_folder = os.sep.join(folder_parts[:-1])

# Prepend in this order so the parent folder ends up first on sys.path.
for import_root in (current_folder, pywikibot_folder):
    if import_root not in sys.path:
        sys.path.insert(0, import_root)

import pywikibot as pb
from pywikibot import pagegenerators, textlib
from StringIO import StringIO
import mwparserfromhell as mwh
import pandas as pd
import csv
import numpy as np

In [2]:
# Title of the contest's root page on Commons; the upload log is its "/Log" subpage.
BASE_WLE2017_NAME = u"Commons:Wiki Loves Earth 2017 in Spain"
LOG_PAGE = u"{0}/Log".format(BASE_WLE2017_NAME)

# Site handle used by every page read and write in this notebook.
commons_site = pb.Site("commons", "commons")

In [3]:
# One record per region: (region code, title of the Spanish-language annex page
# listing that region's sites of community importance, English region name).
# Ceuta and Melilla share a single annex page.
_annex_records = [
    ('ES-AN', u'Anexo:Lugares de importancia comunitaria de Andalucía', 'Andalusia'),
    ('ES-AR', u'Anexo:Lugares de importancia comunitaria de Aragón', 'Aragon'),
    ('ES-AS', u'Anexo:Lugares de importancia comunitaria de Asturias', 'Asturias'),
    ('ES-CB', u'Anexo:Lugares de importancia comunitaria de Cantabria', 'Cantabria'),
    ('ES-CM', u'Anexo:Lugares de importancia comunitaria de Castilla-La Mancha', 'Castile-La Mancha'),
    ('ES-CL', u'Anexo:Lugares de importancia comunitaria de Castilla y León', u'Castile and León'),
    ('ES-CT', u'Anexo:Lugares de importancia comunitaria de Cataluña', 'Catalonia'),
    ('ES-MD', u'Anexo:Lugares de importancia comunitaria de la Comunidad de Madrid', 'Community of Madrid'),
    ('ES-VC', u'Anexo:Lugares de importancia comunitaria de la Comunidad Valenciana', 'Valencian Community'),
    ('ES-EX', u'Anexo:Lugares de importancia comunitaria de Extremadura', 'Extremadura'),
    ('ES-IB', u'Anexo:Lugares de importancia comunitaria de las Islas Baleares', 'Balearic Islands'),
    ('ES-CN', u'Anexo:Lugares de importancia comunitaria de las Islas Canarias', 'Canary Islands'),
    ('ES-GA', u'Anexo:Lugares de importancia comunitaria de Galicia', 'Galicia'),
    ('ES-RI', u'Anexo:Lugares de importancia comunitaria de La Rioja', 'La Rioja'),
    ('ES-NC', u'Anexo:Lugares de importancia comunitaria de Navarra', 'Navarre'),
    ('ES-MC', u'Anexo:Lugares de importancia comunitaria de la Región de Murcia', 'Region of Murcia'),
    ('ES-PV', u'Anexo:Lugares de importancia comunitaria del País Vasco', 'Basque Country'),
    ('ES-ML', u'Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', 'Melilla'),
    ('ES-CE', u'Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', 'Ceuta'),
    ('ES-MAGRAMA', u'Anexo:Lugares de importancia comunitaria del MAGRAMA', 'MAGRAMA'),
]

# Region code -> [annex page title, English region name].
annexes = {
    region_code: [annex_title, english_name]
    for region_code, annex_title, english_name in _annex_records
}

In [4]:
# Load the WLE 2017 upload log (a ';'-separated table stored on a wiki page)
# into `images_df`, indexed by upload timestamp and limited to the contest window.
pb.output('Retrieving --> WLE 2017 images list from cache')
list_page = pb.Page(commons_site, LOG_PAGE)

# Read the page text once and drop its first and last lines before parsing
# (presumably wrapper markup around the CSV payload -- confirm against the log page).
log_text = list_page.text
list_page_text = StringIO(log_text[log_text.find('\n') + 1:log_text.rfind('\n')])
images_df = pd.read_csv(list_page_text,
                        sep=";",
                        index_col=False,
                        names=['image_title', 'uploader', 'days_from_registration',
                               'timestamp', 'code', 'name', 'community', 'commons_cat',
                               'lat', 'long'],
                        encoding='utf-8',
                        quoting=csv.QUOTE_NONE)
images_df["timestamp"] = pd.to_datetime(images_df["timestamp"], format="%Y-%m-%d %H:%M:%S")
images_df = images_df.set_index(["timestamp"])

# Fill missing values per column in one call on the frame itself. The previous
# `images_df[col].fillna(..., inplace=True)` form mutates an intermediate Series
# and is not guaranteed to update the frame.
images_df = images_df.fillna({'code': u'',
                              'name': u'',
                              'community': u'',
                              'lat': 0.0,
                              'long': 0.0})
# Clear the index label; assigning None works where `del index.name` does not.
images_df.index.name = None

# Row count before the date filter (name kept as-is, including its spelling,
# in case other cells refer to it).
images_df_lenght = len(images_df)
# Keep only uploads inside the contest window (bounds are exclusive).
images_df = images_df.loc[(images_df.index > '2017-04-30 23:59:59') & (images_df.index < '2017-06-01 01:00:00')]
pb.output('Retrieved --> WLE 2017 image list from cache')

Retrieving --> WLE 2017 images list from cache
Retrieved --> WLE 2017 image list from cache


In [None]:
# For every image in the log, append whichever of the site, community and
# uploader categories the file description does not already carry.

# Community names starting with one of these words take the definite article
# in their category title ("... in the Basque Country ...").
article_needed = [u'Region', u'Basque', u'Balearic', u'Canary', u'Valencian', u'Community']

counter = 0
for _, row in images_df.iterrows():
    # ns=6 is the File: namespace.
    page = pb.Page(commons_site, row["image_title"], ns=6)
    if (counter != 0) and (counter % 50 == 0):
        pb.output('Retrieving --> %d image descriptions downloaded' % (counter))
    text = page.text
    cats = [cat.title() for cat in textlib.getCategoryLinks(text)]

    category_string = u''
    if row["code"] != u'':
        site_category = u'Category:Images of site of community importance with code {0} from Wiki Loves Earth 2017 in Spain'.format(row["code"])

        # Use this row's own community. The earlier code read a bare `community`
        # name that this cell never defined, so it either raised NameError or
        # reused a stale value left behind by another cell.
        row_community = row["community"]
        if row_community.startswith(tuple(article_needed)):
            community_category = u'Category:Images of a site of community importance in the {0} from Wiki Loves Earth 2017 in Spain'.format(row_community)
        else:
            community_category = u'Category:Images of a site of community importance in {0} from Wiki Loves Earth 2017 in Spain'.format(row_community)

        if site_category not in cats:
            category_string += u'\n[[{0}]]'.format(site_category)
        if community_category not in cats:
            category_string += u'\n[[{0}]]'.format(community_category)

    author_category = u'Category:Images from Wiki Loves Earth 2017 in Spain by {0}'.format(row["uploader"])
    if author_category not in cats:
        category_string += u'\n[[{0}]]'.format(author_category)

    counter += 1

    # Save only when at least one category is missing, to avoid null edits.
    if category_string:
        page.text = text + category_string
        page.save("WLE Spain 2017: user category management")

In [29]:
# Create (or overwrite) one hidden category page per site of community
# importance that appears in the log.

# Deduplicate (code, name) rows on the code so that each code stays paired with
# a name from its own row (the first one seen). Taking .unique() of the two
# columns separately and pairing by position breaks whenever two codes share a
# name or one code appears with two spellings.
coded_images = images_df[images_df["code"] != u'']
sites = coded_images[["code", "name"]].drop_duplicates(subset="code")
unique_sites = sites["code"].values
unique_names = sites["name"].values  # aligned with unique_sites, so may repeat

for element, site_name in zip(unique_sites, unique_names):
    cat_text = u"'''Site of Community Importance''': {1} ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site={0} {0}])\n\n{{{{hiddencat}}}}\n[[Category:Images from Wiki Loves Earth 2017 in Spain by site| {0}]]".format(element, site_name)
    page = pb.Page(commons_site, u'Category:Images of site of community importance with code {0} from Wiki Loves Earth 2017 in Spain'.format(element))
    page.text = cat_text
    page.save("WLE Spain 2017: site category creation")

Page [[Category:Images of site of community importance with code ES0000001 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 9.2 seconds, 2017-06-11 16:56:10
Page [[Category:Images of site of community importance with code ES5310125 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 9.1 seconds, 2017-06-11 16:56:20
Page [[Category:Images of site of community importance with code ES5110012 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 9.3 seconds, 2017-06-11 16:56:30
Page [[Category:Images of site of community importance with code ES0000035 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 9.3 seconds, 2017-06-11 16:56:40
Page [[Category:Images of site of community importance with code ES1200001 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 9.2 seconds, 2017-06-11 16:56:50
Page [[Category:Images of site of community importance with code ES4250001 from Wiki Loves Earth 2017 in Spain]] saved
Sleeping for 8.8 seconds, 2017-06-11 16:57:00
Page [[Cat

In [None]:
# Create (or overwrite) one hidden category page per autonomous community
# that appears in the log.
has_community = images_df["community"] != u''
unique_communities = images_df[has_community]["community"].unique()

# Names starting with one of these words are written with "the" in the title.
article_needed = [u'Region', u'Basque', u'Balearic', u'Canary', u'Valencian', u'Community']

for community in unique_communities:
    cat_text = u"{{{{hiddencat}}}}\n[[Category:Images from Wiki Loves Earth 2017 in Spain by autonomous community| {0}]]".format(community)

    takes_article = any(community.startswith(prefix) for prefix in article_needed)
    if takes_article:
        category_title = u'Category:Images of a site of community importance in the {0} from Wiki Loves Earth 2017 in Spain'.format(community)
    else:
        category_title = u'Category:Images of a site of community importance in {0} from Wiki Loves Earth 2017 in Spain'.format(community)

    page = pb.Page(commons_site, category_title)
    page.text = cat_text
    page.save("WLE Spain 2017: community category creation")