# Splitting articles published in more than one issue

This script helps to clean up page and issue statements in bibliographic records of Die Gartenlaube, especially for articles that were published split across several issues.

[GitHub Issue](https://github.com/DieDatenlaube/DieDatenlaube/issues/4)

In [1]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import time

# QuickStatements output file; the name carries the first 10 characters of the
# current Unix timestamp so that every run writes to its own file.
QSFile = 'output/qs_issuePages_'+str(time.time())[0:10]+'.tsv'
# Opened in append mode with an explicit UTF-8 encoding: the written values
# contain non-ASCII characters (e.g. en dashes in page ranges), so the file
# content must not depend on the platform's default locale encoding.
f = open(QSFile, 'a', encoding='utf-8')

endpoint_url = "https://query.wikidata.org/sparql"

# Selects items with a "published in" (P1433) statement pointing to Q655617
# that has publication date, pages, issue qualifiers and a reference, and whose
# direct issue (P433) value contains ";" or an en dash while the direct pages
# (P304) value contains "," or ";" -- i.e. records listing several issues and
# several page ranges in a single string.
query = """SELECT * WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?Die_Gartenlaube p:P1433 ?GartenlaubeStmt.
  ?GartenlaubeStmt ps:P1433 wd:Q655617;
    pq:P577 ?pubdate;
    pq:P304 ?psPages;
    pq:P433 ?psIssue;
    prov:wasDerivedFrom ?refnode.
  ?refnode pr:P4656 ?ref.
  ?Die_Gartenlaube wdt:P304 ?pages;
    wdt:P433 ?issue.
  FILTER(REGEX(?issue, ";|–"))
  FILTER(REGEX(?pages, ",|;"))
}
"""


def get_results(endpoint_url, query):
    """Run a SPARQL query and return the decoded JSON response.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: The SPARQL query text.

    Returns:
        The converted JSON result of the query; the caller reads the rows
        from ``["results"]["bindings"]``.
    """
    # The request identifies itself with a fixed agent string. The previously
    # computed "WDQS-example" user agent was never passed on and was removed.
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent="#DieDatenlaube")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def _expand_issues(issue_value):
    """Expand an issue statement into a flat list of single issues.

    The value is first split on ";" and ",". Every part that contains an en
    dash is treated as an inclusive numeric range and expanded into its
    integers; every other part is kept as the original (unstripped) string.
    Empty parts, e.g. from a trailing separator, are dropped.

    Args:
        issue_value: Raw issue string, e.g. "7–10" or "13; 15–16".

    Returns:
        A list mixing ``int`` (from expanded ranges) and ``str`` entries.

    Raises:
        ValueError: If a part containing an en dash is not exactly two
            integers separated by that dash.
    """
    expanded = []
    for part in re.split(";|,", issue_value):
        if not part.strip():
            continue
        if "–" in part:
            # Unpacking enforces exactly two bounds; int() enforces numbers.
            first, last = part.split("–")
            expanded.extend(range(int(first), int(last) + 1))
        else:
            expanded.append(part)
    return expanded


results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    entity_uri = result["Die_Gartenlaube"]["value"]
    issue_value = result["issue"]["value"]
    pages_value = result["pages"]["value"]

    wdid = re.sub("http://www.wikidata.org/entity/", "", entity_uri)
    # Reference (source) columns appended to every statement that is added.
    refqs = "\tS143\tQ15522295\tS4656\t\"" + result["ref"]["value"] + "\""

    # Expand issue ranges part by part, so that mixed values such as
    # "1; 3–4" are handled. A malformed range skips only this record
    # instead of aborting the whole run.
    try:
        issues = _expand_issues(issue_value)
    except ValueError:
        print("not a range")
        continue

    # Split the pages into one entry per issue and drop empty list entries.
    pageRanges = list(filter(None, re.split(";|,", pages_value)))

    # Issues and page ranges can only be paired when there are equally many.
    if len(issues) != len(pageRanges):
        print(entity_uri + "\tLengths of Arrays are not similar\t" + str(issues) + "\t" + str(pageRanges))
        continue

    print(entity_uri + "\tLengths of Arrays are ident\t" + str(issues) + "\t" + str(pageRanges))
    print("#" + entity_uri + "\tLengths of Arrays are ident\t" + str(issues) + "\t" + str(pageRanges), file=f)

    print("-" + wdid + "\tP1433\tQ655617", file=f)  # remove published in
    print("-" + wdid + "\tP304\t\"" + pages_value + "\"", file=f)  # remove pages
    print("-" + wdid + "\tP433\t\"" + issue_value + "\"", file=f)  # remove issues

    pagesQual = ""
    for issue, page_range in zip(issues, pageRanges):
        issue_qs = "\"" + str(issue).strip() + "\""
        pages_qs = "\"" + page_range.strip() + "\""
        # add pages with issue qualifier
        print(wdid + "\tP304\t" + pages_qs + "\tP433\t" + issue_qs + refqs, file=f)
        # add issue with pages qualifier
        print(wdid + "\tP433\t" + issue_qs + "\tP304\t" + pages_qs + refqs, file=f)
        pagesQual += "\tP304\t" + pages_qs

    # add new published in statement with split pages and the concatenated
    # issue value as qualifiers
    print(wdid + "\tP1433\tQ655617" + pagesQual + "\tP433\t\"" + issue_value + "\"\tP577\t+" + result["pubdate"]["value"] + refqs, file=f)

# Make sure everything written so far reaches the file; the handle itself is
# left open in case it is still used afterwards.
f.flush()


http://www.wikidata.org/entity/Q19189979	Lengths of Arrays are not similar	[7, 8, 9, 10]	['97–100', ' 113–116', ' 129–132', ' 145–148', ' 150']
http://www.wikidata.org/entity/Q19215738	Lengths of Arrays are not similar	['13', ' 1857/7 + 8']	['172–174', ' 99-100', ' 110-112']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not similar	[3, 4]	['44–46', ' 60–61', ' 65']
http://www.wikidata.org/entity/Q19228918	Lengths of Arrays are not sim