# Splitting articles published in more than one issue

This scripts help to clean up pages and issue statements in bibliographic records of Die Gartenlaube especially for articles published splitted up over serveral issues.

[GitHub Issue](https://github.com/DieDatenlaube/DieDatenlaube/issues/4)

In [4]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import re

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT * WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?Die_Gartenlaube p:P1433 ?GartenlaubeStmt.
  ?GartenlaubeStmt ps:P1433 wd:Q655617;
    pq:P577 ?pubdate;
    pq:P304 ?psPages;
    pq:P433 ?psIssue;
    prov:wasDerivedFrom ?refnode.
  ?refnode pr:P4656 ?ref.
  ?Die_Gartenlaube wdt:P304 ?pages;
    wdt:P433 ?issue.
  FILTER(REGEX(?issue, ";|–"))
  FILTER(REGEX(?pages, ",|;"))
}
"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent="#DieDatenlaube")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    wdid = re.sub("http://www.wikidata.org/entity/","",result["Die_Gartenlaube"]["value"])
    print(result["issue"]["value"])
    print(result["pages"]["value"])
    #issues = result["issue"]["value"].split("–|;") #means a range
    issues = re.split("–|–|;|,",result["issue"]["value"])

    #Überprüfen ob einzelne issues ebenfalls bereits Heftnr.reihen sind - dann Loop durch Element
    # wenn nicht, dann check ob gesamte Issue-Angabe Heftnrreihe ist, dann Loop
    issuesIt = []
    for issueStr in issues:
        issueComp = re.findall("–|–",issueStr)
        if issueComp:
            issues = []
            issuesStr = re.split("–",issueStr)
            #Hier zuerst issueStr splitten, dann loopen
            for i in range(int(issuesStr[0]),int(issuesStr[1])+1):
                issues.append(i)
        
    if(len(issuesIt)==0): 
        x = re.findall("–|–",result["issue"]["value"])
        if x:
            for i in range(int(issues[0]),int(issues[1])+1):
                #print(i)
                issuesIt.append(i)
            issues = issuesIt
    
    #issues = issuesIt
    print(issues)
            
    pageRanges = re.split(";|,",result["pages"]["value"]) #means a range
    pageRanges = list(filter(None,pageRanges)) #entfernt leere Listeneinträge
    print (pageRanges)
    try:
        if (len(issues)==len(pageRanges)):
            print(result["Die_Gartenlaube"]["value"]+"\tArrays are ident")
            print("-"+wdid+"\t"+"P1433\t"+"Q655617") #remove published in
            
            print("-"+wdid+"\t"+"P304"+"\t\""+result["pages"]["value"]+"\"") #remove pages
            print("-"+wdid+"\t"+"P433"+"\t\""+result["issue"]["value"]+"\"") #remove issues
            j = 0
            pagesQual = ""
            for i in issues:
                print(wdid+"\tP304"+"\t\""+pageRanges[j]+"\"\t"+"P433\t\""+str(i)+"\"") #add pages with issues qualifier
                print(wdid+"\tP433\t\""+str(i)+"\""+"\tP304"+"\t\""+pageRanges[j]+"\"") #add issues with pages qualifier
                j = j+1
                pagesQual = pagesQual + "\tP304"+"\t\""+pageRanges[j]
            #add new published in statement with spliitted pages and concated issue qualifier
            print(wdid+"\t"+"P1433"+"Q655617"+pagesQual+"\tP433\t"+result["issue"]["value"])
            #print(issues[0]+" bis "+issues[1])
            
            j = 0
            for i in range(int(issues[0]), int(issues[1])+1):
                #print(i)
                #print(pageRanges[j])
                j = j+1
        else:
            print(result["Die_Gartenlaube"]["value"]+"\tarrays are not similar")
            
        
    except:
        print("not a range")
    print("== Next Item ===================================================================================")


14–18
177–180; 193–196; 209–212;193–196;221–224; 233–239
[14, 15, 16, 17, 18]
['177–180', ' 193–196', ' 209–212', '193–196', '221–224', ' 233–239']
http://www.wikidata.org/entity/Q19188730	arrays are not similar
7–10
97–100, 113–116, 129–132, 145–148, 150
[7, 8, 9, 10]
['97–100', ' 113–116', ' 129–132', ' 145–148', ' 150']
http://www.wikidata.org/entity/Q19189979	arrays are not similar
4; 9
52–54; 122–123
['4', ' 9']
['52–54', ' 122–123']
http://www.wikidata.org/entity/Q19134045	Arrays are ident
-Q19134045	P1433Q655617
-Q19134045	P304	"52–54; 122–123"
-Q19134045	P433	"4; 9"
Q19134045	P304	"52–54"	P433	"4"
Q19134045	P433	"4"	P304	"52–54"
Q19134045	P304	" 122–123"	P433	" 9"
Q19134045	P433	" 9"	P304	" 122–123"
not a range
40–48
669–675, 685–691, 702–707, 718–723, 732–738, 751–755, 766–771, 784–787, 800–803
[40, 41, 42, 43, 44, 45, 46, 47, 48]
['669–675', ' 685–691', ' 702–707', ' 718–723', ' 732–738', ' 751–755', ' 766–771', ' 784–787', ' 800–803']
http://www.wikidata.org/entity/Q62617246

Q62611854	P433	"8"	P304	"124–128"
not a range
41–43
591–593, 608-609, 623-624
[41, 42, 43]
['591–593', ' 608-609', ' 623-624']
http://www.wikidata.org/entity/Q62611316	Arrays are ident
-Q62611316	P1433Q655617
-Q62611316	P304	"591–593, 608-609, 623-624"
-Q62611316	P433	"41–43"
Q62611316	P304	"591–593"	P433	"41"
Q62611316	P433	"41"	P304	"591–593"
Q62611316	P304	" 608-609"	P433	"42"
Q62611316	P433	"42"	P304	" 608-609"
Q62611316	P304	" 623-624"	P433	"43"
Q62611316	P433	"43"	P304	" 623-624"
not a range
36–43
575–578, 591–594, 607–610, 623–627, 653–657, 659–662, 675–678, 702–704
[36, 37, 38, 39, 40, 41, 42, 43]
['575–578', ' 591–594', ' 607–610', ' 623–627', ' 653–657', ' 659–662', ' 675–678', ' 702–704']
http://www.wikidata.org/entity/Q62613790	Arrays are ident
-Q62613790	P1433Q655617
-Q62613790	P304	"575–578, 591–594, 607–610, 623–627, 653–657, 659–662, 675–678, 702–704"
-Q62613790	P433	"36–43"
Q62613790	P304	"575–578"	P433	"36"
Q62613790	P433	"36"	P304	"575–578"
Q62613790	P304	" 591–594"	