-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding Nautilus Retriever to original library wide object [but without cache capabilities]
- Loading branch information
1 parent
523d4e1
commit 388a8a3
Showing
202 changed files
with
106,377 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
""" | ||
""" | ||
import io
import logging
import os.path
from glob import glob
from math import ceil

from MyCapytain.common.reference import URN, Reference
from MyCapytain.common.utils import xmlparser
from MyCapytain.errors import InvalidURN
from MyCapytain.resolvers.prototypes import Resolver
from MyCapytain.resources.collections.cts import TextInventory, TextGroup, Work, Citation
from MyCapytain.resources.texts.locals.tei import Text
|
||
|
||
class CTSCapitainsLocalResolver(Resolver):
    """ XML Folder Based resolver.

    :param resource: Resource should be a list of folders retaining data as Capitains Guidelines Repositories
    :type resource: [str]
    :param name: Key used to differentiate Repository and thus enabling different repo to be used
    :type name: str
    :param logger: Logging object
    :type logger: logging

    :cvar TEXT_CLASS: Text Class [not instantiated] to be used to parse Texts. Can be changed to support Cache for example
    :type TEXT_CLASS: class
    :cvar DEFAULT_PAGE: Default Page to show
    :cvar PER_PAGE: Tuple representing the minimal number of texts returned, the default number and the maximum number of texts returned
    """
    TEXT_CLASS = Text
    DEFAULT_PAGE = 1
    PER_PAGE = (1, 10, 100)  # Min, Default, Max

    @property
    def inventory(self):
        """ Inventory filled in by :meth:`parse`

        :return: The TextInventory created at initialization
        """
        return self.__inventory__

    @property
    def texts(self):
        """ Text metadata objects registered by :meth:`parse`

        :return: List of text metadata objects that have a citation scheme
        """
        return self.__texts__

    def __init__(self, resource, name=None, logger=None):
        """ Initiate the XMLResolver and parse the given folders right away

        :param resource: List of folders to parse
        :param name: Name of the repository (defaults to "repository")
        :param logger: Logger to use; when omitted, logging.getLogger(name) is used
        """
        self.__inventory__ = TextInventory()
        self.__texts__ = []
        self.name = name

        self.logger = logger
        if not logger:
            # NOTE(review): this runs before the default name is applied, so a
            # missing name resolves to logging.getLogger(None). Kept as-is.
            self.logger = logging.getLogger(name)

        if not name:
            self.name = "repository"

        self.TEXT_CLASS = type(self).TEXT_CLASS
        self.works = []

        self.parse(resource)

    def xmlparse(self, file):
        """ Parse a XML file

        :param file: Opened File
        :return: Tree
        """
        return xmlparser(file)

    def parse(self, resource):
        """ Parse a list of directories and register what is found in them

        For each folder, every ``data/*/__cts__.xml`` file is read as a textgroup
        and every ``__cts__.xml`` one level below as a work. Each text of a
        work is expected in the work directory as
        ``{textgroup}.{work}.{version}.xml``.

        Errors are logged, not raised: a failure while reading one text skips
        that text, a failure anywhere else skips the rest of the textgroup.

        :param resource: List of folders
        :return: An inventory resource and a list of Text metadata-objects
        """
        for folder in resource:
            textgroup_files = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
            for textgroup_file in textgroup_files:
                try:
                    with io.open(textgroup_file) as xml:
                        textgroup = TextGroup(
                            resource=xml
                        )
                        str_urn = str(textgroup.urn)
                        # A textgroup may already be known from another folder
                        if str_urn in self.inventory.textgroups:
                            self.inventory.textgroups[str_urn].update(textgroup)
                        else:
                            self.inventory.textgroups[str_urn] = textgroup

                    known_group = self.inventory.textgroups[str_urn]
                    work_files = glob(
                        "{parent}/*/__cts__.xml".format(parent=os.path.dirname(textgroup_file))
                    )
                    for work_file in work_files:
                        with io.open(work_file) as xml:
                            work = Work(
                                resource=xml,
                                parents=[known_group]
                            )
                            work_urn = str(work.urn)
                            if work_urn in known_group.works:
                                known_group.works[work_urn].update(work)
                            else:
                                known_group.works[work_urn] = work

                        for text_key in work.texts:
                            # Use the object held by the inventory, which may
                            # differ from `work` after an update()
                            text = known_group.works[work_urn].texts[text_key]
                            text.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                                directory=os.path.dirname(work_file),
                                textgroup=text.urn.textgroup,
                                work=text.urn.work,
                                version=text.urn.version
                            )
                            if not os.path.isfile(text.path):
                                self.logger.error("%s is not present", text.path)
                                continue

                            try:
                                with io.open(text.path) as f:
                                    parsed = Text(resource=self.xmlparse(f))
                                    # Rebuild the citation chain from the last
                                    # level up, so each new level can take the
                                    # previous one as its child
                                    cites = list()
                                    for cite in [c for c in parsed.citation][::-1]:
                                        if len(cites) >= 1:
                                            cites.append(Citation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name,
                                                child=cites[-1]
                                            ))
                                        else:
                                            cites.append(Citation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name
                                            ))
                                # The last Citation built is the top level
                                text.citation = cites[-1]
                                self.logger.info("%s has been parsed ", text.path)
                                if text.citation:
                                    self.texts.append(text)
                                else:
                                    self.logger.error("%s has no passages", text.path)
                            except Exception:
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    text.path
                                )
                except Exception:
                    self.logger.error("Error parsing %s ", textgroup_file)

        return self.inventory, self.texts

    def __getText__(self, urn):
        """ Returns a Text object

        :param urn: URN of a text to retrieve
        :type urn: str, URN
        :return: Textual resource and metadata
        :rtype: (text.Text, inventory.Text)
        :raises InvalidURN: When the length of the URN is not 5
        """
        if not isinstance(urn, URN):
            urn = URN(urn)
        if len(urn) != 5:
            raise InvalidURN

        text = self.inventory[str(urn)]
        with io.open(text.path) as xml:
            resource = self.TEXT_CLASS(urn=urn, resource=self.xmlparse(xml))

        return resource, text

    def __getCapabilities__(self,
                            urn=None, page=None, limit=None,
                            lang=None, category=None, pagination=False
                            ):
        """ Retrieve a slice of the inventory filtered by given arguments

        :param urn: Partial URN to use to filter out resources
        :type urn: str
        :param page: Page to show
        :type page: int
        :param limit: Item Per Page
        :type limit: int
        :param lang: Language to filter on
        :type lang: str
        :param category: Type of elements to show ("edition" or "translation", any other value disables the filter)
        :type category: str
        :param pagination: Activate pagination
        :type pagination: bool
        :return: ([Matches], Page, Count)
        :rtype: ([Text], int, int)
        """
        part = None
        if urn is not None:
            # The length of the URN selects the level at which each text URN
            # is truncated before being compared with the filter
            part = [
                None, None, URN.NAMESPACE, URN.TEXTGROUP, URN.WORK, URN.VERSION, URN.COMPLETE
            ][len(URN(urn))]

        subtypes = ["edition", "translation"]
        matches = [
            text
            for text in self.__texts__
            if (lang is None or lang == text.lang) and
            (urn is None or text.urn.upTo(part) == urn) and
            (text.citation is not None) and
            (category not in subtypes or category.lower() == text.subtype.lower())
        ]

        if pagination:
            start_index, end_index, page, count = type(self).pagination(page, limit, len(matches))
        else:
            # Without pagination everything is returned and the page is 0
            start_index, end_index, page, count = None, None, 0, len(matches)

        return matches[start_index:end_index], page, count

    @staticmethod
    def pagination(page, limit, length):
        """ Help for pagination

        A missing or lower-than-1 page falls back to DEFAULT_PAGE. A missing or
        out-of-bounds limit falls back to the default of PER_PAGE. A page
        beyond the end of the list is replaced by the last available page.

        :param page: Provided Page
        :param limit: Number of item to show
        :param length: Length of the list to paginate
        :return: (Start Index, End Index, Page Number, Item Count)
        """
        minimum, default, maximum = CTSCapitainsLocalResolver.PER_PAGE

        page = page or CTSCapitainsLocalResolver.DEFAULT_PAGE
        if page < 1:
            # A negative page would otherwise produce negative slice indexes
            page = CTSCapitainsLocalResolver.DEFAULT_PAGE

        limit = limit or default
        if limit < minimum or limit > maximum:
            limit = default

        if length <= 0:
            # Nothing to paginate: empty slice on the default page
            return 0, 0, CTSCapitainsLocalResolver.DEFAULT_PAGE, 0

        # float() keeps ceil() meaningful under Python 2 integer division
        last_page = int(ceil(length / float(limit)))
        if page > last_page:
            page = last_page

        start = (page - 1) * limit
        end = min(start + limit, length)

        return start, end, page, end - start

    def getMetadata(self, objectId=None, **filters):
        """ Retrieve the text metadata objects matching an identifier

        :param objectId: Partial or complete URN used as filter
        :param filters: Accepted but currently not used
        :return: List of matching text metadata objects
        """
        return self.__getCapabilities__(urn=objectId)[0]

    def getPassage(self, textId, subreference=None, prevnext=False, metadata=False):
        """ Retrieve a passage of a text

        :param textId: URN of the text
        :param subreference: Reference of the passage
        :param prevnext: Accepted but currently not used
        :param metadata: Accepted but currently not used
        :return: Passage returned by the text's getPassage method
        """
        # __getText__ returns (text, metadata); only the text can give passages
        text, _ = self.__getText__(textId)
        return text.getPassage(Reference(subreference))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import unicode_literals | ||
|
||
from MyCapytain.resolvers.cts.local import CTSCapitainsLocalResolver | ||
from MyCapytain.common.utils import Mimetypes | ||
from MyCapytain.common.reference import URN, Reference | ||
from unittest import TestCase | ||
|
||
|
||
class TestXMLFolderResolver(TestCase):
    """ Tests of CTSCapitainsLocalResolver against the data in ./tests/testing_data """

    # Paths are relative to the working directory the tests are run from
    FARSI_LIT = "./tests/testing_data/farsiLit"
    LATIN_LIT = "./tests/testing_data/latinLit"
    LATIN_LIT_2 = "./tests/testing_data/latinLit2"

    def test_resource_parser(self):
        """ Test that the initiation finds correctly the resources """
        Repository = CTSCapitainsLocalResolver([self.FARSI_LIT])
        self.assertEqual(
            Repository.inventory["urn:cts:farsiLit:hafez"].urn, URN("urn:cts:farsiLit:hafez"),
            "Hafez is found"
        )
        self.assertEqual(
            len(Repository.inventory["urn:cts:farsiLit:hafez"].works), 1,
            "Hafez has one child"
        )
        self.assertEqual(
            Repository.inventory["urn:cts:farsiLit:hafez.divan"].urn, URN("urn:cts:farsiLit:hafez.divan"),
            "Divan is found"
        )
        self.assertEqual(
            len(Repository.inventory["urn:cts:farsiLit:hafez.divan"].texts), 3,
            "Divan has 3 children"
        )

    def test_text_resource(self):
        """ Test to get the text resource to perform other queries """
        Repository = CTSCapitainsLocalResolver([self.FARSI_LIT])
        text, metadata = Repository.__getText__("urn:cts:farsiLit:hafez.divan.perseus-eng1")
        self.assertEqual(
            len(text.citation), 4,
            "Object has a citation property of length 4"
        )
        self.assertEqual(
            text.getPassage(Reference("1.1.1.1")).export(output=Mimetypes.PLAINTEXT),
            "Ho ! Saki, pass around and offer the bowl (of love for God) : ### ",
            "It should be possible to retrieve text"
        )

    def test_get_capabilities(self):
        """ Check Get Capabilities """
        Repository = CTSCapitainsLocalResolver(
            [self.FARSI_LIT]
        )
        self.assertEqual(
            len(Repository.__getCapabilities__()[0]), 4,
            "General no filter works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(category="edition")[0]), 2,
            "Type filter works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(lang="ger")[0]), 1,
            "Filtering on language works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(category="edition", lang="ger")[0]), 0,
            "Type filter + lang works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(category="translation", lang="ger")[0]), 1,
            "Type filter + lang works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(page=1, limit=2, pagination=True)[0]), 2,
            "Pagination works without other filters"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(page=2, limit=2, pagination=True)[0]), 2,
            "Pagination works without other filters at list end"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(urn="urn:cts:farsiLit")[0]), 3,
            "URN Filtering works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(urn="urn:cts:latinLit")[0]), 1,
            "URN Filtering works"
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(urn="urn:cts:farsiLit:hafez.divan.perseus-eng1")[0]), 1,
            "Complete URN filtering works"
        )

    def test_get_shared_textgroup_cross_repo(self):
        """ Check that texts of a textgroup split over two folders are all found """
        Repository = CTSCapitainsLocalResolver(
            [
                self.FARSI_LIT,
                self.LATIN_LIT_2
            ]
        )
        self.assertIsNotNone(
            Repository.__getText__("urn:cts:latinLit:phi1294.phi002.perseus-lat2"),
            "We should find perseus-lat2"
        )
        self.assertIsNotNone(
            Repository.__getText__("urn:cts:latinLit:phi1294.phi002.opp-lat2"),
            "We should find opp-lat2"
        )

    def test_get_capabilities_nocites(self):
        """ Check Get Capabilities latinLit data"""
        Repository = CTSCapitainsLocalResolver(
            [self.LATIN_LIT]
        )
        self.assertEqual(
            len(Repository.__getCapabilities__(urn="urn:cts:latinLit:stoa0045.stoa008.perseus-lat2")[0]), 0,
            "Texts without citations were ignored"
        )

    def test_pagination(self):
        """ Check the (start, end, page, count) tuples computed by pagination """
        self.assertEqual(
            CTSCapitainsLocalResolver.pagination(2, 30, 150), (30, 60, 2, 30),
            " Pagination should return Array limits "
        )
        self.assertEqual(
            CTSCapitainsLocalResolver.pagination(4, 40, 150), (120, 150, 4, 30),
            " Pagination should return Array limits "
        )
        self.assertEqual(
            CTSCapitainsLocalResolver.pagination(5, 40, 150), (120, 150, 4, 30),
            " Pagination should return Array limits "
        )
        self.assertEqual(
            CTSCapitainsLocalResolver.pagination(5, 100, 150), (100, 150, 2, 50),
            " Pagination should give corrected page and correct count"
        )
        self.assertEqual(
            CTSCapitainsLocalResolver.pagination(5, 110, 150), (40, 50, 5, 10),
            " Pagination should use default limit (10) when getting too much "
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
<!-- CTS textgroup metadata: declares the textgroup urn:cts:farsiLit:hafez and its English group name. -->
<ti:textgroup xmlns:ti="http://chs.harvard.edu/xmlns/cts" projid="farsiLit:hafez" urn="urn:cts:farsiLit:hafez">
<ti:groupname xml:lang="eng">Hafez</ti:groupname>
</ti:textgroup> |
Oops, something went wrong.