Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Wikimedia sub spp #91

Closed
wants to merge 7 commits into from

1 participant

@hyanwong

These changes should allow us to properly categorise subspecies, varieties, and hybrids. They will allow much better parsing of flowers in particular, and will also capture important subspecies distinctions such as Felis silvestris catus, as was discussed recently on http://eol.org/communities/140/newsfeed

@hyanwong hyanwong closed this
@hyanwong

Made a new pull request which obsoletes this one.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on May 21, 2014
  1. @hyanwong
  2. @hyanwong
  3. @hyanwong

    correct syntax

    hyanwong authored
Commits on May 22, 2014
  1. @hyanwong

    Deal with hybrid (notho) names, using the multiplication sign. Also d…

    hyanwong authored
    …eal with varieties, and prettify unit testing.
  2. @hyanwong
Commits on May 23, 2014
  1. @hyanwong

    Remove deprecated (and not strictly public domain "Copyrighted free u…

    hyanwong authored
    …se provided that" license. Also insert non-breaking space after hybrid multiplication sign, after discussion with Michael Frankis on EOL curators.
  2. @hyanwong

    Change unit test to reflect changes to hybrid syntax (insert non-brea…

    hyanwong authored
    …king space between x and name)
This page is out of date. Refresh to see the latest.
View
113 tests/unit/test_connector_wikimedia.php
@@ -42,13 +42,13 @@ function testGPSandFlickr()
|permission={{User:Flickr upload bot/upload|date=13:56, 21 September 2007 (UTC)|reviewer=Dongio}}
{{cc-by-2.0}}
}}
-{{Location dec|36.13274|-5.348888}}
+{{Location|36|7|57.8634|N|5|20|55.9962|E}}
[[Category:Macaca sylvanus]]";
$this->assertTrue($p->information()['author'] == 'Karyn Sig');
-$this->assertTrue($p->point()['latitude'] === "36.13274");
-
+$this->assertTrue(number_format($p->point()['latitude'], 5) === "36.13274");
+$this->assertTrue(number_format($p->point()['longitude'], 6) === "-5.348888");
// We should probably check here that $p->get_data_object_parameters()['agents'][0]->role == 'photographer', and
// $p->get_data_object_parameters()['agents'][0]->fullName == 'Karyn Sig' but that relies on filling data_object_parameters with
// the results of an online API query using something like $pages = array($p); \WikimediaPage::process_pages_using_API($pages);
@@ -82,12 +82,7 @@ function testUnicodeInFilenamesAndDescriptions()
|Author = [[:pl:Wikipedysta:Aisog|Aisog]]
|Permission= / Creative Commons 2.5 Attribution
|other_versions =
-}}
-
-== {{int:license-header}} ==
-{{self2|GFDL|cc-by-2.5-pl|migration=redundant}}
-{{Self|GFDL|Cc-by-sa-3.0-migrated}}
-[[Category:Viola × wittrockiana]]</text>
+}}</text>
<sha1>o8m7xztilhi61i3yk2fgcvxx6xr2ej6</sha1>
<model>wikitext</model>
<format>text/x-wiki</format>
@@ -119,18 +114,7 @@ function testUnicodeInFilenamesAndDescriptions()
|permission=
|other_versions=
|other_fields=
-}}
-
-=={{int:license-header}}==
-{{self|cc-by-sa-3.0}}
-
-
-[[Category:Uploaded with UploadWizard]]
-[[Category:Malayalam Wikipedian's Upload]]
-[[Category:Pests on fruit and vegetables]]
-[[Category:Snails]]
-[[Category:Achatina fulica]]
-[[Category:Uploads by Ajay]]</text>
+}}</text>
<sha1>44v0txnedfh5gk011ki9syyvi8llcgd</sha1>
<model>wikitext</model>
<format>text/x-wiki</format>
@@ -141,13 +125,94 @@ function testUnicodeInFilenamesAndDescriptions()
$page1 = new \WikimediaPage($xml1);
$page2 = new \WikimediaPage($xml2);
+ //check capitalization doesn't mangle unicode
$this->assertTrue(\WikiParser::make_valid_pagetitle($page1->title) === $page1->title);
+ $this->assertTrue(\WikiParser::make_valid_pagetitle($page2->title) === $page2->title);
+
+ //check unicode makes it through to description field
$this->assertTrue(Functions::is_utf8($page1->description()));
$this->assertTrue(preg_match("/fiołek/u", $page1->description()));
- $this->assertTrue(\WikiParser::make_valid_pagetitle($page2->title) === $page2->title);
}
+ function testRecursiveIncludesPlusSubspeciesVarietiesAndHybrids()
+ {
+ $include1_xml = <<<XML
+ <page>
+ <title>Template:Orchidaceae (APG)</title>
+ <ns>10</ns>
+ <id>14618876</id>
+ <revision>
+ <id>105760723</id>
+ <parentid>78475379</parentid>
+ <timestamp>2013-09-29T16:24:22Z</timestamp>
+ <contributor>
+ <username>Liné1</username>
+ <id>80857</id>
+ </contributor>
+ <comment>| mustBeEmpty={{{classification|}}}{{{genus|}}}}}</comment>
+ <text xml:space="preserve">{{TaxonavigationIncluded2|
+classification=APG III|include=Angiosperms|Cladus|monocots|Ordo|Asparagales|Familia|Orchidaceae|rank={{{rank|}}}|
+categorizeSubtribesIn=Orchidaceae|&lt;!--categorizeSpeciesIn &amp; categorizeGeneraIn are subtily managed--&gt;categorizeTribesIn=Orchidaceae|
+mustBeEmpty={{{classification|}}}{{{genus|}}}}}</text>
+ <sha1>2r711pfbmrznvqwr6ntf4jlnlx8rua2</sha1>
+ <model>wikitext</model>
+ <format>text/x-wiki</format>
+ </revision>
+ </page>
+XML;
+ $include2_xml = <<<XML
+ <page>
+ <title>Template:Angiosperms</title>
+ <ns>10</ns>
+ <id>13862146</id>
+ <revision>
+ <id>123628829</id>
+ <parentid>78733139</parentid>
+ <timestamp>2014-05-10T12:13:25Z</timestamp>
+ <contributor>
+ <username>FrescoBot</username>
+ <id>1047183</id>
+ </contributor>
+ <minor />
+ <comment>Bot: [[User:FrescoBot/link syntax|link syntax]]</comment>
+ <text xml:space="preserve">{{TaxonavigationIncluded|Domain|Eukaryota|(unranked)|Archaeplastida|Regnum|Plantae|Cladus|angiosperms|rank={{{rank|}}}|
+ categorizeFamiliesIn=Plantae|documentTemplate={{{documentTemplate|yes}}}|documentTemplateWithClassification=APG III|categorizeTemplate={{{categorizeTemplate|yes}}} }}</text>
+ <sha1>kxneoukzggybsv69lat0npg8aexxwx9</sha1>
+ <model>wikitext</model>
+ <format>text/x-wiki</format>
+ </revision>
+ </page>
+XML;
+
+ $p1 = new \WikimediaPage('<xml/>');
+ $p1->text = "{{Taxonavigation|include=Orchidaceae (APG)|Subfamilia|Orchidoideae|Tribus|Orchideae|Subtribus|Orchidinae|
+Nothospecies|Anacamptis × gennarii|
+Nothosubspecies|Anacamptis × gennarii ssp bornemanniae|
+authority=(Asch.) H.Kretzschmar, Eccarius & H.Dietr. (2007)}}";
+ //this is a fake example, not many wikimedia entries are formatted like this, but we should be able to cope with them
+ $p2 = new \WikimediaPage('<xml/>');
+ $p2->text = "{{Taxonavigation|include=Orchidaceae (APG)|Subfamilia|Orchidoideae|Tribus|Orchideae|Subtribus|Orchidinae|
+Nothogenus|× Anacamptis|
+Nothospecies|gennarii|
+Nothovarietas|dummy|}}";
+
+ $dummy_harvester= new WikimediaHarvester(null);
+ $dummy_harvester->locate_taxonomic_pages($include1_xml);
+ $dummy_harvester->locate_taxonomic_pages($include2_xml);
+ $taxonomy1 = $p1->taxonomy($dummy_harvester->taxonav_includes);
+ $taxonomy2 = $p2->taxonomy($dummy_harvester->taxonav_includes);
+
+ //test whether recursive includes have managed to find the kingdom name
+ $this->assertTrue($taxonomy1->asEoLtaxonObject()["kingdom"] == "Plantae");
+
+ //test if we have managed to reconstruct the genus name from the species name
+ $this->assertTrue($taxonomy1->asEoLtaxonObject()["genus"] == "Anacamptis");
+
+ //test whether the scientific name is properly formed, e.g. ssp replaced with subsp.
+ $this->assertTrue($taxonomy1->scientificName() == html_entity_decode("Anacamptis &times;&nbsp;gennarii subsp. bornemanniae (Asch.) H.Kretzschmar, Eccarius & H.Dietr. (2007)"));
+ $this->assertTrue($taxonomy2->scientificName() == html_entity_decode("&times;&nbsp;Anacamptis gennarii var. dummy"));
+ }
function testTaxonomyConflict()
{
@@ -280,8 +345,7 @@ classification=IOC|
</page>
XML;
- $dummy_resource = null;
- $dummy_harvester = new WikimediaHarvester($dummy_resource);
+ $dummy_harvester = new WikimediaHarvester(null);
$dummy_harvester->locate_taxonomic_pages($include1_xml);
$dummy_harvester->locate_taxonomic_pages($include2_xml);
@@ -307,7 +371,6 @@ classification=IOC|
foreach($names as $name) $this->assertFalse($name === "Passeriformes");
}
-
}
?>
View
116 vendor/wikipedia/WikimediaPage.php
@@ -325,8 +325,8 @@ public function best_license($potential_licenses, $is_wikitext = true)
$identified_licenses = array();
foreach($potential_licenses as $potential_license)
{
- // PD-USGov-CIA-WF
- if(preg_match("/^(PD|Public domain.*|CC-PD|usaid|nih|noaa|CopyrightedFreeUse|Copyrighted Free Use)($| |-)/mui", $potential_license))
+ // catch e.g. PD-USGov-CIA-WF, etc and Copyrighted_free_use (but *not* "Copyrighted free use provided that")
+ if(preg_match("/^(PD|Public[ _]domain.*|CC-PD|usaid|nih|noaa|CopyrightedFreeUse|Copyrighted[ _]Free[ _]Use(?![ _]provided[ _]that))($| |-)/mui", $potential_license))
{
$identified_licenses[] = array(
'license' => 'public domain',
@@ -994,9 +994,11 @@ public static function check_page_titles($array_of_titles)
class TaxonomyParameters
{
- // listed as most precise to least precise
- // 'species' is not part of the EoL output, but may be used to construct scientificName later
+ // Wiki names from https://commons.wikimedia.org/wiki/Template:Taxonavigation, listed here from most precise to least precise
+ // 'subspecies' and 'species' are not part of the EoL output, but may be used to construct scientificName later
public static $wiki_to_standard = array(
+ "Varietas" => "variety",
+ "Subspecies"=> "subspecies",
"Species" => "species",
"Genus" => "genus",
"Familia" => "family",
@@ -1004,6 +1006,7 @@ class TaxonomyParameters
"Classis" => "class",
"Phylum" => "phylum",
"Regnum" => "kingdom");
+ public static $extra_params = array("Authority" => "authority");
private $taxon_params = array();
private $page_timestamp;
public $authority;
@@ -1023,12 +1026,18 @@ public function get($standard_rank)
public function add_wiki_info($wiki_rank, $wikitext)
{
- $wiki_rank = WikiParser::mb_ucfirst(WikiParser::mb_trim($wiki_rank));
+ $wiki_rank = WikiParser::mb_trim($wiki_rank);
+ //remove notho- designation, see http://ibot.sav.sk/icbn/frameset/0071AppendixINoHa003.htm
+ $wiki_rank = preg_replace("/^notho/i", "", $wiki_rank);
+ $wiki_rank = WikiParser::mb_ucfirst($wiki_rank);
$text = strip_tags(WikiParser::strip_syntax($wikitext));
- if($wiki_rank === 'Authority') return $this->add_info('authority', $text);
- if(empty(self::$wiki_to_standard[$wiki_rank])) return "";
- return $this->add_info(self::$wiki_to_standard[$wiki_rank], $text);
+ //translate all listed in $wiki_to_standard, plus the Authority field
+ $allowed_params = self::$wiki_to_standard + self::$extra_params;
+ if (array_key_exists($wiki_rank, $allowed_params))
+ {
+ return $this->add_info($allowed_params[$wiki_rank], $text);
+ }
}
public function add_info($rank, $text)
@@ -1056,9 +1065,16 @@ public function add_info($rank, $text)
if(preg_match("/^fossil (.*)$/i", $name, $arr)) $name = WikiParser::mb_ucfirst(trim($arr[1]));
// don't set anything if the string is empty
if($name === '') return $return_message;
+
+ /* Make hybrid names a single word, replacing space after the × sign with a non-breaking space
+ Treat X, x or × as hybrid indicators if they are at the start or preceded by a space, e.g. "X Cleistoza" becomes
+ ×Cleistoza and Salix × pendulina becomes Salix ×pendulina. This also helps us delimit species and genera names */
+ static $multiply_sign_and_nonbreaking_space = "× "; //make sure the "space" in this string is actually a NBSP
+ $name = preg_replace("/(?<=^| )[×x] +/iu", $multiply_sign_and_nonbreaking_space, $name);
+
if($rank === 'genus')
{
- if(preg_match("/^([A-Z][^ ]+) [a-z]/", $name, $arr))
+ if(preg_match("/^([A-Z×][^ ]+) [a-z×]/u", $name, $arr))
{
// careful with e.g. Category:Rosa_laxa which has Genus = 'Rosa species'
$return_message = "Multi-word genus ($name) getting shortened to ". $arr[1];
@@ -1071,48 +1087,98 @@ public function add_info($rank, $text)
$name = $arr[1];
}
// species was set with just the epithet
- if(!empty($this->taxon_params['species']) && !preg_match("/\s/", $this->taxon_params['species']))
+ if(!empty($this->taxon_params['species']) && !preg_match("/ /", $this->taxon_params['species']))
{
$this->taxon_params['species'] = $name . ' ' . $this->taxon_params['species'];
}
}
- if($rank === 'species')
+ if(($rank === 'species') || ($rank === 'subspecies') || ($rank === 'variety'))
{
- /* TODO - caution here with virus species names, which can contain multiple words and capitals, something like
+ /* TODO - caution here with virus species names, which can contain multiple words and capitals. We need something like
if ($this->taxon_params['domain'] != "Viruses") ...
- only we don't currently store the domain name
- */
- // multiple words in species (this is the norm)
+ only we don't currently store the domain name, so we can't check. This is only a problem if the Species field contains a
+ multi-word epithet that happens to start with a capital letter, and we haven't yet defined a genus (pretty rare), in which case we will
+ assume a genus name from the first word of the epithet, or if Species field contains a single-word epithet with caps, in which case
+ the epithet will be ignored and a warning given (so e.g. we currently miss https://commons.wikimedia.org/wiki/Category:Theilovirus) */
+
+ // multiple words in (sub)species (this is the norm)
if(preg_match("/ /", $name))
{
- if(empty($this->taxon_params['genus']) && preg_match("/^([A-Z][^ ]+) [a-z]/", $name, $arr))
+ if(empty($this->taxon_params['genus']) && preg_match("/^([A-Z×][^ ]+) [a-z×]/u", $name, $arr))
{
$this->taxon_params['genus'] = $arr[1];
- if($GLOBALS['ENV_DEBUG']) $return_message = "Genus ".$this->taxon_params['genus']." initially set from species name ('$name'). ";
+ if($GLOBALS['ENV_DEBUG']) $return_message = "Genus ".$this->taxon_params['genus']." initially set from $rank name ('$name'). ";
+ }
+ if ($rank === 'subspecies')
+ {
+ //sometimes people forget to put the dot after subsp. or have ssp instead. Standardise these
+ $name = preg_replace("/ (subsp|ssp\.?) /i", " subsp. ", $name);
+ }
+ if ($rank === 'variety')
+ {
+ //sometimes people forget to put the dot after subsp. or var. Standardise these
+ $name = preg_replace("/ (subsp|ssp\.?) /i", " subsp. ", $name);
+ $name = preg_replace("/ var /i", " var. ", $name);
+ //TODO - we don't cope with multi-word varieties which are epithets, most likely seen incorrectly in
+ //cultivars, e.g. Varietas='my variety name'.
}
}
- // single word in 'species' - this could be an epithet
+ // single word in 'species' or 'subspecies' - this could be an epithet
else
{
if(mb_strtolower($name, "UTF-8") != $name)
{
- $return_message = "Single-word species ('$name') has CaPs: ignoring this part of the classification. ";
+ $return_message = "Single-word $rank ('$name') has CaPs: ignoring this part of the classification. ";
return $return_message;
}
- if(!empty($this->taxon_params['genus']))
+ if ($rank === 'species')
{
+ if (empty($this->taxon_params['genus']))
+ {
+ $return_message .= "Single word specific name, but no genus given: ignoring the species information. ";
+ return $return_message;
+ }
$name = $this->taxon_params['genus'] . ' ' . $name;
}
+ if ($rank === 'subspecies')
+ {
+ if (empty($this->taxon_params['species']))
+ {
+ $return_message .= "Single word subspecific name, but no species given: ignoring the subspecific information. ";
+ return $return_message;
+ }
+ $name = $this->taxon_params['species'] . ' ' . $name;
+ }
+ if ($rank === 'variety')
+ {
+ //this can exist alongside a subspecies, e.g. Category:Brassica rapa subsp. nipposinica var. perviridis
+ //only in plants, where we must insert a 'var.'
+ if (empty($this->taxon_params['subspecies']))
+ {
+ if (empty($this->taxon_params['species']))
+ {
+ $return_message .= "Single word variety name, but neither species nor subspecies given: ignoring the variety information. ";
+ return $return_message;
+ }
+ $name = $this->taxon_params['species'] . ' var. ' . $name;
+ } else {
+ $name = $this->taxon_params['subspecies'] . ' var. ' . $name;
+ }
+ }
}
}else
{
- while(preg_match("/( \(.*?\))/", $name, $arr)) $name = str_replace($arr[1], '', $name);
+ /* By wikimedia commons convention, taxon names like "Zeus", "Viola", or "Turbo" that already have unrelated wikimedia pages
+ are given gallery and category names like "Zeus (fish)", "Viola (Violaceae)" and "Turbo (genus)" which appear as Taxonavigation names.
+ So we should remove any terminal part of the name that is bracketed */
+ $name = preg_replace("/ \(.*?\)/", "", $name);
+
if(preg_match("/[ \(\)]/", $name))
{
// We make an exception here for classes 'Gamma Proteobacteria', 'Alpha Proteobacteria' etc.
if(!preg_match("/^\w+ proteobacteria$/i", $name))
{
- $return_message .= "A classification level above that of species ($rank = '$name') has issues with brackets or spaces. ";
+ $return_message .= "A classification level above that of species ($rank = '$name') has brackets or spaces: ignoring it. ";
return $return_message;
}
}
@@ -1150,8 +1216,10 @@ public function scientificName()
public function asEoLtaxonObject()
{
// calculate what EoL needs from the levels that we know about
- static $spp = array('species' => null);
- $array_to_return = array_diff_key($this->taxon_params, $spp); // "species" level detail in EoL is contained in scientificName
+
+ static $not_returned = array('species' => null, 'subspecies' => null, 'variety' => null);
+ // species and lower level detail in EoL is contained in scientificName, so we don't return these fields
+ $array_to_return = array_diff_key($this->taxon_params, $not_returned);
$array_to_return['scientificName'] = $this->scientificName();
$array_to_return['dataObjects'] = array();
return $array_to_return;
Something went wrong with that request. Please try again.