From 860902337971f8e0c92dc2ffc567bbe33e53a7e6 Mon Sep 17 00:00:00 2001 From: Shakil Thakur Date: Mon, 14 Nov 2016 21:31:14 -0600 Subject: [PATCH] initial release and updated readme --- README.md | 5 +++-- spec/unidecode_spec.lua | 3 ++- ...m-0.rockspec => unicorndecode-1.0.0-1.rockspec | 8 ++++---- unicorndecode.lua | 15 +++++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) rename unicorndecode-scm-0.rockspec => unicorndecode-1.0.0-1.rockspec (56%) diff --git a/README.md b/README.md index 644f7af..9ac4eca 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,11 @@ [![Build Status](https://travis-ci.org/FourierTransformer/unicorndecode.svg?branch=master)](https://travis-ci.org/FourierTransformer/unicorndecode) [![Coverage Status](https://coveralls.io/repos/github/FourierTransformer/unicorndecode/badge.svg?branch=master)](https://coveralls.io/github/FourierTransformer/unicorndecode?branch=master) -unicorndecode is a port of the [Text::Unicode](http://search.cpan.org/perldoc/Text::Unidecode) perl library into lua. It attempts to take unicode characters and represent them in ASCII. It does this by removing accents or transliterating other languages into Roman characters - which can occasionally work well and sometimes not so well. I mainly use it for removing accents on Roman characters. +unicorndecode is a port of the [Text::Unidecode](http://search.cpan.org/perldoc/Text::Unidecode) perl library into lua. It attempts to take unicode characters and represent them in ASCII. It does this by removing accents or transliterating other languages into Roman characters - which can occasionally work well and sometimes not so well! ## Installing unicorndecode is installed via luarocks: +It works out of the box with Lua 5.2/5.3, LuaJIT 2.0/2.1 and will work with Lua 5.1 if [luabitop](https://luarocks.org/modules/luarocks/luabitop) is installed. ``` luarocks install unicorndecode @@ -29,4 +30,4 @@ In this case, `decodedString` is `Bronte` and `isUTF8` is `true`. 
- The `unidecode_data.lua` table is created from the JSON file generated in [UnicodeConverter](https://github.com/FourierTransformer/UnidecodeConverter) passed through `misc_scripts/convert_json_to_lua_table.lua`. ## License -This library is released under the [MIT License](LICENSE) \ No newline at end of file +This library is released under the [MIT License](LICENSE) diff --git a/spec/unidecode_spec.lua b/spec/unidecode_spec.lua index 1b8884a..b506b9e 100644 --- a/spec/unidecode_spec.lua +++ b/spec/unidecode_spec.lua @@ -4,7 +4,8 @@ local unidecodeTest = { {"Brontë", "Bronte"}, {"Herp", "Herp"}, {"北亰", "Bei Jing"}, - {"læti", "laeti"} + {"læti", "laeti"}, + {"😂", "[?]"} } describe("unicorndecode", function() diff --git a/unicorndecode-scm-0.rockspec b/unicorndecode-1.0.0-1.rockspec similarity index 56% rename from unicorndecode-scm-0.rockspec rename to unicorndecode-1.0.0-1.rockspec index 32815ce..37554a2 100644 --- a/unicorndecode-scm-0.rockspec +++ b/unicorndecode-1.0.0-1.rockspec @@ -1,14 +1,14 @@ package = "unicorndecode" -version = "scm-0" +version = "1.0.0" source = { - url = "git://github.com/FourierTransformer/unicornDecode.git", + url = "git://github.com/FourierTransformer/unicorndecode.git", } description = { summary = "Unidecode for Lua", detailed = [[ - This is a port of unidecode written in Lua. It allows you to convert UTF-8 characters into similar-looking ASCII. + This is a port of perl's Text::Unidecode written in Lua. It allows you to convert UTF-8 characters into similar-looking ASCII characters. 
]], - homepage = "https://github.com/FourierTransformer/unicornDecode", + homepage = "https://github.com/FourierTransformer/unicorndecode", maintainer = "Shakil Thakur ", license = "MIT" } diff --git a/unicorndecode.lua b/unicorndecode.lua index 1274f6a..591f187 100644 --- a/unicorndecode.lua +++ b/unicorndecode.lua @@ -1,5 +1,5 @@ local unicorndecode = { - _VERSION = 'unicorndecode scm-0', + _VERSION = 'unicorndecode 1.0.0', _DESCRIPTION = 'Unidecode for Lua', _URL = 'https://github.com/FourierTransformer/unicorndecode', _LICENSE = [[ @@ -55,11 +55,14 @@ else -- determines how many additional bytes are needed to parse the unicode char -- NOTE: assumes the UTF-8 input is clean - which may get dangerous. local function additionalBytes(val) - if val >= 252 then - return 5, 252 - elseif val >= 248 then - return 4, 248 - elseif val >= 240 then + -- these don't really exist yet... + -- and are definitely not in the data tables... + -- if val >= 252 then + -- return 5, 252 + -- elseif val >= 248 then + -- return 4, 248 + -- elseif val >= 240 then + if val >= 240 then return 3, 240 elseif val >= 224 then return 2, 224