Skip to content

Commit

Permalink
merged README from subdirectory
Browse files Browse the repository at this point in the history
  • Loading branch information
fileformat committed Jul 23, 2017
1 parent 3163883 commit a18077b
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 7 deletions.
7 changes: 7 additions & 0 deletions README.md
Expand Up @@ -30,3 +30,10 @@ Other Comparison Sites
* [Wikipedia](https://commons.wikimedia.org/wiki/Emoji)
* [Emoji.codes family tree](https://emoji.codes/family)

Unicode Emoji Data
------------------
* [Unicode Technical Standard 51](http://unicode.org/reports/tr51/tr51-12.html) - the standard
* [Data files](http://unicode.org/Public/emoji/5.0/)
* [Comparison table](http://unicode.org/emoji/charts/full-emoji-list.html) - with images
* [Charts](http://unicode.org/emoji/charts/index.html)

130 changes: 130 additions & 0 deletions bin/update-db.py
@@ -0,0 +1,130 @@
#!/usr/bin/python3
#
# parse emoji data into json
#

import argparse
import json
import os
import re
import sys

def to_hex(i: int) -> str:
    """Return integer ``i`` as uppercase hexadecimal, at least four digits wide.

    Values below 0x1000 are left-padded with zeros (``0x23`` -> ``"0023"``);
    larger values are rendered with as many digits as they need
    (``0x1F600`` -> ``"1F600"``).
    """
    # "%04X" only sets a *minimum* width, so values above 0xFFFF need no
    # separate branch: they are never truncated.
    return "%04X" % i

# Records collected so far, keyed by the code point string taken from the
# first field of each data line.
emojis = dict()


# Pattern pieces: hex code points (space separated), ';', a lowercase/hyphen
# status word, '# ', the rendered characters (no spaces), then free text.
line_pattern = re.compile("([A-F0-9 ]+);([-a-z ]+)# ([^ ]+) (.*)$")
filename = 'emoji-test.txt'
sys.stdout.write("INFO: processing file '%s'" % filename)
line_count = 0
emoji_count = 0
# The context manager guarantees the file is closed even if a line fails to
# process part-way through the loop.
with open(filename, mode='r', encoding='utf-8') as f:
    for rawline in f:
        line_count += 1
        # Progress indicator: one dot per 100 input lines.
        if line_count % 100 == 0:
            sys.stdout.write(".")

        line = rawline.strip()

        # Skip blank lines and comment lines.
        if not line or line[0] == '#':
            continue

        # NOTE: counted before matching, so lines that fail to match are
        # still included in the "emoji processed" total.
        emoji_count += 1

        matcher = line_pattern.search(line)
        if not matcher:
            sys.stdout.write("\nERROR: no match on line %d ('%s')" % (line_count, line))
            continue

        emoji = {}
        emoji['codepoints'] = matcher.group(1).strip()
        emoji['status'] = matcher.group(2).strip()
        emoji['chars'] = matcher.group(3)
        emoji['text'] = matcher.group(4).strip()

        emojis[emoji['codepoints']] = emoji

sys.stdout.write("\n")
sys.stdout.write("INFO: complete %d lines processed\n" % line_count)
sys.stdout.write("INFO: complete %d emoji processed\n" % emoji_count)


# Pattern pieces: a hex code point or "first..last" range, ';', a property
# name, '#' plus spaces, a version token (no spaces), then free text.
line_pattern = re.compile("([.A-F0-9 ]+);([-A-Za-z_ ]+)# +([^ ]+) (.*)$")
filename = 'emoji-data.txt'
sys.stdout.write("INFO: processing file '%s'" % filename)
line_count = 0
emoji_count = 0
new_count = 0
# The context manager guarantees the file is closed even if a line fails to
# process part-way through the loop.
with open(filename, mode='r', encoding='utf-8') as f:
    for rawline in f:
        line_count += 1
        # Progress indicator: one dot per 100 input lines.
        if line_count % 100 == 0:
            sys.stdout.write(".")

        line = rawline.strip()

        # Skip blank lines and comment lines.
        if not line or line[0] == '#':
            continue

        matcher = line_pattern.search(line)
        if not matcher:
            sys.stdout.write("\nERROR: no match on line %d ('%s')" % (line_count, line))
            continue

        # First field is either a single code point or an inclusive range
        # written as "first..last".
        spec = matcher.group(1).strip()
        if ".." not in spec:
            codepoints = [spec]
        else:
            first, _, last = spec.partition("..")
            # Expand the whole inclusive range. The upper bound must come
            # from the part after "..": using the lower bound for both ends
            # would yield only the first code point of every range.
            codepoints = [
                to_hex(value)
                for value in range(int(first, 16), int(last, 16) + 1)
            ]

        property_name = matcher.group(2).strip()
        version = matcher.group(3).strip()

        for codepoint in codepoints:
            emoji_count += 1
            if codepoint in emojis:
                emoji = emojis[codepoint]
            else:
                # Code point not seen in the earlier pass: create a minimal
                # record for it.
                new_count += 1
                emoji = {}
                emoji['codepoints'] = codepoint
                emoji['chars'] = chr(int(codepoint, 16))
                emojis[codepoint] = emoji

            if 'property' not in emoji:
                emoji['property'] = {}

            emoji['property'][property_name] = True
            # The version from the most recently processed line wins.
            emoji['version'] = version

sys.stdout.write("\n")
sys.stdout.write("INFO: complete %d lines processed\n" % line_count)
sys.stdout.write("INFO: complete %d emoji processed\n" % emoji_count)
sys.stdout.write("INFO: complete %d emoji added\n" % new_count)

# Write the collected records as pretty-printed, key-sorted JSON.
filename = "output.json"
sys.stdout.write("INFO: saving to file '%s'\n" % filename)
# The context manager closes the file even if serialization or the write
# raises; ensure_ascii=False keeps the emoji characters as literal UTF-8.
with open(filename, mode='w', encoding='utf-8') as f:
    json.dump(emojis, f, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
sys.stdout.write("INFO: save complete: %d emoji\n" % len(emojis))

# TODO: SQL export is not implemented. The file is created (or truncated)
# and left empty, although the log lines below still report a completed save.
filename = "output.sql"
sys.stdout.write("INFO: saving to file '%s'\n" % filename)
with open(filename, mode='w', encoding='utf-8') as f:
    pass
sys.stdout.write("INFO: save complete: %d emoji\n" % len(emojis))



7 changes: 0 additions & 7 deletions unicode/README.md

This file was deleted.

0 comments on commit a18077b

Please sign in to comment.