In [1]:
"""
TF Text works primarily with Unicode-encoded text.
"""
import tensorflow as tf
import numpy as np

In [2]:
# A Python str passed to tf.constant becomes a scalar string tensor that
# holds the UTF-8 encoded bytes of the text.
tf.constant("Thanks 😊")

2023-03-28 16:02:49.636905: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2023-03-28 16:02:49.683992: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300000000 Hz
2023-03-28 16:02:49.695024: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x560dd15549d0 executing computations on platform Host. Devices:
2023-03-28 16:02:49.695082: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version


<tf.Tensor: id=0, shape=(), dtype=string, numpy=b'Thanks \xf0\x9f\x98\x8a'>

In [3]:
# tf.string treats each byte string as an atomic unit, so strings of
# different lengths fit in one tensor: the string length is not part of
# the tensor's shape.
tf.constant(["You are", "welcome!"]).shape

TensorShape([2])

In [9]:
"""
Two ways to represent a Unicode string in TensorFlow:
1.string scalar - the sequence of code points is encoded using a known character encoding
2.int32 vector - each position contains a single code point
"""
# The text as a string scalar; a Python str is stored as its UTF-8 bytes.
text_utf8 = tf.constant("语言处理")
text_utf8

<tf.Tensor: id=19, shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [5]:
# The same text as a UTF-16-BE encoded string scalar. The text is encoded
# in Python first, so the tensor simply stores those bytes.
text_utf16be = tf.constant("语言处理".encode("UTF-16-BE"))
text_utf16be

<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [6]:
# The same text as a vector of Unicode code points, one integer per character.
text_chars = tf.constant(list(map(ord, "语言处理")))
text_chars

<tf.Tensor: id=4, shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

In [10]:
"""
TensorFlow provides operations to convert between these different representations:
1.tf.strings.unicode_decode: Converts an encoded string scalar 
                            to a vector of code points.
2.tf.strings.unicode_encode: Converts a vector of code points 
                            to an encoded string scalar.
3.tf.strings.unicode_transcode: Converts an encoded string scalar
                            to a different encoding.
"""
# Decode the UTF-8 string scalar into a vector of code points.
tf.strings.unicode_decode(text_utf8, input_encoding='UTF-8')

<tf.Tensor: id=23, shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

In [11]:
# Encode the code point vector back into a UTF-8 string scalar.
tf.strings.unicode_encode(text_chars, output_encoding='UTF-8')

<tf.Tensor: id=33, shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [13]:
# Transcode the UTF-8 string scalar into UTF-16-BE.
tf.strings.unicode_transcode(
    text_utf8,
    input_encoding='UTF-8',
    output_encoding='UTF-16-BE',
)

<tf.Tensor: id=34, shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [19]:
# A batch of Unicode strings, each one encoded to UTF-8 bytes.
batch_utf8 = [
    sentence.encode('UTF-8')
    for sentence in ['你好', '快爬', '我来到你的城市，走过你来时的路', '😊']
]

# Decoding a batch returns a tf.RaggedTensor: the innermost dimension has a
# different length per row, depending on how many characters each string has.
batch_chars_ragged = tf.strings.unicode_decode(batch_utf8, 'UTF-8')

# Show the code points of each sentence on its own line.
for sentence_chars in batch_chars_ragged.to_list():
    print(sentence_chars)

[20320, 22909]
[24555, 29228]
[25105, 26469, 21040, 20320, 30340, 22478, 24066, 65292, 36208, 36807, 20320, 26469, 26102, 30340, 36335]
[128522]


In [22]:
# Convert the RaggedTensor to a dense tensor; positions with no value are
# filled with default_value.
batch_chars_padded = batch_chars_ragged.to_tensor(default_value=-1)
batch_chars_padded.numpy()

array([[ 20320,  22909,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1],
       [ 24555,  29228,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1],
       [ 25105,  26469,  21040,  20320,  30340,  22478,  24066,  65292,
         36208,  36807,  20320,  26469,  26102,  30340,  36335],
       [128522,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1]],
      dtype=int32)

In [38]:
# Convert the RaggedTensor to a SparseTensor.
batch_chars_sparse = batch_chars_ragged.to_sparse()
# dense_shape holds the (rows, columns) size of the equivalent dense tensor.
nrows, ncols = batch_chars_sparse.dense_shape.numpy()

In [26]:
# Placeholder grid with the sparse tensor's dense shape; '-' marks a
# position that holds no value. Each row is built separately so rows do not
# share the same list object.
elements = [['-' for _ in range(ncols)] for _ in range(nrows)]
elements

[['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
 ['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
 ['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
 ['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-']]

In [34]:
# Write every stored sparse value into its (row, col) slot of the grid.
for (row, col), value in zip(batch_chars_sparse.indices.numpy(),
                             batch_chars_sparse.values.numpy()):
    elements[row][col] = str(value)

# Right-justify each cell to the widest entry so the columns line up.
max_width = max(len(cell) for line in elements for cell in line)
print('[%s]' % '\n '.join(
    '[%s]' % ', '.join(cell.rjust(max_width) for cell in line)
    for line in elements
))

[[ 20320,  22909,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -]
 [ 24555,  29228,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -]
 [ 25105,  26469,  21040,  20320,  30340,  22478,  24066,  65292,  36208,  36807,  20320,  26469,  26102,  30340,  36335]
 [128522,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -,      -]]


In [35]:
# When every string has the same length, a regular (rectangular) tensor of
# code points can be passed as the input.
tf.strings.unicode_encode(
    [
        [99, 97, 116],    # "cat"
        [100, 111, 103],  # "dog"
        [99, 111, 119],   # "cow"
    ],
    output_encoding='UTF-8',
)

<tf.Tensor: id=226, shape=(3,), dtype=string, numpy=array([b'cat', b'dog', b'cow'], dtype=object)>

In [36]:
# When the strings have varying lengths, pass a tf.RaggedTensor as the input.
tf.strings.unicode_encode(batch_chars_ragged, output_encoding='UTF-8')

<tf.Tensor: id=227, shape=(4,), dtype=string, numpy=
array([b'\xe4\xbd\xa0\xe5\xa5\xbd', b'\xe5\xbf\xab\xe7\x88\xac',
       b'\xe6\x88\x91\xe6\x9d\xa5\xe5\x88\xb0\xe4\xbd\xa0\xe7\x9a\x84\xe5\x9f\x8e\xe5\xb8\x82\xef\xbc\x8c\xe8\xb5\xb0\xe8\xbf\x87\xe4\xbd\xa0\xe6\x9d\xa5\xe6\x97\xb6\xe7\x9a\x84\xe8\xb7\xaf',
       b'\xf0\x9f\x98\x8a'], dtype=object)>

In [66]:
# For a tensor in padded or sparse format:
# 1. convert it to a tf.RaggedTensor
# 2. call tf.strings.unicode_encode
# Here the input is the sparse tensor.
tf.strings.unicode_encode(
    tf.RaggedTensor.from_sparse(batch_chars_sparse),
    output_encoding='UTF-8')

<tf.Tensor: id=808, shape=(4,), dtype=string, numpy=
array([b'\xe4\xbd\xa0\xe5\xa5\xbd', b'\xe5\xbf\xab\xe7\x88\xac',
       b'\xe6\x88\x91\xe6\x9d\xa5\xe5\x88\xb0\xe4\xbd\xa0\xe7\x9a\x84\xe5\x9f\x8e\xe5\xb8\x82\xef\xbc\x8c\xe8\xb5\xb0\xe8\xbf\x87\xe4\xbd\xa0\xe6\x9d\xa5\xe6\x97\xb6\xe7\x9a\x84\xe8\xb7\xaf',
       b'\xf0\x9f\x98\x8a'], dtype=object)>

In [67]:
# Padded format: convert to a tf.RaggedTensor first, with padding=-1 so the
# trailing -1 fill values are excluded from each row, then encode.
tf.strings.unicode_encode(
    tf.RaggedTensor.from_tensor(batch_chars_padded, padding=-1),
    output_encoding='UTF-8')

<tf.Tensor: id=880, shape=(4,), dtype=string, numpy=
array([b'\xe4\xbd\xa0\xe5\xa5\xbd', b'\xe5\xbf\xab\xe7\x88\xac',
       b'\xe6\x88\x91\xe6\x9d\xa5\xe5\x88\xb0\xe4\xbd\xa0\xe7\x9a\x84\xe5\x9f\x8e\xe5\xb8\x82\xef\xbc\x8c\xe8\xb5\xb0\xe8\xbf\x87\xe4\xbd\xa0\xe6\x9d\xa5\xe6\x97\xb6\xe7\x9a\x84\xe8\xb7\xaf',
       b'\xf0\x9f\x98\x8a'], dtype=object)>

In [68]:
thanks = 'Thanks 😊'.encode('UTF-8')

# Length in bytes ('BYTE' is the default unit).
num_bytes = tf.strings.length(thanks, unit='BYTE').numpy()

# Length in Unicode characters.
num_chars = tf.strings.length(thanks, unit='UTF8_CHAR').numpy()

print('{} bytes; {} UTF-8 characters'.format(num_bytes, num_chars))

11 bytes; 8 UTF-8 characters


In [70]:
"""
Character substrings:
tf.strings.substr: the `unit` argument determines how `pos` and `len` are interpreted
"""
# With the default byte unit, pos and len count bytes, so this returns only
# the first byte of the multi-byte emoji.
tf.strings.substr(thanks, pos=7, len=1, unit='BYTE').numpy()

b'\xf0'

In [69]:
# With unit='UTF8_CHAR', pos and len count characters, so the whole emoji
# is returned.
tf.strings.substr(thanks, pos=7, len=1,unit='UTF8_CHAR').numpy()

b'\xf0\x9f\x98\x8a'

In [71]:
"""
Split Unicode strings: 
operation splits unicode strings into substrings of individual characters
"""
# One substring per character; the emoji stays together as a single
# multi-byte element.
tf.strings.unicode_split(thanks, input_encoding='UTF-8').numpy()

array([b'T', b'h', b'a', b'n', b'k', b's', b' ', b'\xf0\x9f\x98\x8a'],
      dtype=object)

In [73]:
"""
Unicode scripts:
Each Unicode code point belongs to a script, which indicates the language / writing system it is used for (International Components for Unicode)
"""
# Look up the script code of each character via its code point.
words = ['好', 'Б']
tf.strings.unicode_script(tf.constant(list(map(ord, words))))

<tf.Tensor: id=929, shape=(2,), dtype=int32, numpy=array([17,  8], dtype=int32)>

In [74]:
# unicode_script can be applied directly to a RaggedTensor or a Tensor.
print(tf.strings.unicode_script(batch_chars_ragged))

<tf.RaggedTensor [[17, 17], [17, 17], [17, 17, 17, 17, 17, 17, 17, 0, 17, 17, 17, 17, 17, 17, 17], [0]]>
