# Encoding specification and detection

## Read a file in Unicode UTF-8

In [None]:
# Show every non-blank line of the tweet file, decoded as UTF-8, together
# with its 1-based line number.
fname = "./data/tweet_01.txt"
with open(fname, encoding="utf-8") as fp:
    for cnt, line in enumerate(fp, start=1):
        if not line.strip():
            continue  # whitespace-only lines are counted but not printed
        print(f"line {cnt}: {line}")
            

## Read the same file in the wrong encoding
The results are not all that pretty...

In [None]:
# Re-read the file currently named by fname, this time decoding it as
# latin-1, and show each non-blank line with its 1-based line number.
with open(fname, encoding="latin-1") as fp:
    for cnt, line in enumerate(fp, start=1):
        if not line.strip():
            continue  # whitespace-only lines are counted but not printed
        print(f"line {cnt}: {line}")

## Read Chinese in latin-1 encoding
The results are incomprehensible.

In [None]:
# Decode the Chinese UDHR file as latin-1 and show each non-blank line with
# its 1-based line number.
fname = "./data/udhr_zh.txt"
with open(fname, encoding="latin-1") as fp:
    cnt = 0
    print(cnt)  # shows the counter's starting value before any line is read
    for cnt, line in enumerate(fp, start=1):
        if not line.strip():
            continue  # whitespace-only lines are counted but not printed
        print(f"line {cnt}: {line}")

## Read the same file in Unicode UTF-8
The results are catastrophic!

In [None]:
# Attempt to decode the Chinese UDHR file as UTF-8 and show each non-blank
# line with its 1-based line number.
fname = "./data/udhr_zh.txt"
with open(fname, encoding="utf-8") as fp:
    cnt = 0
    print(cnt)  # shows the counter's starting value before any line is read
    for cnt, line in enumerate(fp, start=1):
        if not line.strip():
            continue  # whitespace-only lines are counted but not printed
        print(f"line {cnt}: {line}")

## Read the same file in the proper encoding
You get nice, readable traditional Chinese.

In [None]:
# Decode the Chinese UDHR file as Big5 and show each non-blank line with its
# 1-based line number.
fname = "./data/udhr_zh.txt"
with open(fname, encoding="Big5") as fp:
    cnt = 0
    print(cnt)  # shows the counter's starting value before any line is read
    for cnt, line in enumerate(fp, start=1):
        if not line.strip():
            continue  # whitespace-only lines are counted but not printed
        print(f"line {cnt}: {line}")

## If you don't know the encoding, detect it!

Install the detector first with `pip install chardet`.

In [None]:
import chardet

# Load the file as raw, undecoded bytes so the encoding can be detected from
# the byte patterns. The context manager closes the handle as soon as the
# read finishes instead of leaving it open until garbage collection.
fname = "./data/udhr_zh.txt"
with open(fname, "rb") as raw_fp:
    rawdata = raw_fp.read()
print(rawdata)

### detect the encoding

In [None]:
# Ask chardet to guess the encoding of the raw bytes; only the 'encoding'
# entry of the detection result is kept and displayed.
charenc = chardet.detect(rawdata)["encoding"]
print(charenc)

### decode the bytes with the detected encoding

In [None]:
# Decode with the encoding chardet reported (charenc) rather than a
# hard-coded name, so this step really uses the detection result.
# charenc may be None when detection fails, so fall back to Big5, the
# encoding this demo file is known to use.
# "backslashreplace" turns undecodable bytes into backslash escapes instead
# of raising UnicodeDecodeError.
decoded_data = rawdata.decode(charenc or "Big5", "backslashreplace")
print(decoded_data)

### re-encode to a different encoding

In [None]:
# Turn the decoded text back into bytes, this time using UTF-8.
utf8_bytes = decoded_data.encode(encoding="utf-8")
print(utf8_bytes)


In [None]:
# Run detection again on the re-encoded bytes and display the encoding
# chardet reports for them.
charenc = chardet.detect(utf8_bytes)["encoding"]
print(charenc)

## References

Python explanation of encodings and Unicode<br>
https://docs.python.org/3/howto/unicode.html

chardet usage examples<br>
https://chardet.readthedocs.io/en/latest/usage.html#example-using-the-detect-function<br>
https://pypi.org/project/chardet/