# Codificación de Caracteres

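En este Jupyter Notebook se revisa el tema de la codificación de caracteres: cómo convertir entre cadenas de texto y bytes, y cómo detectar la codificación de un archivo antes de leerlo.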

In [1]:
# Smoke test: confirm the kernel is running and can execute a cell.
print("Hello World!")

Hello World!


### Bibliotecas a utilizar

In [2]:
# Third-party libraries used in this notebook
import pandas as pd
import numpy as np

# Character-encoding detection library (its detect() function is called further down)
import charset_normalizer

# Seed NumPy's global random number generator for reproducibility
np.random.seed(0)

In [3]:
# Start with an ordinary Python string that contains a non-ASCII character (the euro sign)
before = "This is the euro symbol: €"

# Show the object's type; the cell output is `str`
type(before)

str

In [4]:
# Turn the text into bytes using UTF-8. errors="replace" asks the codec to
# substitute a placeholder for any character it cannot encode instead of raising.
after = before.encode(encoding="utf-8", errors="replace")

# The result is no longer text: the cell output is `bytes`
type(after)

bytes

In [5]:
# Display the raw bytes; the euro sign shows up as the three-byte sequence \xe2\x82\xac
after

b'This is the euro symbol: \xe2\x82\xac'

In [6]:
# Decode the bytes back into text using the same encoding they were written
# with (UTF-8); the euro sign comes back intact.
print(after.decode(encoding="utf-8"))

This is the euro symbol: €


In [7]:
# Try to decode our UTF-8 bytes with the ASCII codec. ASCII only covers byte
# values 0-127, so the first byte of the euro sign (0xe2) cannot be decoded.
# The failure is the point of this cell, so catch it and print the message
# instead of letting the traceback stop a "Restart & Run All".
try:
    print(after.decode("ascii"))
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 25: ordinal not in range(128)

In [8]:
# Begin again from the original text
before = "This is the euro symbol: €"

# Encode to ASCII this time. The euro sign has no ASCII representation, so
# errors="replace" swaps it for the codec's placeholder character.
after = before.encode(encoding="ascii", errors="replace")

# Decode the ASCII bytes back into text and show the result
print(after.decode(encoding="ascii"))

# The bytes of the euro sign are gone for good: only the placeholder's byte
# was stored, so the original character cannot be recovered from `after`.

This is the euro symbol: ?


In [9]:
# Guess the file's character encoding from its first ten thousand bytes.
# The file is opened in binary mode so the bytes are read without being decoded.
with open("./data/ks-projects-201612.csv", mode="rb") as rawdata:
    head_bytes = rawdata.read(10000)

# Ask charset_normalizer for its best guess based on that sample
result = charset_normalizer.detect(head_bytes)

# Show the guess that was returned
print(result)

{'encoding': 'windows-1250', 'language': 'English', 'confidence': 1.0}


In [10]:
# Read the file with an explicit encoding. Per the original note, removing the
# encoding argument makes the read fail because the file is then read as UTF-8.
# NOTE(review): the detection cell above reported 'windows-1250', while this call
# passes 'Windows-1252' — confirm which encoding is actually intended.
kickstarter_2016 = pd.read_csv("./data/ks-projects-201612.csv", encoding='Windows-1252')

# Look at the first few rows
kickstarter_2016.head()

  kickstarter_2016 = pd.read_csv("./data/ks-projects-201612.csv", encoding='Windows-1252')


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


Bad pipe message: %s [b'+\xcfomQ\xcd\xe7%\xde5\xc8\xd0\xae\xd2a\x12"5 \xb3|\xec\xb8\xdf`|)\xb1\xed;d\xd0\x8b\x04y5y\xac\xb7\xbdO\x16\x9a\x9b\x9d\xc8\xeb\xee45\x13\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e']
Bad pipe message: %s [b'K2\x86i\xec\xab\x95\r>\xf8\x98Qc\xb0\xecx\x03\x83 \xb5_\xf9(\xa3\x03\x8b\xbf\x84\xceX\x01\xb3\x93\xd5\xa5\x88\xdc^\xfd\x10MJ\xc7Dl\x9c\xc1\xff\xe6\x042\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.', b'\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08']
Bad pipe message: %s [b'\x08\x08\t\x08\n\x08']
Bad pipe message: %s [b'\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pip