# Intro to Binary Data in Python

### The `bytes` type

In [1]:
# `bytes` is a built-in type, so printing the name shows its class
print(bytes)

<class 'bytes'>


In [2]:
# An empty bytes object is enough to inspect what the type offers
x = b''
# dir() lists every attribute and method available on the object
dir(x)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'center',
 'count',
 'decode',
 'endswith',
 'expandtabs',
 'find',
 'fromhex',
 'hex',
 'index',
 'isalnum',
 'isalpha',
 'isdigit',
 'islower',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [3]:
# Literals can be defined with any mixture of ASCII characters and hexadecimal escape sequences
x = b'hello world'  # plain ASCII characters
y = b'\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64'  # the same bytes, spelled as hex escapes
# The constructor also accepts a sequence of integer byte values
z = bytes([104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100])
# All three spellings produce the same value, so they print identically and compare equal
print(x, y, z, sep='\n')
print(x == y == z)

b'hello world'
b'hello world'
b'hello world'
True


In [4]:
# Iterating over a bytes object yields each byte's integer value
print(list(x))

[104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100]


## Converting data into useful Python objects

### Text data to and from bytes

In [5]:
# The final three bytes (\xe2\x98\xba) are the UTF-8 encoding of a single non-ASCII character
bytes_msg = b'hello world\xe2\x98\xba'
# decode() turns raw bytes into a str, given the encoding they were written in
string_msg = bytes_msg.decode('utf8')
print(string_msg)

# encode() goes the other way; the resulting bytes depend on the encoding chosen
print(string_msg.encode('utf8'), string_msg.encode('utf16'), sep='\n')

hello world☺
b'hello world\xe2\x98\xba'
b'\xff\xfeh\x00e\x00l\x00l\x00o\x00 \x00w\x00o\x00r\x00l\x00d\x00:&'


### Loading binary data from a file

In [6]:
# Open in binary mode ('rb') so read() returns raw bytes rather than decoded text
with open('image.bmp', 'rb') as bmp_file:
    bmp_data = bmp_file.read()
# Show only the first 500 bytes; the whole file is far too large to print
print(bmp_data[:500])

b'BMz{\x0c\x00\x00\x00\x00\x00z\x00\x00\x00l\x00\x00\x00\x80\x02\x00\x00\xaa\x01\x00\x00\x01\x00\x18\x00\x00\x00\x00\x00\x00{\x0c\x00\x13\x0b\x00\x00\x13\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00BGRs\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x07\x05\x02\x05\x03\x04\x03\x05\x04\x03\x05\x06\x04\x04\x06\x06\x06\x06\x06\x06\x00\x01\x05\x07\t\x13\x84\x88\x9a\xbf\xc1\xdf\xbb\xbd\xdf\xbc\xbd\xe3\xba\xbc\xdf\xbc\xbf\xde\xba\xc0\xdd\xb9\xbe\xdd\xb8\xc0\xde\xb9\xc1\xdf\xb9\xc1\xdf\xbb\xc0\xdf\xbb\xc0\xdf\xba\xbf\xde\xba\xbf\xde\xbc\xc0\xdd\xbe\xc2\xdf\xbe\xc2\xdf\xbd\xc1\xde\xbd\xc1\xde\xbe\xc2\xdf\xbf\xc3\xe0\xbf\xc3\xe0\xc1\xc4\xe3\xc0\xc3\xe2\xc0\xc4\xe1\xc1\xc5\xe2\xc1\xc5\xe2\xc1\xc5\xe2\xc1\xc5\xe1\xc0\xc4\xe0\xc0\xc3\xdf\xc0\xc3\xdf\xc1\xc5\xde\xc2\xc6\xdf\xc1\xc5\xde\xc1\

### Interpreting structured binary file data
#### BMP file format

*Source: http://www.ece.ualberta.ca/~elliott/ee552/studentAppNotes/2003_w/misc/bmp_file_format/bmp_file_format.htm*

| Section         | Size (# bytes) | Description |
|-----------------|----------------|-------------|
| File Header     | 2              | 'BM' file signature |
| -               | 4              | File size in bytes (32 bit integer) |
| -               | 4              | Reserved / unused |
| -               | 4              | Image data offset (32 bit integer) |
| Image Info      | 4              | Size of this header (32 bit integer; 40 for the basic info header, larger for newer header versions) |
| -               | 4              | Image width (32 bit integer) |
| -               | 4              | Image height (32 bit integer) |
| -               | 2              | # color planes (16 bit integer) |
| -               | 2              | Bits per pixel (16 bit integer, 1, 4, 8, 16, or 24) |
| -               | 4              | Compression type (32 bit integer, 0, 1, or 2) |
| -               | 4              | Compressed image size (32 bit integer, may be 0 when the image is uncompressed) |
| -               | 4              | X pixels per meter (32 bit integer) |
| -               | 4              | Y pixels per meter (32 bit integer) |
| -               | 4              | # Colors used (ignore) |
| -               | 4              | # Important colors (ignore) |
| Color Table     | ?              | - |
| Pixel Data      | ?              | - |


In [7]:
import struct

# Format string, read left to right:
#   <   little-endian byte order for every integer, with no padding between fields
#   2s  two consecutive raw bytes (the file signature)
#   I   standard-width unsigned integer (32 bits, always positive)
#   H   unsigned short integer (16 bits, always positive)
# For other format options see: https://docs.python.org/3/library/struct.html#format-strings
# NOTE(review): the BMP format defines width/height as signed values; 'I' reads them
# as unsigned - confirm that is acceptable for the files used here.
header_data = struct.unpack('<2sIIIIIIHHIIIIII', bmp_data[:54])

# File header: the first four fields (14 bytes)
file_sig, file_size, reserved, pixel_data_offset = header_data[:4]

# Image info header: the remaining eleven fields (40 bytes)
(
    info_header_size, width, height,
    num_planes, bits_per_pixel,
    compression_type, compressed_size,
    horizontal_resolution, vertical_resolution,
    colors_used, important_colors,
) = header_data[4:]


In [8]:
# Make sure this file says it's a bitmap image before trusting the other fields
assert file_sig == b'BM'
# unpack() has already converted the raw header bytes into a regular Python int
print(type(width))

<class 'int'>


In [9]:
# Summarize the decoded header fields in a human-readable form
print(f'''
File Size:    {file_size} bytes
Image Size:   {width}x{height}
BPP:          {bits_per_pixel}
Compression:  {compression_type}
Resolution:   {horizontal_resolution}x{vertical_resolution}
''')


File Size:    818042 bytes
Image Size:   640x426
BPP:          24
Compression:  0
Resolution:   2835x2835



In [10]:
# The header records where the pixel data starts, so slice from that offset to the end of the file
print(pixel_data_offset)
pixel_data = bmp_data[pixel_data_offset:]

122


In [11]:
# Invert colors: XOR with 0xFF flips all eight bits of a byte, so value v becomes 255 - v
inverted_pixels = bytes(value ^ 0xFF for value in pixel_data)

In [12]:
# Re-pack the header data.
# Only the 14-byte file header plus the 40-byte info header are written, so the
# pixel data starts immediately after them (any extra header bytes or color
# table present in the source file are not carried over).
header_format = '<2sIIIIIIHHIIIIII'
total_header_size = struct.calcsize(header_format)

# The pixel bytes are copied through at their original depth, so the new header
# has to describe that same depth instead of assuming 24 bits per pixel.
# Palette-based (<= 8 bpp) or compressed images cannot be handled by a simple
# byte-wise inversion, so fail loudly rather than write a corrupt file.
assert compression_type == 0 and bits_per_pixel > 8, 'expected an uncompressed, non-paletted bitmap'

header_bytes = struct.pack(
    header_format,
    b'BM',  # file signature
    len(inverted_pixels) + total_header_size,  # file size
    0,  # reserved
    total_header_size,  # pixel offset
    40,  # info header size
    width,
    height,
    1,  # num planes
    bits_per_pixel,  # must match the depth of the copied pixel data
    0,  # compression type
    0,  # compressed size
    horizontal_resolution,
    vertical_resolution,
    0,  # colors used
    0,  # important colors
)
print(header_bytes)

b'BM6{\x0c\x00\x00\x00\x00\x006\x00\x00\x00(\x00\x00\x00\x80\x02\x00\x00\xaa\x01\x00\x00\x01\x00\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13\x0b\x00\x00\x13\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'


In [13]:
# Write the rebuilt header followed by the inverted pixel data; 'wb' writes raw bytes
with open('inverted.bmp', 'wb') as bmp_file:
    bmp_file.write(header_bytes)
    bmp_file.write(inverted_pixels)

## Other encodings

In [14]:
import binascii

# hexlify() returns the hexadecimal representation as bytes: two hex digits per input byte
binary_string = b'hello world'
binascii.hexlify(binary_string)

b'68656c6c6f20776f726c64'

In [15]:
import base64

# Base64 represents arbitrary bytes using only printable ASCII characters
base64.b64encode(binary_string)

b'aGVsbG8gd29ybGQ='