# 数据编码和处理

## 读写csv数据

In [1]:
import csv

# Column names, followed by one tuple per stock quote.
headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']
rows = [
    ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
    ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
    ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
]

# The header line and the data lines are emitted by a single writerows() call.
with open('data/stocks.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerows([headers, *rows])

In [2]:
headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']
# Each record is a dict keyed by column name, built by pairing the header
# names with the values of one quote.
rows = [
    dict(zip(headers, values))
    for values in (
        ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
        ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
        ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
    )
]

# DictWriter takes the column order from `headers`.
with open('data/stocks.csv', 'w', newline='') as f:
    f_csv = csv.DictWriter(f, fieldnames=headers)
    f_csv.writeheader()
    f_csv.writerows(rows)

In [3]:
import csv
# The csv module documentation asks for files given to csv.reader to be opened
# with newline='' so that line endings inside quoted fields are read correctly.
with open('data/stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    header = next(f_csv)  # consume the header row so only data rows are printed
    for row in f_csv:
        print(row)

['AA', '39.48', '6/11/2007', '9:36am', '-0.18', '181800']
['AIG', '71.38', '6/11/2007', '9:36am', '-0.15', '195500']
['AXP', '62.58', '6/11/2007', '9:36am', '-0.46', '935000']


In [4]:
from collections import namedtuple

# newline='' is the mode the csv module documentation asks for when reading.
with open('data/stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    header = next(f_csv)
    # The header row supplies the field names, so columns can be read by
    # attribute (row.Symbol) instead of by position.
    Row = namedtuple('Row', header)
    for r in f_csv:
        row = Row(*r)
        print(row.Symbol, row.Price)

AA 39.48
AIG 71.38
AXP 62.58


In [5]:
# newline='' is the mode the csv module documentation asks for when reading.
with open('data/stocks.csv', newline='') as f:
    # DictReader yields one mapping per data row, keyed by the header names.
    f_csv = csv.DictReader(f)
    for row in f_csv:
        print(row['Symbol'], row['Price'])

AA 39.48
AIG 71.38
AXP 62.58


## 读写JSON数据

In [6]:
import json

# Sample record; keyword arguments give the same keys in the same order.
data = dict(name='ACME', share=100, price=542.23)

# Serialize to a JSON string; the bare name makes the cell display it.
json_str = json.dumps(data)
json_str

'{"name": "ACME", "share": 100, "price": 542.23}'

In [7]:
# Parse the JSON text in `json_str` back into a Python dict.
data = json.loads(json_str)
data

{'name': 'ACME', 'share': 100, 'price': 542.23}

In [8]:
# Change one field, then write the dict to a JSON file on disk.
data['name'] = 'chun'
with open('data/data.json', 'w') as f:
    json.dump(data, f)

# Read the file back; `data` is rebound to the freshly decoded dict.
with open('data/data.json', 'r') as f:
    data = json.load(f)
data

{'name': 'chun', 'share': 100, 'price': 542.23}

## 解析简单的XML数据

In [9]:
from urllib.request import urlopen
from xml.etree.ElementTree import parse

# Fetch the feed inside a `with` block so the HTTP response is closed as soon
# as the whole document has been parsed into `doc`.
with urlopen('http://planet.python.org/rss20.xml') as u:
    doc = parse(u)

# Print title, publication date and link of every <item> under <channel>,
# followed by a blank line.
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')
    print(title)
    print(date)
    print(link)
    print()

PSF GSoC students blogs: Weekly Check In - 6
Wed, 15 Jul 2020 20:04:20 +0000
https://blogs.python-gsoc.org/en/adityaa30s-blog/weekly-check-in-6-12/

Ian Ozsvald: Weekish notes
Wed, 15 Jul 2020 19:20:59 +0000
https://ianozsvald.com/2020/07/15/weekish-notes-2/

Janusworx: A Hundred Days of Code, Day 008 - Python Basics, Lists, Tuples, Dictionaries, Sets and Done!
Wed, 15 Jul 2020 14:02:58 +0000
https://janusworx.com/blog/a-hundred-days-of-code-day-008-python-basics-lists-tuples-dictionaries-sets-and-done/

Real Python: Pandas Project: Make a Gradebook With Python &amp; Pandas
Wed, 15 Jul 2020 14:00:00 +0000
https://realpython.com/pandas-project-gradebook/

Catalin George Festila: Python 3.8.3 : Lists in Python 3 - part 001.
Wed, 15 Jul 2020 13:23:18 +0000
http://python-catalin.blogspot.com/2020/07/python-383-lists-in-python-3-part-001.html

Andriy Kornatskyy: Python Templates Benchmark
Wed, 15 Jul 2020 10:37:08 +0000
http://mindref.blogspot.com/2012/10/python-templates-benchmark.html

An

## 增量式解析大型XML文件

In [10]:
from xml.etree.ElementTree import iterparse

def parse_and_remove(filename, path):
    """Incrementally yield the elements found at `path`, discarding each afterwards.

    filename -- source passed straight to iterparse()
    path     -- '/'-separated tag names below the root element; the root's own
                tag is not part of the path

    Each matching element is yielded when its end event arrives and is removed
    from its parent once the consumer asks for the next item, so processed
    elements do not accumulate in the tree.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # The first event is the start of the root element. It is kept out of the
    # stacks (paths are relative to it) but remembered, because it is the
    # parent of any element matched by a one-component path.
    _, root = next(doc)

    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # With only the match itself on the stack, its parent is the root.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The root's end event arrives with empty stacks; nothing to pop then.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()

## 将字典转换为XML

In [11]:
from xml.etree.ElementTree import Element

def dict_to_xml(tag, d):
    """Return an Element named `tag` with one child per item of `d`.

    Each key becomes a child tag and str(value) becomes that child's text.
    """
    root = Element(tag)
    for name in d:
        node = Element(name)
        node.text = str(d[name])
        root.append(node)
    return root

# Example: build a <stock> element from a flat dict and serialize it to bytes.
s = { 'name': 'GOOG', 'shares': 100, 'price':490.1 }
e = dict_to_xml('stock', s)
from xml.etree.ElementTree import tostring
tostring(e)

b'<stock><name>GOOG</name><shares>100</shares><price>490.1</price></stock>'

In [12]:
# set() attaches an attribute to the element; it shows up in the serialized tag.
e.set('id', '1234')
tostring(e)

b'<stock id="1234"><name>GOOG</name><shares>100</shares><price>490.1</price></stock>'

## 解析和修改XML

In [13]:
from xml.etree.ElementTree import parse, Element

doc = parse('data/pred.xml')
root = doc.getroot()

# Remove the <sri> and <cr> children that find() locates under the root.
for unwanted in ('sri', 'cr'):
    root.remove(root.find(unwanted))

# Build a new <spam> element and insert it at index 2 of the root's children.
e = Element('spam')
e.text = 'This is a test'
root.insert(2, e)

# Save the modified tree to a new file, including the XML declaration.
doc.write('data/newpred.xml', xml_declaration=True)

## 与关系型数据库的交互

In [14]:
# Sample rows: one three-item tuple per holding, in the column order of the
# portfolio table created below (symbol, share, price).
stocks = [
    ('GOOG', 100, 490.1),
    ('AAPL', 50, 545.75),
    ('FB', 150, 7.45),
    ('HPQ', 75, 33.2),
]
import sqlite3
db = sqlite3.connect('data/database.db')
c = db.cursor()
# Drop any earlier copy of the table so re-running this cell starts clean.
c.execute('drop table if exists portfolio')
c.execute('create table portfolio (symbol text, share integer, price real)')
db.commit()

In [15]:
# Insert every sample row through one parameterized statement, then commit.
c.executemany('insert into portfolio values (?,?,?)',stocks)
db.commit()

In [16]:
# Run the query and print each row it yields.
for row in db.execute('select * from portfolio'):
    print(row)

('GOOG', 100, 490.1)
('AAPL', 50, 545.75)
('FB', 150, 7.45)
('HPQ', 75, 33.2)


In [17]:
# The threshold is passed as a bound parameter for the `?` placeholder rather
# than being formatted into the SQL text.
min_price = 100
for row in db.execute('select * from portfolio where price >= ?', (min_price,)):
    print(row)

('GOOG', 100, 490.1)
('AAPL', 50, 545.75)


## 编码和解码十六进制

In [20]:
import binascii

# Raw bytes to encode.
s = b'hello'

# Hexadecimal representation of `s`; the bare name makes the cell display it.
h = binascii.b2a_hex(s)
h

b'68656c6c6f'

In [21]:
# Decode the hex digits in `h` back to the original bytes.
binascii.a2b_hex(h)

b'hello'

In [22]:
# The base64 module's base-16 encoder; the output shown for this cell uses
# upper-case hex digits.
import base64
h = base64.b16encode(s)
h

b'68656C6C6F'

In [23]:
# Decode the base-16 text in `h` back to bytes.
base64.b16decode(h)

b'hello'

## 编码解码Base64数据

In [24]:
import base64

s = b'hello'
a = base64.b64encode(s)

# Show the encoded form first, then the result of decoding it again.
for shown in (a, base64.b64decode(a)):
    print(shown)

b'aGVsbG8='
b'hello'


## 读写二进制数组数据

In [26]:
from struct import Struct

def write_records(records, format, f):
    """Pack every record with the struct layout `format` and write it to `f`.

    records -- iterable of sequences whose items match the fields of `format`
    format  -- struct format string shared by all records
    f       -- object with a write() method that accepts bytes
    """
    pack = Struct(format).pack
    for fields in records:
        f.write(pack(*fields))

# Three sample records, each with the three values the '<idd' format packs.
records = [ 
    (1, 2.3, 4.5),
    (6, 7.8, 9.0),
    (12, 13.4, 56.7)
    ]
# Binary mode is required because write_records() writes packed bytes.
with open('data/data.b', 'wb') as f:
    write_records(records, '<idd', f)

In [27]:
def read_records(format, f):
    """Return a generator of tuples unpacked from fixed-size chunks of `f`.

    format -- struct format string describing one record
    f      -- binary file object; it is read one record at a time, only as the
              returned generator is consumed
    """
    record_struct = Struct(format)

    def _records():
        while True:
            chunk = f.read(record_struct.size)
            # An empty bytes result is the stop condition.
            if chunk == b'':
                return
            yield record_struct.unpack(chunk)

    return _records()

# read_records() returns a lazy generator, so the loop that consumes it must
# run while the file is still open.
with open('data/data.b', 'rb') as f:
    for rec in read_records('<idd', f):
        print(rec)

(1, 2.3, 4.5)
(6, 7.8, 9.0)
(12, 13.4, 56.7)


## 读取嵌套和可变长二进制数据

In [29]:
# Sample input: a list of polygons, each a list of (x, y) points.
polys = [
    [ (1.0, 2.5), (3.5, 4.0), (2.5, 1.5) ],
    [ (7.0, 1.2), (5.1, 3.0), (0.5, 7.5), (0.8, 9.0) ],
    [ (3.4, 6.3), (1.2, 0.5), (4.6, 9.2) ],
]

import struct
import itertools

def write_polys(filename, polys):
    """Write `polys` to `filename` as a header followed by sized polygon records.

    The header ('<iddddi') holds the code 0x1234, the bounding box of all
    points (min x, min y, max x, max y) and the polygon count. Each polygon
    follows as a '<i' byte count (which includes the 4 bytes of the count
    itself) and then its points packed as '<dd' pairs.
    """
    points = [pt for poly in polys for pt in poly]
    min_x = min(x for x, y in points)
    min_y = min(y for x, y in points)
    max_x = max(x for x, y in points)
    max_y = max(y for x, y in points)
    with open(filename, 'wb') as f:
        header = struct.pack('<iddddi', 0x1234, min_x, min_y, max_x, max_y, len(polys))
        f.write(header)
        for poly in polys:
            payload_size = len(poly) * struct.calcsize('<dd')
            f.write(struct.pack('<i', payload_size + 4))
            for pt in poly:
                f.write(struct.pack('<dd', *pt))

# Write the sample polygons to disk.
write_polys('data/poly.bin', polys)

In [32]:
def read_polys(filename):
    """Read a polygon file and return a list of polygons, each a list of (x, y) tuples.

    The file starts with a 40-byte '<iddddi' header whose last field is the
    polygon count. Every polygon is a '<i' byte count followed by 16-byte
    '<dd' points; the number of points read is that byte count floor-divided
    by 16.
    """
    with open(filename, 'rb') as f:
        header_fields = struct.unpack('<iddddi', f.read(40))
        num_polys = header_fields[-1]
        polys = []
        for _ in range(num_polys):
            (pbytes,) = struct.unpack('<i', f.read(4))
            polys.append(
                [struct.unpack('<dd', f.read(16)) for _ in range(pbytes // 16)]
            )
        return polys

# Read the polygons back; the cell displays the resulting nested list.
read_polys('data/poly.bin')

[[(1.0, 2.5), (3.5, 4.0), (2.5, 1.5)],
 [(7.0, 1.2), (5.1, 3.0), (0.5, 7.5), (0.8, 9.0)],
 [(3.4, 6.3), (1.2, 0.5), (4.6, 9.2)]]

In [35]:
import struct

class StructField:
    """Descriptor that unpacks one struct field from the owning instance's buffer.

    format -- struct format string for the field
    offset -- byte position of the field inside instance._buffer
    """
    def __init__(self, format, offset):
        self.format, self.offset = format, offset

    def __get__(self, instance, cls):
        # Access through the class returns the descriptor itself.
        if instance is None:
            return self
        values = struct.unpack_from(self.format, instance._buffer, self.offset)
        # A single-item format yields the bare value instead of a 1-tuple.
        if len(values) == 1:
            return values[0]
        return values

class Structure:
    """Base class holding the raw bytes that StructField descriptors read from."""
    def __init__(self, bytedata):
        # Stored as a memoryview under _buffer, the attribute StructField reads.
        self._buffer = memoryview(bytedata)

class PolyHeader(Structure):
    """Header of the polygon file, one StructField(format, byte offset) per field."""
    # Offsets follow the '<iddddi' header layout: a 4-byte int, four 8-byte
    # doubles and a final 4-byte int, 40 bytes in total.
    file_code = StructField('<i', 0)
    min_x = StructField('<d', 4)
    min_y = StructField('<d', 12)
    max_x = StructField('<d', 20)
    max_y = StructField('<d', 28)
    num_polys = StructField('<i', 36)

# Open the file in a `with` block so the handle is closed once the 40-byte
# header has been read; the header bytes are already in memory by then.
with open('data/poly.bin', 'rb') as f:
    phead = PolyHeader(f.read(40))
print(phead.file_code == 0x1234)
print(phead.min_x)
print(phead.min_y)
print(phead.max_x)
print(phead.max_y)

True
0.5
0.5
7.0
9.2


In [39]:
class StructureMeta(type):
    """Metaclass that turns a `_fields_` list into StructField descriptors.

    `_fields_` is a sequence of (format, fieldname) pairs. A format may start
    with one of the struct byte-order characters (@, =, <, >, !); that
    character then also applies to the following fields until another one is
    given. Each field is placed directly after the previous one, and the
    total size is stored on the class as `struct_size`.
    """
    def __init__(self, clsname, bases, clsdict):
        super().__init__(clsname, bases, clsdict)
        fields = getattr(self, '_fields_', [])
        byte_order = ''
        offset = 0
        for fmt, fieldname in fields:
            # '=' is included so that it is remembered and stripped like the
            # other prefixes instead of being glued behind an earlier one.
            if fmt.startswith(('<', '>', '!', '@', '=')):
                byte_order = fmt[0]
                fmt = fmt[1:]
            fmt = byte_order + fmt
            setattr(self, fieldname, StructField(fmt, offset))
            offset += struct.calcsize(fmt)
        setattr(self, 'struct_size', offset)

class Structure(metaclass=StructureMeta):
    """Base class for structures whose field descriptors are created by StructureMeta."""
    def __init__(self, bytedata):
        # Raw bytes that the generated field descriptors unpack from.
        self._buffer = bytedata
    
    @classmethod
    def from_file(cls, f):
        """Read cls.struct_size bytes from `f` and wrap them in an instance."""
        return cls(f.read(cls.struct_size))

class PolyHeader(Structure):
    """Polygon file header declared as (format, field name) pairs."""
    # Only the first entry carries a byte-order prefix; StructureMeta keeps
    # applying the most recent prefix to the entries that follow.
    _fields_ = [
        ('<i', 'file_code'),
        ('d', 'min_x'),
        ('d', 'min_y'),
        ('d', 'max_x'),
        ('d', 'max_y'),
        ('i', 'num_polys')
    ]

# The `with` block closes the file once the header bytes have been read.
with open('data/poly.bin', 'rb') as f:
    phead = PolyHeader.from_file(f)
print(phead.file_code == 0x1234)
print(phead.min_x)
print(phead.min_y)
print(phead.max_x)
print(phead.max_y)

True
0.5
0.5
7.0
9.2


In [40]:
class NestedStruct:
    """Descriptor exposing a slice of the owner's buffer as another structure.

    name        -- attribute name under which the result is stored on the instance
    struct_type -- class called with the sliced bytes; must provide `struct_size`
    offset      -- byte position of the nested structure inside instance._buffer
    """
    def __init__(self, name, struct_type, offset):
        self.name, self.struct_type, self.offset = name, struct_type, offset

    def __get__(self, instance, cls):
        # Access through the class returns the descriptor itself.
        if instance is None:
            return self
        start = self.offset
        stop = start + self.struct_type.struct_size
        nested = self.struct_type(instance._buffer[start:stop])
        # Storing the result on the instance under the same name means later
        # lookups find that attribute instead of rebuilding the object.
        setattr(instance, self.name, nested)
        return nested

class StructureMeta(type):
    """Metaclass that turns a `_fields_` list into field descriptors.

    `_fields_` is a sequence of (format, fieldname) pairs. When `format` is
    itself a class created by this metaclass, the field becomes a NestedStruct
    occupying that class's `struct_size` bytes. Otherwise `format` is a struct
    format string and the field becomes a StructField; a leading byte-order
    character (@, =, <, >, !) also applies to the string formats that follow
    until another one is given. Fields are laid out back to back and the total
    size is stored on the class as `struct_size`.
    """
    def __init__(self, clsname, bases, clsdict):
        super().__init__(clsname, bases, clsdict)
        fields = getattr(self, '_fields_', [])
        byte_order = ''
        offset = 0
        for fmt, fieldname in fields:
            if isinstance(fmt, StructureMeta):
                setattr(self, fieldname, NestedStruct(fieldname, fmt, offset))
                offset += fmt.struct_size
            else:
                # '=' is included so that it is remembered and stripped like
                # the other prefixes instead of being glued behind an earlier one.
                if fmt.startswith(('<', '>', '!', '@', '=')):
                    byte_order = fmt[0]
                    fmt = fmt[1:]
                fmt = byte_order + fmt
                setattr(self, fieldname, StructField(fmt, offset))
                offset += struct.calcsize(fmt)
        setattr(self, 'struct_size', offset)

class Structure(metaclass=StructureMeta):
    """Base class for structures whose field descriptors are created by StructureMeta."""
    def __init__(self, bytedata):
        # Raw bytes that the generated field descriptors read from.
        self._buffer = bytedata
    
    @classmethod
    def from_file(cls, f):
        """Read cls.struct_size bytes from `f` and wrap them in an instance."""
        return cls(f.read(cls.struct_size))

class Point(Structure):
    """Two consecutive 'd' fields named x and y; the '<' prefix of the first applies to both."""
    _fields_ = [
        ('<d', 'x'),
        ('d', 'y')
        ]
    
class PolyHeader(Structure):
    """Polygon file header whose bounding-box corners are nested Point structures."""
    # A class in the format position makes StructureMeta create a NestedStruct
    # for that field instead of a StructField.
    _fields_ = [
        ('<i', 'file_code'),
        (Point, 'min'), # nested struct
        (Point, 'max'), # nested struct
        ('i', 'num_polys')
        ]

# The `with` block closes the file once the header bytes have been read; the
# nested Point objects are built later from the bytes held by `phead`.
with open('data/poly.bin', 'rb') as f:
    phead = PolyHeader.from_file(f)
print(phead.file_code == 0x1234)
print(phead.min.x)
print(phead.min.y)
print(phead.max.x)
print(phead.max.y)

True
0.5
0.5
7.0
9.2


In [41]:
class SizedRecord:
    """A block of bytes whose length was given by a size prefix in the file."""
    def __init__(self, bytedata):
        self._buffer = memoryview(bytedata)

    @classmethod
    def from_file(cls, f, size_fmt, includes_size=True):
        """Read one size-prefixed record from `f`.

        size_fmt      -- struct format of the size prefix
        includes_size -- when true, the stored size counts the prefix bytes
                         too, so they are subtracted before reading the payload
        """
        prefix_len = struct.calcsize(size_fmt)
        (total,) = struct.unpack(size_fmt, f.read(prefix_len))
        payload_len = total - includes_size * prefix_len
        return cls(f.read(payload_len))

    def iter_as(self, code):
        """Yield the buffer's contents interpreted as consecutive items of `code`.

        A string is treated as a struct format and yields unpacked tuples; a
        class created by StructureMeta yields instances built from successive
        slices of `struct_size` bytes.
        """
        if isinstance(code, str):
            unpacker = struct.Struct(code)
            offsets = range(0, len(self._buffer), unpacker.size)
            yield from (unpacker.unpack_from(self._buffer, pos) for pos in offsets)
        elif isinstance(code, StructureMeta):
            step = code.struct_size
            offsets = range(0, len(self._buffer), step)
            yield from (code(self._buffer[pos:pos + step]) for pos in offsets)

# Everything that reads from the file happens inside the `with` block, so the
# handle is closed as soon as the header and all polygon records are in memory.
with open('data/poly.bin', 'rb') as f:
    phead = PolyHeader.from_file(f)
    polydata = [ SizedRecord.from_file(f, '<i') for n in range(phead.num_polys)]

# Each record is decoded as a run of '<dd' points.
for n, poly in enumerate(polydata):
    print('Polygon', n)
    for p in poly.iter_as('<dd'):
        print(p)

Polygon 0
(1.0, 2.5)
(3.5, 4.0)
(2.5, 1.5)
Polygon 1
(7.0, 1.2)
(5.1, 3.0)
(0.5, 7.5)
(0.8, 9.0)
Polygon 2
(3.4, 6.3)
(1.2, 0.5)
(4.6, 9.2)
