## Openpyxl - Iterating

* Iterating through the data
* Convert data into dict
* Convert data into Python class (dataclass)

In [None]:
import datetime
from dataclasses import dataclass, asdict
import json

import openpyxl

In [None]:
fn = '../data/iris_short.xlsx'

wb = openpyxl.load_workbook(filename=fn)

In [None]:
wb.sheetnames

In [None]:
# work with the workbook's active worksheet
ws = wb.active

In [None]:
ws.title

* retrieve data

In [None]:
#1
ws["A1"]

In [None]:
ws["A1"].value

In [None]:
#2
# note: openpyxl numbers rows and columns starting at 1, matching spreadsheet notation
ws.cell(row=1, column=1)

In [None]:
ws.cell(row=1, column=1).value

### Iterating Through the Data
* slice
* get ranges
* generators
    * .iter_rows()
    * .iter_cols()

#### slice

In [None]:
# bounding range of the cells that contain data, returned as a range string
# (roughly the role of pandas' shape, but not a (rows, cols) tuple)
ws.dimensions

In [None]:
ws["A1:C3"]

In [None]:
ws["A1":"C3"]

In [None]:
ws["A1:C3"] == ws["A1":"C3"]

#### ranges

In [None]:
# get all cells from column A
ws["A"]

In [None]:
# get all cells for a range of columns
ws["A:B"]

In [None]:
# get all cells from a single row
ws[1]

In [None]:
# get all cells for a range of rows
ws[1:4]

#### Generators

In [None]:
# Walk rows 1-4 of columns 1-4; every iteration produces one tuple per row.
# iter_rows also accepts the optional boolean argument values_only
# (add "values_only": True to the bounds below to try it).
bounds = {"min_row": 1, "max_row": 4, "min_col": 1, "max_col": 4}
for row in ws.iter_rows(**bounds):
    print(row)

In [None]:
# Walk columns 1-4 of rows 1-4; every iteration produces one tuple per column.
# iter_cols also accepts the optional boolean argument values_only
# (add "values_only": True to the bounds below to try it).
bounds = {"min_row": 1, "max_row": 4, "min_col": 1, "max_col": 4}
for col in ws.iter_cols(**bounds):
    print(col)

#### Iterating through the whole dataset
(shortcuts to using .iter_rows() and .iter_cols() without any arguments)
* .rows
* .columns

In [None]:
for row in ws.rows:
    print(row)

In [None]:
for col in ws.columns:
    print(col)

### Manipulate data using Python's default data structures

Every iterator returns its results as tuples, so the next step is to transform them into other data structures.

* transform tuples to dict

Iterate over all the rows, pick the columns, and then store the values in a dict.

In [None]:
ws.dimensions

In [None]:
# header values: restrict the iteration to the first row only
header_range = ws.iter_rows(min_row=1, max_row=1, values_only=True)
for value in header_range:
    print(value)

In [None]:
# row values: start below the header row and keep columns 1-5
data_range = ws.iter_rows(min_row=2, min_col=1, max_col=5, values_only=True)
for value in data_range:
    print(value)

### Convert data into dict

In [None]:
# generate row_values

def iter_row_vals(ws, min_row, max_row, min_col, max_col):
    """Yield the cell values of each fully populated row in a range.

    Args:
        ws: Object exposing ``iter_rows(min_row, max_row, min_col, max_col)``
            that produces rows of cells with a ``.value`` attribute.
        min_row: First row of the range, forwarded to ``ws.iter_rows``.
        max_row: Last row of the range, forwarded to ``ws.iter_rows``.
        min_col: First column of the range, forwarded to ``ws.iter_rows``.
        max_col: Last column of the range, forwarded to ``ws.iter_rows``.

    Yields:
        list: The values of one row, in column order. A row is skipped
        entirely when any of its values is ``None``.
    """
    cell_rows = ws.iter_rows(min_row, max_row, min_col, max_col)
    for cells in cell_rows:
        values = [cell.value for cell in cells]
        # Identity check on purpose: 0, "" and False are legitimate values.
        if any(value is None for value in values):
            continue
        yield values

In [None]:
row_values = iter_row_vals(
    ws, min_row=1, max_row=5, min_col=1, max_col=5
)

In [None]:
row_values

In [None]:
list(row_values)

* example of the built-in `list.index` method

In [None]:
hv = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', None, None, None]

In [None]:
# index of the first occurrence of None
hv.index(None)

In [None]:
# generate one dict per data row, keyed by the header values

def iter_dicts(ws, min_row, max_row, min_col, max_col):
    """Yield one dict per data row, keyed by the header row's values.

    The first row produced by ``iter_row_vals`` for the given range is
    treated as the header; each following row is paired with it.

    Args:
        ws: Worksheet-like object, passed through to ``iter_row_vals``.
        min_row: First row of the range (expected to hold the header).
        max_row: Last row of the range.
        min_col: First column of the range.
        max_col: Last column of the range.

    Yields:
        dict: Maps each header value to the value in the same position of
        the row. ``zip`` stops at the shorter sequence, so row values
        beyond the last header are dropped. Nothing is yielded when the
        range produces no rows at all.
    """
    row_iter = iter_row_vals(ws, min_row, max_row, min_col, max_col)
    # A bare next() would raise StopIteration inside this generator, which
    # Python converts into RuntimeError (PEP 479). Use a sentinel instead so
    # an empty range simply yields nothing.
    raw_header_values = next(row_iter, None)
    if raw_header_values is None:
        return
    try:
        # index of the first occurrence of None
        first_blank = raw_header_values.index(None)
    except ValueError:
        # no blank header cell: keep every header
        header_values = raw_header_values
    else:
        # keep only the headers that precede the first blank cell
        header_values = raw_header_values[:first_blank]
    for row_values in row_iter:
        yield dict(zip(header_values, row_values))

In [None]:
all_my_data_as_dicts = list(
    iter_dicts(
        ws, min_row=1, max_row=5, min_col=1, max_col=5,
    )
)

In [None]:
all_my_data_as_dicts

### Convert data into Python classes

In [None]:
# zero-based positions of each field within a row tuple
(
    SEPAL_LENGTH,
    SEPAL_WIDTH,
    PETAL_LENGTH,
    PETAL_WIDTH,
    SPECIES,
) = range(5)

In [None]:
# good for fixed format (precise format)

@dataclass
class Iris:
    """One iris record: four measurements and a species label."""
    # measurements, annotated as float (dataclasses do not enforce the types)
    sepal_length: float
    sepal_width: float
    petal_length: float
    petal_width: float
    # species label, annotated as str
    species: str

In [None]:
# Build one Iris per worksheet row 2-5, picking each field by its column index.
iris_list = [
    Iris(
        sepal_length=row[SEPAL_LENGTH],
        sepal_width=row[SEPAL_WIDTH],
        petal_length=row[PETAL_LENGTH],
        petal_width=row[PETAL_WIDTH],
        species=row[SPECIES],
    )
    for row in ws.iter_rows(min_row=2, max_row=5, values_only=True)
]

In [None]:
iris_list

In [None]:
iris_list[0]

In [None]:
# TODO: Convert the dataclass instances to a dict