# Seminar 2: Practical Error Handling & I/O

## Working with files

### Reading

In [50]:
# Bad example (outdated)
f = open('tmp_data.json', 'r')
content = f.read() # Reads entire content of the file
print(content)
f.close()

{
  "a": "1",
  "b": "2"
}


In [51]:
# Good example
with open('tmp_data.json', 'r') as f:
    content = f.read() # Reads entire content of the file
print(content)

{
  "a": "1",
  "b": "2"
}


In [21]:
with open('tmp_data.json', 'r') as f:
    print(f)
    print(f"{f.name=}, {f.mode=}, {f.encoding=}")

<_io.TextIOWrapper name='tmp_data.json' mode='r' encoding='UTF-8'>
f.name='tmp_data.json', f.mode='r', f.encoding='UTF-8'


In [20]:
with open('tmp_data.json', 'r') as f:
    line = f.readline()
    print(len(line))
    line = f.readline()
    print(len(line))
    line = f.readline()
    print(len(line))
    line = f.readline()
    print(len(line))
    line = f.readline()
    print(len(line))
    line = f.readline()
    print(len(line))
    line = f.readline() # This can go indefinitely
    print(len(line))

2
14
13
1
0
0
0


In [18]:
with open('tmp_data.json', 'r') as f:
    while len(line := f.readline()) > 0:
        print(line)

{

    "a": "1",

    "b": "2"

}


In [34]:
with open('tmp_data.json', 'r') as f:
    for line in f.readlines():
        print(line.replace('\n', '\\n'))
    
    print('--------------------')
    for line in f.readlines():
        print('This will never be printed')
        print(line)
    f.seek(0)
    for line in f.readlines():
        print('Now this will be printed')
        print(line.replace('\n', '\\n'))

{\n
    "a": "1",\n
    "b": "2"\n
}
--------------------
Now this will be printed
{\n
Now this will be printed
    "a": "1",\n
Now this will be printed
    "b": "2"\n
Now this will be printed
}


In [95]:
import json

# Better way to load JSONs
with open('tmp_data.json', 'r') as f:
    data = json.load(f)
print(data)

{'a': '1', 'b': '2'}


### Writing

In [23]:
with open('tmp_data.json', 'w') as f: # This erases the content of the file
    pass

with open('tmp_data.json', 'r') as f:
    content = f.read()
print(content)
print(len(content))


0


In [94]:
# This is how you can write arbitrary data to the file
with open('tmp_data.json', 'w') as f:
    f.write('{\n')
    f.write('    "a": "1",\n')
    f.write('    "b": "2"\n')
    f.write('}')

In [38]:
import json

# But for JSON files, you can make it more simple:
data = {
    "a": 1,
    "b": 2
}

print('json.dumps(data)')
print(json.dumps(data))
print()
print()

with open('tmp_data.json', 'w') as f:
    f.write(json.dumps(data)) # Acceptable

with open('tmp_data.json', 'r') as f:
    print(f.read())

with open('tmp_data.json', 'w') as f:
    json.dump(data, f) # More concise

with open('tmp_data.json', 'r') as f:
    print(f.read())

# Notice that now it's all in 1 line

json.dumps(data)
{"a": 1, "b": 2}


{"a": 1, "b": 2}
{"a": 1, "b": 2}


### Appending

In [39]:
with open('tmp_data.json', 'a') as f:
    f.write('\n')
    f.write(json.dumps({ 'c': 3, 'd': 4 }))

with open('tmp_data.json', 'r') as f:
    print(f.read())

{"a": 1, "b": 2}
{"c": 3, "d": 4}


## Exceptions

### Examples

In [4]:
def some_complex_function(x: int) -> None:
    a = 1 / x
    ...
    return

some_complex_function(0)

ZeroDivisionError: division by zero

In [25]:
obj: str | None = None

if some_long_expression := False:
    obj = "string"

print(len(obj))

TypeError: object of type 'NoneType' has no len()

In [19]:
a = "1"
print(a + "2")

a = 1
print(a + "2")

12


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [20]:
my_dict = { 'a' : 1, 'b': 2, 'c': 3 }

print(my_dict['d'])

KeyError: 'd'

In [21]:
my_list = [1, 2, 3, 4, 5]
print(my_list[5])

IndexError: list index out of range

In [23]:
num = int("42")
print(num)
another_num = int("42s")
print(another_num)

42


ValueError: invalid literal for int() with base 10: '42s'

In [24]:
def infinite_recursion():
    return infinite_recursion()

infinite_recursion()

RecursionError: maximum recursion depth exceeded

In [52]:
with open('non-existing-file.txt', 'r') as f:
    content = f.readline()
    print(content)

FileNotFoundError: [Errno 2] No such file or directory: 'non-existing-file.txt'

In [40]:
with open('non-existing-file.txt', 'w') as f:
    print(f.read())

UnsupportedOperation: not readable

In [41]:
with open('tmp_data.json', 'r') as f:
    f.write('\n')

UnsupportedOperation: not writable

In [53]:
import os
os.remove('non-existing-file.txt')

FileNotFoundError: [Errno 2] No such file or directory: 'non-existing-file.txt'

In [45]:
# Very common
def future_method():
    raise NotImplementedError()

future_method()

NotImplementedError: 

### try..except..else..finally

In [44]:
import builtins
[builtin for builtin in dir(builtins) if builtin.endswith('Error') or builtin.endswith('Exception')]

# This shows all builtin exceptions

['ArithmeticError',
 'AssertionError',
 'AttributeError',
 'BaseException',
 'BlockingIOError',
 'BrokenPipeError',
 'BufferError',
 'ChildProcessError',
 'ConnectionAbortedError',
 'ConnectionError',
 'ConnectionRefusedError',
 'ConnectionResetError',
 'EOFError',
 'EnvironmentError',
 'Exception',
 'FileExistsError',
 'FileNotFoundError',
 'FloatingPointError',
 'IOError',
 'ImportError',
 'IndentationError',
 'IndexError',
 'InterruptedError',
 'IsADirectoryError',
 'KeyError',
 'LookupError',
 'MemoryError',
 'ModuleNotFoundError',
 'NameError',
 'NotADirectoryError',
 'NotImplementedError',
 'OSError',
 'OverflowError',
 'PermissionError',
 'ProcessLookupError',
 'RecursionError',
 'ReferenceError',
 'RuntimeError',
 'SyntaxError',
 'SystemError',
 'TabError',
 'TimeoutError',
 'TypeError',
 'UnboundLocalError',
 'UnicodeDecodeError',
 'UnicodeEncodeError',
 'UnicodeError',
 'UnicodeTranslateError',
 'ValueError',
 'ZeroDivisionError']

In [51]:
try:
    raise ValueError('some message')
except ValueError as e:
    print('Caught ValueError')
    print(e)
except Exception as e:
    print('Caught any other exception')
    print(e)
else: # Very rarely used in practice
    print('No exceptions were raised in the main block')
finally:
    print('This block will execute no matter what')

Caught ValueError
some message
This block will execute no matter what


In [49]:
try:
    raise NotImplementedError("not implemented")
except ValueError as e:
    print('Caught ValueError')
    print(e)
except Exception as e:
    print('Caught any other exception')
    print(e)
else:
    print('No exceptions were raised in the main block')
finally:
    print('This block will execute no matter what')

Caught any other exception
not implemented
This block will execute no matter what


In [50]:
try:
    pass
except ValueError as e:
    print('Caught ValueError')
    print(e)
except Exception as e:
    print('Caught any other exception')
    print(e)
else:
    print('No exceptions were raised in the main block')
finally:
    print('This block will execute no matter what')

No exceptions were raised in the main block
This block will execute no matter what


### Handling Exceptions

In [28]:
# Bad example
try:
    some_complex_function(0)
except Exception as e:
    print('CAUGHT EXCEPTION')
    print(e)
    
# Good example
try:
    some_complex_function(0)
except ZeroDivisionError as e:
    print('CAUGHT ZERO DIVISION EXCEPTION')
    print(e)
except Exception as e:
    print('CAUGHT UNEXPECTED EXCEPTION')
    raise

CAUGHT EXCEPTION
division by zero
CAUGHT ZERO DIVISION EXCEPTION
division by zero


In [44]:
import os

def bad_processing_func(x: int):
    with open('tmp.txt', 'w') as f:
        f.write('some temporary data required inside the function')
    
    b = 1 / x
    # Clearing temporary data at the end
    print('CLEANUP')
    os.remove('tmp.txt')
    return b

# Good example
def good_processing_func(x: int):
    with open('tmp.txt', 'w') as f:
        f.write('some text')
    try:
        b = 1 / x
        return b
    finally:
        # Executes even after the "return" statement
        # Always do try..finally when dealing with temporary resources
        print('CLEANUP')
        os.remove('tmp.txt')

In [45]:
bad_processing_func(1)
assert not os.path.exists('tmp.txt')

CLEANUP


In [46]:
try:
    bad_processing_func(0)
except ZeroDivisionError:
    pass
assert not os.path.exists('tmp.txt')

AssertionError: 

In [47]:
good_processing_func(1)
assert not os.path.exists('tmp.txt')

CLEANUP


In [48]:
try:
    good_processing_func(0)
except ZeroDivisionError:
    pass
assert not os.path.exists('tmp.txt')

CLEANUP


## Exercise 1: Parsing Product Distributors Data


In this exercise, our goal is to parse distributor information from mayco-distributors.csv file.
For each distributor, we need to parse:
- title
- street_number
- street_name
- city
- state_province
- country
- postal_code
- website
- latitude
- longitude

Whenever parsing random data files from the internet, be aware that they might be malformed. Good parser should skip malformed distributors and still parse the correct ones. 

In [55]:
import csv
from typing import NamedTuple


class Distributor(NamedTuple):
    title: str | None
    street_number: str | None
    street_name: str | None
    city: str | None
    state_province: str | None
    country: str | None
    postal_code: str | None
    website: str | None
    latitude: float | None
    longitude: float | None

In [56]:
# Bad example
def bad_parse_distributors(file_path: str) -> list[Distributor]:
    result = []
    with open(file_path, 'r') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)
        for row in csv_reader:
            distributor = Distributor(
                title=row[1],
                street_number=row[2],
                street_name=row[3],
                city=row[4],
                state_province=row[5],
                country=row[6],
                postal_code=row[7],
                website=row[8],
                latitude=float(row[16]),
                longitude=float(row[17])
            )
            result.append(distributor)
    return result

In [62]:
distributors = bad_parse_distributors("mayco-distributors.csv")

ValueError: could not convert string to float: ''

In [68]:
# Good example
def good_parse_distributors(file_path: str) -> list[Distributor]:
    result = []
    with open(file_path, 'r') as f:
        csv_reader = csv.reader(f)
        next(csv_reader)
        
        def parse_string(index: int) -> str | None:
            if len(row) <= index:
                return None
            value = row[index]
            if len(value) == 0:
                return None
            return value.strip()
        
        def parse_float(index: int) -> float | None:
            value_str = parse_string(index)
            if value_str is None:
                return None
            try:
                return float(value_str)
            except ValueError:
                return None
        
        for row in csv_reader:
            distributor = Distributor(
                title=parse_string(1),
                street_number=parse_string(2),
                street_name=parse_string(3),
                city=parse_string(4),
                state_province=parse_string(5),
                country=parse_string(6),
                postal_code=parse_string(7),
                website=parse_string(8),
                latitude=parse_float(16),
                longitude=parse_float(17)
            )
            result.append(distributor)
        
    return result

In [69]:
# Bonus: really good example
from typing import Generator

def really_good_parse_distributors(file_path: str) -> Generator[Distributor, None, None]:
    with open(file_path, 'r') as f:
        csv_reader = csv.reader(f)
        next(csv_reader)
        
        def parse_string(index: int) -> str | None:
            if len(row) <= index:
                return None
            value = row[index]
            if len(value) == 0:
                return None
            return value.strip()
        
        def parse_float(index: int) -> float | None:
            value_str = parse_string(index)
            if value_str is None:
                return None
            try:
                return float(value_str)
            except ValueError:
                return None
        
        for row in csv_reader:
            yield Distributor(
                title=parse_string(1),
                street_number=parse_string(2),
                street_name=parse_string(3),
                city=parse_string(4),
                state_province=parse_string(5),
                country=parse_string(6),
                postal_code=parse_string(7),
                website=parse_string(8),
                latitude=parse_float(16),
                longitude=parse_float(17)
            )

In [73]:
distributors = good_parse_distributors('mayco-distributors.csv')
distributors[:3]

[Distributor(title='Zeramics Importacao e Comercio', street_number=None, street_name=None, city='Sao Paulo', state_province=None, country='Brazil', postal_code='01457-060', website='https://www.zeramics.com.br/', latitude=-23.5736958, longitude=-46.6929462),
 Distributor(title='Your Ceramic Store', street_number='Silicon Oasis, SIT Tower, Office 602', street_name='P O Box 430090', city='Dubai', state_province=None, country='United Arab Emirates', postal_code=None, website='https://yourceramicstore.com/', latitude=25.11987, longitude=55.3867),
 Distributor(title='Yaro975', street_number=None, street_name=None, city='Montevideo', state_province=None, country='Uruguay', postal_code='11100', website='https://yaro975.com.uy', latitude=-34.90652, longitude=-56.19418)]

## Exercise 2: Validating Distributors


So, we have parsed all distributors. Now it's time to validate them.
Suppose, you only care about the following fields:
- title
- website (must be valid url)
- latitude, longitude (must be valid coordinates)

The goal is to write validating function that will accept distributor and raise custom exception, which indicates error in one of the required fields. Then, save validated distributors to JSON.

In [74]:
from urllib.parse import urlparse


class ValidationError(Exception):
    def __init__(self, message: str, field: str, value):
        super().__init__(f"{field}: {message} | Value: {value}")
        self.field = field
        self.value = value
    

def validate_distributor(distributor: Distributor) -> None:
    if distributor.title is None:
        raise ValidationError("is None", "title", distributor.title)
    
    if distributor.website is None:
        raise ValidationError("is None", "website", distributor.website)
    try:
        urlparse(distributor.website)
    except ValueError as e:
        raise ValidationError(f"is invalid url: {e}", "website", distributor.website)
    
    if distributor.latitude is None:
        raise ValidationError("is None", "latitude", distributor.latitude)
    if distributor.latitude > 90 or distributor.latitude < -90:
        raise ValidationError("outside valid range", "latitude", distributor.latitude)
    
    if distributor.longitude is None:
        raise ValidationError("is None", "longitude", distributor.longitude)
    if distributor.longitude > 180 or distributor.longitude < -180:
        raise ValidationError("outside valid range", "longitude", distributor.longitude)

In [75]:
def is_distributor_valid(distributor: Distributor) -> bool:
    try:
        validate_distributor(distributor)
        return True
    except ValidationError as e:
        print(e)
        return False
    
valid_distributors = list(filter(is_distributor_valid, distributors))
print(f"{len(valid_distributors)} out of {len(distributors)} distributors are valid")

website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
latitude: outside valid range | Value: 532.81729
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
latitude: is None | Value: None
website: is None | Value: None
website: is None | Value: None
website: is None | Value: None
274 out of 294 distributors are valid


In [76]:
import json
with open('valid_distributors.json', 'w') as f:
    json.dump(list(map(lambda x: x._asdict(), valid_distributors)), f)

## Exercise 3: Reducing the Number of Points in GeoJSON Polygon File.


In this exercise, your will be working with GeoJSON format.

GeoJSON is a simple json, but for displaying some objects on the map.
To display it, we will use folium package (display_folium function is already given, that does all the job).

In order to reduce number of points, we will select only points from the convex hull (https://en.wikipedia.org/wiki/Convex_hull) of the polygon. This is also already implemented in get_convex_hull function

So, we need to do the following:

1) Understand the structure of the GeoJSON file and extract the coordinates of the polygon
2) Compute the coordinates for the center of the polygon (required for visualization)
3) Call get_convex_hull and save updated polygon to GeoJSON format, and visualize the result.

In [89]:
import folium
from scipy.spatial import ConvexHull
import numpy as np

def display_folium(
        center_lat: float,
        center_lng: float,
        geojson_path: str
) -> None:
    m = folium.Map(
        location=[center_lat, center_lng], 
        zoom_start=16,
        zoom_control=False,
        dragging=False,
        touch_zoom=False,
        scrollWheelZoom=False,
        doubleClickZoom=False,
        boxZoom=False
    )
    folium.GeoJson(
        geojson_path
    ).add_to(m)
    return m


def get_convex_hull(points: list[tuple[float, float]]) -> list[tuple[float, float]]:
    points_np = np.array(points)
    hull = ConvexHull(points_np)
    return list(map(list, points_np[hull.vertices]))

In [79]:
import json

with open('geo.json', 'r') as f:
    geo = json.load(f)

In [80]:
geo

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {},
   'geometry': {'coordinates': [[[8.650428389537126, 53.17119270803312],
      [8.650295332158123, 53.17064769291278],
      [8.650073570430777, 53.170102670466264],
      [8.650095746603512, 53.16957093457904],
      [8.649541342283413, 53.16963740192517],
      [8.649053466482428, 53.1697038691683],
      [8.648011186361344, 53.169730456037286],
      [8.648011186361344, 53.16955764109747],
      [8.647989010188638, 53.169025898455686],
      [8.647745072287677, 53.16844097393982],
      [8.64743460586854, 53.16780286537423],
      [8.647212844141194, 53.1671381588618],
      [8.64719066796846, 53.16663297502603],
      [8.64714631562211, 53.16604801789262],
      [8.64701325858573, 53.16548964228136],
      [8.64714631562211, 53.164811604986795],
      [8.64823294808869, 53.164864784769435],
      [8.649474813765266, 53.164598885196625],
      [8.651736783389254, 53.164439344662014],
      [8.651936

In [81]:
points = geo['features'][0]['geometry']['coordinates'][0]

In [82]:
summ_lat = 0
summ_lng = 0
count = 0

for lng, lat in points:
    summ_lat += lat
    summ_lng += lng
    count += 1
    
center_lat = summ_lat / count
center_lng = summ_lng / count

center_lat, center_lng

(53.1678519464429, 8.651655792326954)

In [90]:
display_folium(center_lat, center_lng, 'geo.json')

In [91]:
import copy

new_geo = copy.deepcopy(geo)

new_geo['features'][0]['geometry']['coordinates'][0] = get_convex_hull(points)
new_geo

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {},
   'geometry': {'coordinates': [[[8.656526837050933, 53.16681909567589],
      [8.65526279520168, 53.16909236624355],
      [8.651492845830973, 53.17123258694642],
      [8.650428389537126, 53.17119270803312],
      [8.648011186361344, 53.169730456037286],
      [8.647212844141194, 53.1671381588618],
      [8.64701325858573, 53.16548964228136],
      [8.64714631562211, 53.164811604986795],
      [8.649474813765266, 53.164598885196625],
      [8.651736783389254, 53.164439344662014],
      [8.654863623752505, 53.16477172010656],
      [8.655706318657224, 53.16501102842997],
      [8.656172018286355, 53.16538328379559],
      [8.656504660877317, 53.16623414107889]]],
    'type': 'Polygon'}}]}

In [92]:
with open('geo_new.json', 'w') as f:
    json.dump(new_geo, f)

In [93]:
display_folium(center_lat, center_lng, 'geo_new.json')