# Exercise 0

## 0. User input

In [10]:
source_path = input("Enter a file path for your source data")
destination_path = input("Enter a file path for your destination data")

print(f"{'source:':<20} {source_path}")
print(f"{'destination:':<20} {destination_path}")


source:              /Users/aigineer/Documents/teaching repos/data_platform_course/exercises/test.csv
destination:         /Users/aigineer/Documents/teaching repos/data_platform_course/exercises/output.txt


## 1. Schema validation 

In [19]:
# a)
data = dict(id=101, name="Erika", is_active=True, age=45)
data

{'id': 101, 'name': 'Erika', 'is_active': True, 'age': 45}

In [20]:
# b)
schema = {"id": int, "name": str, "is_active": bool, "age": int}

all([isinstance(data[column], schema[column]) for column in schema])

True

In [21]:
# c)
data = [
    {"id": 102, "name": "Marcus", "is_active": True, "age": 34},
    {"id": 103, "name": "David", "is_active": False, "age": 29},
    {"id": 104, "name": "Anna", "is_active": True, "age": 41.5},
    {"id": 106, "name": "Ingrid", "is_active": "NOPE", "age": 8},
]

data

[{'id': 102, 'name': 'Marcus', 'is_active': True, 'age': 34},
 {'id': 103, 'name': 'David', 'is_active': False, 'age': 29},
 {'id': 104, 'name': 'Anna', 'is_active': True, 'age': 41.5},
 {'id': 106, 'name': 'Ingrid', 'is_active': 'NOPE', 'age': 8}]

In [22]:
# d) Schema validation of c)

def schema_validate(data, schema):
    return all([isinstance(data[column], schema[column]) for column in schema])

validated_records = [schema_validate(record, schema) for record in data]
print(validated_records)

print(f"The data is valid: {all(validated_records)}")

[True, True, False, False]
The data is valid: False


In [23]:
# e) 
# look at previous functions 

## 2. Check list length

In [24]:

def check_list_length(list_):
    if len(list_) != 10:
        raise ValueError(f"The provided list must be 10 in length and not {len(list_)}")

try:    
    check_list_length([1,2,3])
except ValueError as err:
    print(err)

The provided list must be 10 in length and not 3


In [25]:
check_list_length(list(range(10)))

## 3. Extract data from logs

In [37]:
with open("../data/network.log", "r") as logs: 
    logs = logs.readlines()

In [38]:
logs

['2024-06-01 09:00:00 | Source: 10.0.0.1 | Destination: 10.0.0.2 | Protocol: TCP | Bytes: 1024\n',
 '2024-06-01 09:05:00 | Source: 10.0.0.2 | Destination: 10.0.0.3 | Protocol: UDP | Bytes: 2048\n',
 '2024-06-01 09:10:00 | Source: 10.0.0.3 | Destination: 10.0.0.1 | Protocol: TCP | Bytes: 512\n']

In [39]:
protocols = [log.split("|")[-2][-4:-1] for log in logs]
bytes = [log.split("|")[-1][-5:-1] for log in logs]
bytes

['1024', '2048', ' 512']

In [42]:
from collections import defaultdict

network_data = defaultdict(int)

for protocol, byte in zip(protocols, bytes):
    network_data[protocol] += int(byte)

network_data

defaultdict(int, {'TCP': 1536, 'UDP': 2048})

next step is to format it into a string and join ...

## 4. Aggregating json data

In [32]:
# a)
import json

with open("../data/paid.json", "r") as file:
    json_data = json.load(file)

json_data[:5]

[{'name': 'Alice', 'paid': 100},
 {'name': 'Bob', 'paid': 200},
 {'name': 'Alice', 'paid': 150},
 {'name': 'Erik', 'paid': 180},
 {'name': 'Maja', 'paid': 220}]

In [33]:
from collections import defaultdict

grouped_data = defaultdict(int)

for record in json_data:
    grouped_data[record['name']] += record["paid"]

grouped_data

defaultdict(int,
            {'Alice': 745,
             'Bob': 640,
             'Erik': 570,
             'Maja': 650,
             'Oskar': 300,
             'Elsa': 320,
             'Liam': 380,
             'Astrid': 190,
             'Freja': 620,
             'Hugo': 250,
             'Nils': 130,
             'Wilma': 200,
             'Alexander': 260,
             'Clara': 280,
             'Selma': 175,
             'Elias': 190})

In [34]:
with open("payment_sum.json", "w") as file:
    json.dump(grouped_data, file)

## 5 Simulating data streams

In [35]:
import time 

simualated_stream = ["record1", "record2", "record3", "record4", "STOP", "record5"]

for chunk in simualated_stream:
    if chunk == "STOP":
        print("STOP signal encountered. Ending stream processing.")
        break

    print(f"Processed: {chunk}")
    time.sleep(1)

print("Data stream processing completed. Have a nice day")


Processed: record1
Processed: record2
Processed: record3
Processed: record4
STOP signal encountered. Ending stream processing.
Data stream processing completed. Have a nice day
