[Reference](https://medium.com/@mariusz_kujawski/python-for-data-engineering-6bd6140033d4)

In [1]:
# List
my_list = [1, 2, 3, 'a', 'b', 'c']
my_list.append(4)     # Adds 4 to the end of the list
my_list.remove('a')  # Removes the element 'a'

In [2]:
# Tuple
my_tuple = (1, 2, 3, 'a', 'b', 'c')
my_tuple

(1, 2, 3, 'a', 'b', 'c')

In [3]:
# Dictionary
thisdict = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

In [4]:
# Set
my_set = {1, 2, 3}
my_set.add(1)
my_set.add(1)
print(my_set)

{1, 2, 3}


In [5]:
# List Comprehension
# Without list comprehension
squares = []
for i in range(5):
    squares.append(i**2)
print(squares)

# With list comprehension
squares = []
squares = [i**2 for i in range(5)]
print(squares)

[0, 1, 4, 9, 16]
[0, 1, 4, 9, 16]


In [6]:
# Filter, Map, and Reduce
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Filter even numbers
even_numbers = list(filter(lambda x: x % 2 == 0, numbers))

In [7]:
def power2(val: int) -> int:
    return val*val

numbers = [1, 2, 3, 4, 5]

power_numbers = list(map(power2, numbers))
# OR
squared_numbers = list(map(lambda x: x**2, numbers))

In [8]:
from functools import reduce

words = ["apple", "banana", "orange", "apple", "grape", "banana"]

# Count the occurrences of each word
word_counts = reduce(lambda counts, word: {**counts, word: counts.get(word, 0) + 1}, words, {})

print(word_counts)
# Output: {'apple': 2, 'banana': 2, 'orange': 1, 'grape': 1}


numbers = [1, 2, 3, 4, 5]
product = reduce((lambda x, y: x * y), numbers)

{'apple': 2, 'banana': 2, 'orange': 1, 'grape': 1}


In [9]:
lista = ['a','b','c','d','e']
count = 0
for l in lista:
    print('Index:', count,' Value:', l)
    count+=1

Index: 0  Value: a
Index: 1  Value: b
Index: 2  Value: c
Index: 3  Value: d
Index: 4  Value: e


In [10]:
# Generators
text = """
transaction_id,user\r
1,aaa\r
\r
2,xx\r
3,ccc\r
\r
"""

def process_large_file(text):
        for line in text.split("\r"):
            # Process the line
            processed_line = line.strip().upper()

            if processed_line != "":
                # Yield the processed line
                yield processed_line

# Example usage

for processed_line in process_large_file(text):
    print(processed_line)

TRANSACTION_ID,USER
1,AAA
2,XX
3,CCC


In [11]:
# Decorators
def make_upper(function):
 def upper():
  f = function()
  print(f"this from orgin value: {f}")
  return f.upper()
 return upper

@make_upper #decorator
def helloworld():
 return "hello world"

print(helloworld())

this from orgin value: hello world
HELLO WORLD


In [12]:
import time

def retry(times, wait):

    def decorator(func):
        def newfn(*args, **kwargs):
            attempt = 0
            while attempt < times:
                try:
                    time.sleep(wait)
                    return func(*args, **kwargs)
                except Exception as e:
                    print(
                        'Exception thrown when attempting to run %s, attempt '
                        '%d of %d' % (func, attempt, times)
                    )
                    attempt += 1
            time.sleep(wait)
            return func(*args, **kwargs)
        return newfn
    return decorator

@retry(times=3, wait=2)
def get_from_rest():
    print('Try read data from rest API')

    raise ConnectionError ('Lack of connection')

get_from_rest()

Try read data from rest API
Exception thrown when attempting to run <function get_from_rest at 0x7b2dfbd5a3b0>, attempt 0 of 3
Try read data from rest API
Exception thrown when attempting to run <function get_from_rest at 0x7b2dfbd5a3b0>, attempt 1 of 3
Try read data from rest API
Exception thrown when attempting to run <function get_from_rest at 0x7b2dfbd5a3b0>, attempt 2 of 3
Try read data from rest API


ConnectionError: ignored

In [13]:
# Data Class
from dataclasses import dataclass


@dataclass #dataclass decorator
class cutomerD:
 name: str #Type Hints
 id: int
 surname: str

class customer:
 def __init__(self,name,aid,books):
  self.name = name
  self.id = aid
  self.surname = books

Obj1 = customer("Erick",1254,"Nowak")
Obj2 = customer("Erick",1254,"Nowak")

Obj3 = cutomerD("Erick",1254,"Nowak")
Obj4 = cutomerD("Erick",1254,"Nowak")


print("Difrence for debuging")
print(Obj1)
print(Obj3)

print("\nDifference in Equality Check")
print(Obj1==Obj2)
print(Obj3==Obj4)

Difrence for debuging
<__main__.customer object at 0x7b2e08c44d90>
cutomerD(name='Erick', id=1254, surname='Nowak')

Difference in Equality Check
False
True


In [14]:
from dataclasses import dataclass, astuple, asdict


@dataclass #dataclass decorator
class customer:
 name: str #Type Hints
 id: int
 surname: str


Obj1 = customer("Erick",1254,"Nowak")

print(astuple(Obj1))
print(asdict(Obj1))

# Output:
# ('Erick', 1254, 'Nowak')
# {'name': 'Erick', 'id': 1254, 'surname': 'Nowak'}


('Erick', 1254, 'Nowak')
{'name': 'Erick', 'id': 1254, 'surname': 'Nowak'}


In [15]:
# Concurrency vs. parallelism
import calendar
from concurrent.futures import ThreadPoolExecutor
import requests

def generate_dates(year, month):
    _, last_day = calendar.monthrange(year, month)

    dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, last_day + 1)]
    return dates

year = 2023
month = 10
result = generate_dates(year, month)
urls = []
for x in result:
    urls.append(f"http://api.nbp.pl/api/exchangerates/rates/a/gbp/{x}/")

def download_page(url):
    response = requests.get(url)
    return response.content

with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(download_page, urls))

for result in results:
    print(result)

b'404 NotFound - Not Found - Brak danych'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"190/A/NBP/2023","effectiveDate":"2023-10-02","mid":5.3183}]}'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"191/A/NBP/2023","effectiveDate":"2023-10-03","mid":5.3195}]}'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"192/A/NBP/2023","effectiveDate":"2023-10-04","mid":5.3492}]}'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"193/A/NBP/2023","effectiveDate":"2023-10-05","mid":5.3057}]}'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"194/A/NBP/2023","effectiveDate":"2023-10-06","mid":5.3195}]}'
b'404 NotFound - Not Found - Brak danych'
b'404 NotFound - Not Found - Brak danych'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no":"195/A/NBP/2023","effectiveDate":"2023-10-09","mid":5.2897}]}'
b'{"table":"A","currency":"funt szterling","code":"GBP","rates":[{"no"

In [16]:
# Multiprocessing Pools
from multiprocessing import Pool
import time

def square(x):
    time.sleep(1)
    print(x)
    return x * 2


numbers = [1, 2, 3, 4, 5]

if __name__ == '__main__':
    with Pool(processes=2) as pool:
        results = pool.map(square, numbers)
    print(results)

12

3
4
5
[2, 4, 6, 8, 10]


In [18]:
# Concurrency
import asyncio
import time

async def sql_command1():
    await asyncio.sleep(2)
    print("Query executed 1")
    return {"col1": 1, "col2": 2}


async def sql_command2():
    await asyncio.sleep(3)
    print("Query executed 2")
    return {"col1": 1, "col2": 2}

async def sql_command3():
    await asyncio.sleep(3)
    print("Query executed 3")
    return {"col1": 1, "col2": 2}


async def main():

 start_time = time.time()

 sql1 = asyncio.create_task(sql_command1())
 sql2 = asyncio.create_task(sql_command2())
 sql3 = asyncio.create_task(sql_command3())

 res1 = await sql1
 res2 = await sql2
 res3 = await sql3

 end_time = time.time()
 exec_time = end_time - start_time
 print(f"re1 {res1}")
 print(f"re2 {res2}")
 print(f"re3 {res3}")
 print(f"total time {exec_time:.2f}")

if __name__ == "__main__":
     asyncio.run(main())

In [21]:
import aiohttp
import asyncio
import calendar

def generate_dates(year, month):
    _, last_day = calendar.monthrange(year, month)

    dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, last_day + 1)]
    return dates

year = 2023
month = 10
result = generate_dates(year, month)
urls = []
for x in result:
    urls.append(f"http://api.nbp.pl/api/exchangerates/rates/a/gbp/{x}/")

async def download_page(session, url):
    async with session.get(url) as r:
        return await r.text()


async def main():
    async with aiohttp.ClientSession() as session:
        datas = await asyncio.gather(*[download_page(session, u) for u in urls])
        for x in datas:
            print(x)

if __name__ == "__main__":
    loop = asyncio.new_event_loop()
    loop.run_until_complete(main())

In [22]:
# Integration with Cloud Storage
import gcsfs
fs = gcsfs.GCSFileSystem()

dest = "gs://dxxx/clients/clients.csv"

with fs.open(dest,"wb") as file:
  file.write("hello;csv;file")

In [23]:
import adlfs


fs = adlfs.AzureBlobFileSystem(account_name=os.environ["AZURE_STORAGE_ACCOUNT_NAME"])
local_filename = "landing/clients.csv"

with fs.open(local_filename, "wb") as f:
  f.write("hello;csv;file")

In [24]:
import s3fs

fs = s3fs.S3FileSystem(key=mykey, secret=mysecretkey)
bucket = "my-bucket"


files = fs.ls(bucket)

with s3.open('my-bucket/my-file.txt', 'rb') as f:
  print(f.read())

In [26]:
!pip install faker
from faker import Faker

# Generate synthetic client data
def generate_client_data(num_clients=100000):
    clients = []
    for client_num in range(1, num_clients + 1):
        client = {
            "client_number": client_num,
            "name": fake.name(),
            "email": fake.email(),
            "phone_number": fake.phone_number(),
            "bulding_number": fake.building_number(),
            "street_name": fake.street_name(),
            "postcode": fake.postcode(),
            "city": fake.city(),
            "state": fake.state(),
            "birth_date": fake.date_of_birth(minimum_age=18, maximum_age=90).strftime('%Y-%m-%d'),
            "credit_card_number" : fake.credit_card_number(card_type='mastercard'),
        }
        clients.append(client)
    return clients

Collecting faker
  Downloading Faker-20.1.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-20.1.0


In [27]:
import psycopg2
import pandas as pds
from sqlalchemy import create_engine

engine= create_engine('postgresql+psycopg2://test:@127.0.0.1', pool_recycle=3600)


with engine.connect() as conn:
  dataFrame = pds.read_sql("select * from tab1", conn)
df = pd.read_excel('sample.xlsx')
df.to_sql('table_name', con=engine, if_exists='append', index= False)

In [28]:
# Unit Tests

import unittest

class ExampleTestSuite(unittest.TestCase):

    def test_import(self):
        self.assertTrue(True)

    def test_addition(self):
        self.assertEqual(1 + 2, 3)

    def test_subtraction(self):
        self.assertNotEqual(1 - 2, 0)

if __name__ == '__main__':
    unittest.main()

E
ERROR: /root/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/root/'

----------------------------------------------------------------------
Ran 1 test in 0.004s

FAILED (errors=1)


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [29]:
# Pytest

def test_import():
    assert True


def test_addition():
    assert 1 + 2 == 3


def test_subtraction():
    assert 1 - 2 != 0

In [30]:
# Mocking
# app.py

import requests
import json


def get_currency(url):
    response = requests.get(url)
    return response.text

def get_currency_rate():
    url = "http://api.nbp.pl/api/exchangerates/rates/a/gbp/2023-11-02/"
    currency = json.loads(get_currency(url))
    return currency["rates"][0]["mid"]


if __name__ == "__main__":
    print(get_currency_rate())

5.1135


In [32]:
import unittest
from unittest.mock import patch, MagicMock

class TestImports(unittest.TestCase):
    text = '{"table": "A", "currency": "funt szterling", "code": "GBP", "rates": [{"no": "212/A/NBP/2023", "effectiveDate": "2023-11-02", "mid": 5.1135}]}'

    @patch("app.get_currency")
    def test_import_currency(self, mock_get_currency):
        mock_get_currency.return_value = self.text
        self.assertEqual(get_currency_rate(), 5.1135)

    @patch("app.requests")
    def test_get_currency(self, mock_requests):
        mock_response = MagicMock()
        mock_response.status_code = 404
        mock_requests.get.return_value = mock_response

        self.assertEqual(get_currency("test"), '{"rates": [{"mid": ""}]}')


if __name__ == "__main__":
    unittest.main()

E
ERROR: /root/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/root/'

----------------------------------------------------------------------
Ran 1 test in 0.005s

FAILED (errors=1)


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [33]:
# Data Frame testing
# app.py

import polars as pl


def filter_dataframe(df: pl.DataFrame) -> pl.DataFrame:
    return df.filter(pl.col("col2") == "a")


def add_col_to_df(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns((pl.col("col1") ** 2).alias("col1_power"))


if __name__ == "__main__":
    df = pl.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
    print(add_col_to_df(df))

shape: (3, 3)
┌──────┬──────┬────────────┐
│ col1 ┆ col2 ┆ col1_power │
│ ---  ┆ ---  ┆ ---        │
│ i64  ┆ str  ┆ f64        │
╞══════╪══════╪════════════╡
│ 1    ┆ a    ┆ 1.0        │
│ 2    ┆ b    ┆ 4.0        │
│ 3    ┆ c    ┆ 9.0        │
└──────┴──────┴────────────┘


In [34]:
import pytest
import polars as pl

@pytest.fixture
def test_dataframe() -> pl.DataFrame:

    return pl.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})


def test_filter_dataframe(test_dataframe):
    df = filter_dataframe(test_dataframe)
    rows_count_b = df.filter(pl.col("col2") == "b").select(pl.count()).item()
    rows_count_c = df.filter(pl.col("col2") == "c").select(pl.count()).item()
    assert rows_count_b == 0
    assert rows_count_c == 0


def test_add_col_to_df(test_dataframe):

    actual_df = add_col_to_df(test_dataframe)

    assert actual_df.columns == ["col1", "col2", "col1_power"]
    assert actual_df.rows() == [(1, "a", 1), (2, "b", 4), (3, "c", 9)]