# Imports

In [None]:
import json
# NOTE(review): `cgi` was deprecated in Python 3.11 and removed in 3.13, and
# print_environ_usage is not used in any cell shown here — this looks like an
# accidental auto-import; confirm and drop it so this cell keeps running.
from cgi import print_environ_usage

import pandas as pd
import inspect

# Built-in Functions

## Helpers

In [None]:
help(zip)

# Output
# Help on class zip in module builtins:
#
# class zip(object)
# |  zip(*iterables, strict=False) --> Yield tuples until an input is exhausted.
# |
# |     >>> list(zip('abcdefg', range(3), range(4)))
# |     [('a', 0, 0), ('b', 1, 1), ('c', 2, 2)]
# |
# |  The zip object yields n-length tuples, where n is the number of iterables
# |  passed as positional arguments to zip().  The i-th element in every tuple
# |  comes from the i-th iterable argument to zip().  This continues until the
# |  shortest argument is exhausted.
# |

In [None]:
dir(zip)

# Output
# ['__class__',
#  '__delattr__',
#  '__dir__',
#  '__doc__',
#  '__eq__',
#  '__format__',
#  '__ge__',
#   ....
#  '__next__',
#  '__setstate__',
#  '__sizeof__',
#  '__str__',
#  '__subclasshook__']

In [None]:
inspect.signature(pd.read_csv)

# Output
# <Signature (filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols: 'UsecolsArgType' = None, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters: 'Mapping[Hashable, Callable] | None' = None, true_values: 'list | None' = None, false_values: 'list | None' = None, skipinitialspace: 'bool' = False, skiprows: 'list[int] | int | Callable[[Hashable], bool] | None' = None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values: 'Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None' = None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool | lib.NoDefault' = <no_default>, skip_blank_lines: 'bool' = True, parse_dates: 'bool | Sequence[Hashable] | None' = None, infer_datetime_format: 'bool | lib.NoDefault' = <no_default>, keep_date_col: 'bool | lib.NoDefault' = <no_default>, date_parser: 'Callable | lib.NoDefault' = <no_default>, date_format: 'str | dict[Hashable, str] | None' = None, dayfirst: 'bool' = False, cache_dates: 'bool' = True, iterator: 'bool' = False, chunksize: 'int | None' = None, compression: 'CompressionOptions' = 'infer', thousands: 'str | None' = None, decimal: 'str' = '.', lineterminator: 'str | None' = None, quotechar: 'str' = '"', quoting: 'int' = 0, doublequote: 'bool' = True, escapechar: 'str | None' = None, comment: 'str | None' = None, encoding: 'str | None' = None, encoding_errors: 'str | None' = 'strict', dialect: 'str | csv.Dialect | None' = None, on_bad_lines: 'str' = 'error', delim_whitespace: 'bool | lib.NoDefault' = <no_default>, low_memory: 'bool' = True, memory_map: 'bool' = False, float_precision: "Literal['high', 'legacy'] | None" = None, 
storage_options: 'StorageOptions | None' = None, dtype_backend: 'DtypeBackend | lib.NoDefault' = <no_default>) -> 'DataFrame | TextFileReader'>

## Map

In [None]:
lst_str = ['1', '2', '3']
# Plain loops instead of comprehensions: print() is a side effect, and a
# comprehension would build (and the notebook would display) a throwaway
# [None, None, None] list.
for i in lst_str:
    print(type(i))

# Return an iterator that applies function to every item of iterable, yielding the results
lst_int = map(int, lst_str)
for i in lst_int:
    print(type(i))

# What is happening
# map iterates through every item in lst_str, calls the function int with the
# item as its argument, and yields the value returned by that call.
# A map object exhibits generator-like behavior: it is lazy and produces values on
# demand as you iterate over it.

# `__getitem__`

In [None]:
# When we access an object with square brackets, Python automatically calls its __getitem__ method.
# This is the approach that Pandas uses for the DataFrame accessor .loc[].
# Pandas .loc is a property that returns an indexer object (_LocIndexer) that implements
# __getitem__. This is very flexible because the method's parameter accepts any type of value,
# so it can be implemented to work like an array index accessor or something more complex.

class AccessBySquareBrackets:
    """Minimal demo of the subscription protocol: ``obj[key]`` calls ``__getitem__``."""

    def __getitem__(self, item):
        """Return a message echoing whatever was placed between the brackets."""
        return "You've passed {}".format(item)

obj = AccessBySquareBrackets()
print(obj['Alisson'])

# Iterators / Iterables

In [None]:
# An iterable is any object you can loop over. When you use `for i in x`, x is an iterable:
# Python calls iter(x) to obtain an iterator and then calls next() on it until StopIteration.
# The code below is an example of a custom object that is both an Iterable and an Iterator
# Iterable = Object that can return an iterator (implements __iter__)
# Iterator = Object that produces the next value on demand (implements __next__, and __iter__ returning itself)
class MyIterator:
    """Single-pass iterator that yields the characters of 'Alisson' one at a time."""

    def __init__(self):
        self.value = 'Alisson'
        self.current = 0  # index of the next character to hand out
        self.max = len(self.value)

    def __iter__(self):
        # The object is its own iterator, so iter(obj) simply returns obj.
        return self

    def __next__(self):
        # Guard clause: nothing left to yield once the cursor reaches the end.
        if not self.current < self.max:
            raise StopIteration

        letter = self.value[self.current]
        self.current += 1
        return letter


iterator = MyIterator()

for i in iterator:
    print(i, end='')

# File Handling

In [None]:
# NOTE: this handle is opened outside a `with` block so that a later cell can
# inspect it with help(file); it is not closed in the cells shown here, so call
# file.close() once you are done with it.
file = open('data/people.json')

In [None]:
help(file)

## Load JSON file

In [None]:
with open('data/people.json') as f:
    data = json.load(f)

In [None]:
data
# [{'name': 'Alice', 'age': 28},
#  {'name': 'Bob', 'age': 34},
#  {'name': 'Charlie', 'age': 22},
#  {'name': 'Diana', 'age': 31},
#  {'name': 'Ethan', 'age': 45},
#  {'name': 'Fiona', 'age': 27},
#  {'name': 'George', 'age': 38},
#  {'name': 'Hannah', 'age': 25},
#  {'name': 'Ian', 'age': 29},
#  {'name': 'Julia', 'age': 33}]

## Very large CSV file

In [None]:
import csv
import random
import time


def count_rows_above_threshold(filename, threshold=100):
    """Count the CSV rows whose ``amount`` column is strictly greater than a threshold.

    The file is read through ``csv.DictReader`` one row at a time and only a
    running count is kept, so the rows are never collected into a list.

    Args:
        filename: Path of a CSV file whose header row contains an ``amount`` column.
        threshold: Exclusive lower bound for ``amount``. Defaults to 100.

    Returns:
        The number of data rows for which ``float(row['amount']) > threshold``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the header has no ``amount`` column.
        ValueError: If an ``amount`` value cannot be parsed as a float.
    """
    # newline='' is what the csv module asks for, so that newlines embedded in
    # quoted fields are parsed correctly; the explicit encoding matches what
    # generate_test_csv writes instead of depending on the platform default.
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        return sum(1 for row in reader if float(row['amount']) > threshold)


def generate_test_csv(filename: str, num_rows=1000000):
    """Write a CSV of random purchase rows for benchmarking.

    The file gets the header ``id, name, amount, category, date`` followed by
    ``num_rows`` data rows with ids ``1..num_rows``. About 30% of the amounts
    are drawn from 100-1000 and the rest from 1-150; every date falls in
    July 2025. Progress messages are printed before and after writing.

    Args:
        filename: Destination path; an existing file is overwritten.
        num_rows: Number of data rows to write (header excluded).
    """
    header = ['id', 'name', 'amount', 'category', 'date']
    category_pool = ['Food', 'Entertainment', 'Shopping', 'Transportation', 'Utilities']

    def build_record(row_id):
        # The draws happen in a fixed order (branch pick, amount, category, day)
        # so a seeded run always produces the same file.
        is_high = random.random() < 0.3
        low, high = (100, 1000) if is_high else (1, 150)
        value = round(random.uniform(low, high), 2)
        category = random.choice(category_pool)
        day = random.randint(1, 31)
        return [row_id, f"User_{row_id}", value, category, f"2025-07-{day:02d}"]

    print(f"Generating {filename} with {num_rows:,} rows...")

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sink = csv.writer(out)
        sink.writerow(header)
        # The generator feeds writerows lazily, one record at a time.
        sink.writerows(build_record(n) for n in range(1, num_rows + 1))

    print(f"Generated {filename} successfully!")


file_path = 'data/huge_csv.csv'
size = 1000000

# Must be uncommented to create the test file
# generate_test_csv(file_path, size)

print("\nBenchmarking solutions:")
print("-" * 50)

print(f"\nTesting {file_path} ({size:,} rows):")

# Test your implementation
# perf_counter() is monotonic and high-resolution, so the measurement cannot be
# skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
try:
    result = count_rows_above_threshold(file_path)
except Exception as e:
    # Best-effort benchmark cell: report the failure (e.g. the CSV has not been
    # generated yet) instead of aborting the notebook.
    print(f"  Your solution: Error - {e}")
else:
    end_time = time.perf_counter()
    print(f"  Your solution: {result} rows > 100 ({end_time - start_time:.4f}s)")

# Date & Time

| Format String | Meaning          | Example |
| ------------- | ---------------- | ------- |
| `%Y`          | Year (4 digits)  | `2025`  |
| `%m`          | Month (2 digits) | `06`    |
| `%d`          | Day              | `24`    |
| `%H`          | Hour (24h)       | `19`    |
| `%M`          | Minute           | `00`    |
| `%S`          | Second           | `45`    |


In [None]:
from datetime import datetime, timedelta, date, UTC

# Current local time
now = datetime.now()
now

In [None]:
# Current UTC time
utc_now = datetime.now(UTC)
utc_now

In [None]:
# Current date only
today = date.today()
today

## String <> Datetime Conversion

In [None]:
dt = datetime.strptime("2025-06-24 19:00", "%Y-%m-%d %H:%M")
dt

# datetime.datetime(2025, 6, 24, 19, 0)

In [None]:
s = dt.strftime("%Y-%m-%d %H:%M")
s

# '2025-06-24 19:00'

## Add, subtract, time delta

In [None]:
yesterday = today - timedelta(days=1)
tomorrow = today + timedelta(days=1)

In [None]:
delta = timedelta(days=5, hours=0)
new_time = now + delta
new_time

# Data Structures

| **Data Structure**              | **Properties**                                                                                             | **When to Use**                                                                                           |
| ------------------------------- | ---------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- |
| **List** (`[]`)                 | - Ordered  <br> - Mutable  <br> - Allows duplicates  <br> - O(1) append <br> - O(n) search, insert, delete | - Ordered collection of items  <br> - Need to maintain sequence  <br> - Iteration and processing in order |
| **Tuple** (`()`)                | - Ordered <br> - Immutable <br> - Allows duplicates <br> - Faster than list                                | - Fixed collection of items <br> - Keys in dictionaries <br> - Function return of multiple values         |
| **Set** (`{1, 2}`)              | - Unordered <br> - Unique elements <br> - No duplicates <br> - O(1) lookup and add                         | - Removing duplicates <br> - Membership tests <br> - Set operations (union, intersection)                 |
| **Dict** (`{key: value}`)       | - Key-value pairs <br> - Insertion-ordered (3.7+) <br> - O(1) lookup by key <br> - Keys must be unique     | - Fast lookups <br> - Mapping relationships (id → value) <br> - Counting, grouping                        |
| **Deque** (`collections.deque`) | - Double-ended queue <br> - Fast append/pop both ends <br> - O(1) append/pop left and right                | - Queue or stack with fast performance <br> - Sliding windows, BFS                                        |
| **Heap** (`heapq`)              | - Min-heap by default <br> - O(log n) insert and extract <br> - Always access smallest element             | - Priority queues <br> - Scheduling <br> - Top K elements                                                 |

In plain words: <br/>
✅ List → For general ordered sequences <br/>
✅ Tuple → For fixed-size or immutable sequences <br/>
✅ Set → For unique items & fast "is this in?" tests <br/>
✅ Dict → For key-value mappings (extremely common in data engineering) <br/>
✅ Deque → For efficient queue or stack <br/>
✅ Heap → For "always get me the smallest (or largest) item fast"


## Lists

In [None]:
# Create
lst = [1, 2, 3]
lst
# [1, 2, 3]

In [None]:
# Add
lst.append(4)
lst
# [1, 2, 3, 4]

In [None]:
# Insert
lst.insert(1, 10)  # insert new item before index (index, new item)
lst
# [1, 10, 2, 3, 4]

In [None]:
# Remove
lst.remove(4)  # remove the value; ValueError if not found
lst

In [None]:
lst.sort()  # doesn't work if the list has values from different types
lst

In [None]:
lst.reverse()  # reverses in place; descending here only because the list was sorted just before (lst.sort(reverse=True) sorts descending directly)
lst

In [None]:
# comprehension
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]
newlist = [x for x in fruits if "a" in x]
newlist

# ['apple', 'banana', 'mango']

## Dicts

In [None]:
# Create
d = {'a': 1, 'b': 2}
d
# {'a': 1, 'b': 2}

In [None]:
# Add/Update
d['c'] = 3
d
# {'a': 1, 'b': 2, 'c': 3}

In [None]:
# Access
value = d.get('a', 0)  # or just d['a']
value

In [None]:
# Iterate
for k, v in d.items():
    print(k, v)

# a 1
# b 2
# c 3

In [None]:
# Comprehension to generator
keys = (k for k, v in d.items())
print(keys)
for k in keys:
    print(k)

# <generator object <genexpr> at 0x106a04ee0>
# a
# b
# c

In [None]:
# Comprehension to list
keys = [k for k, v in d.items()]
print(keys)
for k in keys:
    print(k)

# ['a', 'b', 'c']
# a
# b
# c

In [None]:
# Dict comprehension
squares = {x: x ** 2 for x in range(5)}
squares

## Sets

In [None]:
# Create
s = {1, 2, 3}
s

# {1, 2, 3}

In [None]:
# Add
s.add(4)
s

# {1, 2, 3, 4}

In [None]:
# Remove
s.discard(2)  # doesn't throw error if not found
s

# {1, 3, 4}

In [None]:
# Set operations
a = {1, 2, 3}
b = {3, 4, 5}
union = a | b
intersection = a & b
difference = a - b

print(union)  # {1, 2, 3, 4, 5}
print(intersection)  # {3}
print(difference)  # {1, 2}

## Tuples



In [None]:
# Create
t = (1, 2, 3)

# Unpack
a, b, c = t

# Useful in dict keys and as function return
# return (min_val, max_val)

In [None]:
# as dict keys
salaries = {}
salaries[('John', 'Smith')] = 10000.0
salaries[('John', 'Parker')] = 99999.0
salaries

# {('John', 'Smith'): 10000.0, ('John', 'Parker'): 99999.0}

In [None]:
for k, v in salaries.items():
    print(k[0], k[1], v)

# John Smith 10000.0
# John Parker 99999.0

## Queue and Stack

In [None]:
# Stack (LIFO) The same as list
stack = []
stack.append(10)
v = stack.pop()
print(v)  # 10
print(stack)  # []

In [None]:
from collections import deque

# Queue (FIFO)
queue = deque()
queue.append(10)
queue.append(20)
queue.append(30)
queue

# deque([10, 20, 30])

In [None]:
queue.popleft()  # 10
queue

# deque([20, 30])

In [None]:
queue.pop()  # 30
# Show the queue itself, as the popleft cell does, so the cell output matches
# the documented result below rather than the popped value.
queue

# deque([20])

# Data Transformations

## Filtering

In [None]:
young = [x for x in data if x['age'] < 40]
young

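# [{'name': 'Alice', 'age': 28},
#  {'name': 'Bob', 'age': 34},
#  {'name': 'Charlie', 'age': 22},
#  {'name': 'Diana', 'age': 31},
#  {'name': 'Fiona', 'age': 27},
#  {'name': 'George', 'age': 38},
#  {'name': 'Hannah', 'age': 25},
#  {'name': 'Ian', 'age': 29},
#  {'name': 'Julia', 'age': 33}]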

## Sorting

In [None]:
young_sorted = sorted(young, key=lambda x: x['age'])
young_sorted

# [{'name': 'Charlie', 'age': 22},
#  {'name': 'Hannah', 'age': 25},
#  {'name': 'Fiona', 'age': 27},
#  {'name': 'Alice', 'age': 28},
#  {'name': 'Ian', 'age': 29},
#  {'name': 'Diana', 'age': 31},
#  {'name': 'Julia', 'age': 33},
#  {'name': 'Bob', 'age': 34},
#  {'name': 'George', 'age': 38}]

## Grouping

In [None]:
def calculate_user_totals(purchase_records):
    """Sum the purchase amounts of each user.

    Args:
        purchase_records: Iterable of mappings, each with a ``user_id`` key
            and an ``amount`` key.

    Returns:
        A dict mapping every ``user_id`` seen to the sum of its amounts, with
        keys in order of first appearance. Empty input gives an empty dict.
    """
    running_totals = {}
    for record in purchase_records:
        buyer, spent = record['user_id'], record['amount']

        # First purchase seen for this user: start the total at this amount.
        if buyer not in running_totals:
            running_totals[buyer] = spent
            continue

        running_totals[buyer] += spent

    return running_totals


test_data = [
    {"user_id": 42, "amount": 199.90},
    {"user_id": 15, "amount": 75.50},
    {"user_id": 42, "amount": 99.90},
    {"user_id": 88, "amount": 250.00},
    {"user_id": 15, "amount": 74.50},
    {"user_id": 42, "amount": 25.00},
    {"user_id": 99, "amount": 300.75},
    {"user_id": 88, "amount": 89.99},
    {"user_id": 15, "amount": 120.00}
]

result = calculate_user_totals(test_data)
print("Your result:", result)

# Additional test cases
print("\nTesting edge cases:")

# Empty list
empty_result = calculate_user_totals([])
print("Empty list result:", empty_result)

# Single record
single_record = [{"user_id": 1, "amount": 100.0}]
single_result = calculate_user_totals(single_record)
print("Single record result:", single_result)

# Duplicate user_id with same amount
duplicate_test = [
    {"user_id": 10, "amount": 50.0},
    {"user_id": 10, "amount": 50.0}
]
duplicate_result = calculate_user_totals(duplicate_test)
print("Duplicate test result:", duplicate_result)

## Counting

In [None]:
# def count_event_types(logs):
#     logs_by_type_count = {}
#     for log in logs:
#         event = log['event'].lower()
#         if event in logs_by_type_count:
#             logs_by_type_count[event] += 1
#         else:
#             logs_by_type_count[event] = 1
#
#     return logs_by_type_count

# Most Pythonic version
def count_event_types(logs):
    """Tally log entries by event name, ignoring letter case.

    Args:
        logs: Iterable of mappings that each carry an ``event`` string.

    Returns:
        A ``collections.Counter`` keyed by the lower-cased event name.
    """
    from collections import Counter

    lowered_events = (entry['event'].lower() for entry in logs)
    return Counter(lowered_events)


# Test data to practice with
test_logs = [
    {"timestamp": "2025-07-05T10:00:00Z", "event": "click", "user_id": 1},
    {"timestamp": "2025-07-05T10:05:00Z", "event": "purchase", "user_id": 1},
    {"timestamp": "2025-07-05T10:10:00Z", "event": "view", "user_id": 2},
    {"timestamp": "2025-07-05T10:15:00Z", "event": "click", "user_id": 2},
    {"timestamp": "2025-07-05T10:20:00Z", "event": "click", "user_id": 3},
    {"timestamp": "2025-07-05T10:25:00Z", "event": "view", "user_id": 1},
    {"timestamp": "2025-07-05T10:30:00Z", "event": "purchase", "user_id": 3},
    {"timestamp": "2025-07-05T10:35:00Z", "event": "view", "user_id": 4},
    {"timestamp": "2025-07-05T10:40:00Z", "event": "click", "user_id": 4},
    {"timestamp": "2025-07-05T10:45:00Z", "event": "logout", "user_id": 2},
    {"timestamp": "2025-07-05T10:50:00Z", "event": "login", "user_id": 5},
    {"timestamp": "2025-07-05T10:55:00Z", "event": "view", "user_id": 5}
]

# Expected result for test_logs:
# {"click": 4, "purchase": 2, "view": 4, "logout": 1, "login": 1}

# Test your function
if __name__ == "__main__":
    result = count_event_types(test_logs)
    print("Your result:", result)

    # Additional test cases
    print("\nTesting edge cases:")

    # Empty list
    empty_result = count_event_types([])
    print("Empty list result:", empty_result)

    # Single event type
    single_event = [
        {"timestamp": "2025-07-05T10:00:00Z", "event": "click", "user_id": 1},
        {"timestamp": "2025-07-05T10:05:00Z", "event": "click", "user_id": 2}
    ]
    single_result = count_event_types(single_event)
    print("Single event type result:", single_result)

    # All different event types
    different_events = [
        {"timestamp": "2025-07-05T10:00:00Z", "event": "signup", "user_id": 1},
        {"timestamp": "2025-07-05T10:05:00Z", "event": "activation", "user_id": 2},
        {"timestamp": "2025-07-05T10:10:00Z", "event": "conversion", "user_id": 3}
    ]
    different_result = count_event_types(different_events)
    print("All different events result:", different_result)

    # Case sensitivity test
    case_test = [
        {"timestamp": "2025-07-05T10:00:00Z", "event": "Click", "user_id": 1},
        {"timestamp": "2025-07-05T10:05:00Z", "event": "CLICK", "user_id": 2},
        {"timestamp": "2025-07-05T10:10:00Z", "event": "click", "user_id": 3}
    ]
    case_result = count_event_types(case_test)
    print("Case sensitivity test result:", case_result)