## Lib for Map-Reduce

In [3]:
# from typing import Iterable, Callable
from itertools import groupby
from collections.abc import Iterable, Callable


def flatten[T](inp: Iterable[Iterable[T]]) -> Iterable[T]:
    for it in inp:
        for item in it:
            yield item


def run_map[T, U](mapper: Callable[T, Iterable[U]], input_stream: Iterable[T]):
    return flatten(map(mapper, input_stream))


def run_reduce[T, U](reducer: Callable[Iterable[T], U],
                     input_stream: Iterable[T], 
                     key: [str]) -> Iterable[U]:
    def key_func(item):
        return tuple(getattr(item, k) for k in key)
    
    sorted_stream = sorted(input_stream, key=key_func)
    grouped_stream = groupby(sorted_stream, key=key_func)
    return flatten(map(lambda x: reducer(x[1]), grouped_stream))


class SimpleMapReduce:
    """Fluent wrapper that chains map and reduce stages over one stream."""

    def __init__(self, stream):
        # Current state of the pipeline; every stage replaces it.
        self._stream = stream

    def map(self, mapper):
        """Run ``mapper`` over the current stream via ``run_map``; return self."""
        mapped = run_map(mapper, self._stream)
        self._stream = mapped
        return self

    def reduce(self, reducer, key):
        """Group by ``key`` and run ``reducer`` via ``run_reduce``; return self."""
        reduced = run_reduce(reducer, self._stream, key)
        self._stream = reduced
        return self

    def output(self):
        """Return the stream as left by the most recent stage."""
        return self._stream

SyntaxError: expected '(' (1646027934.py, line 5)

In [None]:
from dataclasses import dataclass
import datetime

## DAU поиска с MapReduce на коленке

In [1]:
@dataclass
class UserEvent:
    """One parsed log record, as built by ``parse_user_event``."""
    user_id: str  # first tab-separated column of the log line
    moment: datetime.datetime  # second column, parsed with fromisoformat()
    action: str  # third column, kept verbatim
    value: float  # fourth column, converted with float()

@dataclass
class UserDate:
    """A (user, calendar day) pair, as emitted by ``user_event_to_user_date``."""
    user_id: str  # copied from the source event's user_id
    date: datetime.date  # date part of the source event's moment

@dataclass
class DateDAU:
    """Per-date counter record, as emitted by ``count_users_by_date``."""
    date: datetime.date  # the date the count belongs to
    dau: int  # number of items counted for that date

def parse_user_event(line: str) -> Iterable[UserEvent]:
    """Parse one tab-separated log line into zero or one ``UserEvent``.

    Args:
        line: A raw log line with the columns user id, ISO-format timestamp,
            action and numeric value, separated by tabs.

    Yields:
        The parsed event. Nothing is yielded for a blank line or for the
        header line (first column equal to ``userid``).

    Raises:
        IndexError: If a non-blank data line has fewer than four columns.
        ValueError: If the timestamp or the value cannot be parsed.
    """
    row = line.strip().split('\t')
    # A blank line splits into [''], which would crash on row[1] below;
    # skip it together with the header row.
    if not row[0] or row[0] == 'userid':
        return
    yield UserEvent(
        user_id=row[0],
        moment=datetime.datetime.fromisoformat(row[1]),
        action=row[2],
        value=float(row[3]),
    )
        
def user_event_to_user_date(event: UserEvent) -> Iterable[UserDate]:
    """Emit the single (user, calendar day) pair for ``event``.

    The day is the date part of ``event.moment``; the time of day is dropped.
    """
    user = event.user_id
    day = event.moment.date()
    yield UserDate(user_id=user, date=day)
    
def passive_sort_by_key(inp: Iterable[UserDate]) -> Iterable[UserDate]:
    """Yield only the first item of ``inp`` and ignore the rest.

    Nothing is yielded when ``inp`` is empty.
    """
    records = iter(inp)
    try:
        first = next(records)
    except StopIteration:
        return
    yield first
        
def count_users_by_date(inp: Iterable[UserDate]) -> Iterable[DateDAU]:
    """Count the items of ``inp`` and emit one ``DateDAU`` for them.

    The emitted record carries the ``date`` of the last item seen and the
    total number of items. An input that leaves ``date`` unset (for example
    an empty one) trips the assertion.
    """
    date = None
    count = 0
    # enumerate from 1 so that ``count`` ends up as the number of items seen.
    for count, record in enumerate(inp, start=1):
        date = record.date
    assert date is not None

    yield DateDAU(date=date, dau=count)


def process(mrjob: SimpleMapReduce) -> SimpleMapReduce:
    """Wire the daily-active-users pipeline onto ``mrjob``.

    Stages:
        1. Parse raw log lines into ``UserEvent`` records.
        2. Project every event to a ``UserDate`` (user, calendar day) pair.
        3. Keep one ``UserDate`` per (user_id, date) group, so that a user
           counts once per day.
        4. Count the remaining records per date into ``DateDAU``.

    Args:
        mrjob: A job whose stream yields raw tab-separated log lines.

    Returns:
        The same job object; its output yields one ``DateDAU`` per date.
    """
    return (
        mrjob
        .map(parse_user_event)
        .map(user_event_to_user_date)
        .reduce(passive_sort_by_key, key=["user_id", "date"])
        .reduce(count_users_by_date, key=["date"])
    )

NameError: name 'dataclass' is not defined

In [None]:
# Prints one empty line; looks like a leftover scratch cell — TODO confirm it is still needed.
print()

## Testing

In [None]:
# The job's stream is lazy, so it has to be consumed while the file is open.
with open("log.tsv", "r") as input_stream:
    mrjob = process(SimpleMapReduce(input_stream))
    for item in mrjob.output():
        # No record type in this file has a `month` field; DateDAU exposes
        # `date` and `dau`.
        print(item.date, item.dau)