## Task 1. Генерация бинарного файла

Бинарный файл состоящий из одного 32-разрядного числа весит 4 байта, т.к. 32 / 8 = 4. 

1Гб = 2^30 байт. 

Для того, чтобы получить файл размером в 2 Гб нужно 2^29 чисел.

In [1]:
import os
import sys
sys.path.insert(0, "/home/dmitry/Desktop/BigData/")

from multiprocessing import Pool
import mmap

import numpy as np
from src.utils.timer import timer

In [2]:
file_name = "binary_uint32.txt"
array_size = 2**29

def get_numbers_array(highest_number: int, array_size: int) -> np.ndarray:
    return np.random.randint(
        low=0,
        high=highest_number,
        size=array_size,
        dtype=np.dtype('uint32')
    ).newbyteorder("big")


def generate_file(file_name: str, array_size: int = 20, bit_depth: int = 32) -> None:
    numbers = get_numbers_array(2**bit_depth, array_size=array_size)
    with open(file_name, 'wb') as f:
        f.write(numbers.tobytes())


generate_file(file_name=file_name, array_size=array_size)

## Task 2. Найти сумму этих чисел, минимальное и максимальное число.

### Простое последовательное чтение 

In [3]:
def counter(array: np.ndarray) -> dict:
    sum_numbers = 0
    max_number = -float("inf")
    min_number = float("inf")

    for elem in array:
        sum_numbers += elem
        if elem > max_number:
            max_number = elem
        if elem < min_number:
            min_number = elem
    
    return {
        "sum_numbers": sum_numbers, 
        "max_number": max_number,
        "min_number": min_number,
    }

def counter_numpy(array: np.ndarray) -> dict:   
    return {
        "sum_numbers": np.sum(array, dtype="int64"), 
        "max_number": array.max(),
        "min_number": array.min(),
    }

In [4]:
@timer
def base_reading(file_name: str) -> dict:
    with open(file_name, "rb") as file:      
        buffer = file.read()
        tmp_array = np.frombuffer(buffer, dtype = np.dtype('uint32').newbyteorder('big'))

    return counter(tmp_array)

In [5]:
@timer
def base_reading_numpy(file_name: str) -> dict:
    with open(file_name, "rb") as file:      
        buffer = file.read()
        tmp_array = np.frombuffer(buffer, dtype = np.dtype('uint32').newbyteorder('big'))

    return counter_numpy(tmp_array)

## С использованием memory-mapped files

In [6]:
@timer
def mm_reading(file_name: str) -> dict:
    with open(file_name, "r+b") as file:
        mm = mmap.mmap(file.fileno(), length=array_size * 4, offset=0, access=mmap.ACCESS_READ)
        tmp_array = np.frombuffer(mm, dtype = np.dtype('uint32').newbyteorder('big'))

    return counter(tmp_array)

In [7]:
@timer
def mm_reading_numpy(file_name: str) -> dict:
    with open(file_name, "r+b") as file:
        file_size = os.path.getsize(file_name)
        mm = mmap.mmap(file.fileno(), length=file_size, offset=0, access=mmap.ACCESS_READ)
        tmp_array = np.frombuffer(mm, dtype = np.dtype('uint32').newbyteorder('big'))

    return counter_numpy(tmp_array)

## + С использованием многопоточности

In [8]:
@timer
def mm_reading_multithread(file_name: str, num_processes: int = 4) -> dict:
    with open(file_name, "r+b") as file:
        file_size = os.path.getsize(file_name)
        mm = mmap.mmap(file.fileno(), length=file_size, offset=0, access=mmap.ACCESS_READ)
        tmp_array = np.frombuffer(mm, dtype=np.dtype('uint32').newbyteorder('big'))

    chunks = np.array_split(tmp_array, num_processes)

    with Pool(processes=num_processes) as pool:
        results = pool.map(counter, chunks)

    sum_numbers = sum(result["sum_numbers"] for result in results)
    max_number = max(result["max_number"] for result in results)
    min_number = min(result["min_number"] for result in results)

    return {
        "sum_numbers": sum_numbers, 
        "max_number": max_number,
        "min_number": min_number,
    }

In [9]:
@timer
def mm_reading_multithread_numpy(file_name: str, num_processes: int = 4) -> dict:
    with open(file_name, "r+b") as file:
        file_size = os.path.getsize(file_name)
        mm = mmap.mmap(file.fileno(), length=file_size, offset=0, access=mmap.ACCESS_READ)
        tmp_array = np.frombuffer(mm, dtype=np.dtype('uint32').newbyteorder('big'))

    chunks = np.array_split(tmp_array, num_processes)

    with Pool(processes=num_processes) as pool:
        results = pool.map(counter_numpy, chunks)

    sum_numbers = sum(result["sum_numbers"] for result in results)
    max_number = max(result["max_number"] for result in results)
    min_number = min(result["min_number"] for result in results)

    return {
        "sum_numbers": sum_numbers, 
        "max_number": max_number,
        "min_number": min_number,
    }

## Сравнение результатов

Простое последовательное чтение

In [10]:
base_reading("binary_uint32.txt")

base_reading took 170.5219 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}

memory-mapped files без numpy

In [11]:
mm_reading("binary_uint32.txt")

mm_reading took 163.4505 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}

memory-mapped files + многопоточность без numpy

In [12]:
mm_reading_multithread("binary_uint32.txt")

mm_reading_multithread took 68.2823 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}

Последовательное чтение с использованием numpy

In [13]:
base_reading_numpy("binary_uint32.txt")

base_reading_numpy took 7.1793 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}

memory-mapped files c numpy

In [14]:
mm_reading_numpy("binary_uint32.txt")

mm_reading_numpy took 3.8582 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}

memory-mapped files + многопоточность с numpy

In [15]:
mm_reading_multithread_numpy("binary_uint32.txt")

mm_reading_multithread_numpy took 15.9251 secs


{'sum_numbers': 1152860411457095218, 'max_number': 4294967287, 'min_number': 6}