# ИД23-1 Маслов АН



## Пример 1. Построить заданную иерархию групп и добавить метаинформацию


In [92]:
import rand_csv
import multiprocessing as mp
import random
import pandas as pd
import os
import h5py
import numpy as np

In [93]:
with h5py.File('test.h5', 'w') as f:

    # Создаём иерархию групп
    grp_exp = f.create_group('Test')
    grp_run1 = grp_exp.create_group('Run1')
    grp_run2 = grp_exp.create_group('Run2')

    dt1 = np.random.rand(100, 100)
    dt2 = np.arange(1000).reshape(100, 10)

    dset1 = grp_run1.create_dataset('Data1', data=dt1)
    dset2 = grp_run2.create_dataset('Data2', data=dt2)
    f.visititems(print)

    # Добавление метаинформации
    f.attrs['author'] = 'Маслов А. Н.'
    f.attrs['description'] = 'Тест'
    grp_run1.attrs['time'] = 33.2
    dset1.attrs['process'] = 'two'

Test <HDF5 group "/Test" (2 members)>
Test/Run1 <HDF5 group "/Test/Run1" (1 members)>
Test/Run1/Data1 <HDF5 dataset "Data1": shape (100, 100), type "<f8">
Test/Run2 <HDF5 group "/Test/Run2" (1 members)>
Test/Run2/Data2 <HDF5 dataset "Data2": shape (100, 10), type "<i8">


## Пример 2. Найти в иерархии групп датасет с указаными свойствами

In [94]:

def find_dataset_with_attr(h5obj, attr_name, attr_value):
    '''Рекурсивный поиск датасета по атрибуту 
       принимает объект h5py (файл или группу), имя атрибута и его значение
       возвращает список путей к найденным датасетам
    '''
    found = []
    
    # Вспомогательная функция для рекурсивного обхода
    def visit_func(name, obj):
        if isinstance(obj, h5py.Dataset) and attr_name in obj.attrs:
            if obj.attrs[attr_name] == attr_value:
                found.append(name)
    h5obj.visititems(visit_func)
    return found

with h5py.File('test.h5', 'r') as f:
    results = find_dataset_with_attr(f, 'process', 'two')
    print("Найденные датасеты с unit='process':", results)

Найденные датасеты с unit='process': ['Test/Run1/Data1']


## Пример 3. Оптимизировать хранение данных с помощью блоков (chunk).

In [95]:
with h5py.File('test_compress.h5', 'w') as f:
    data = np.random.rand(1000, 1000)
    
    # создание датасета с рабитием на чанки и gzip-сжатие
    dset = f.create_dataset(
        'BigData',
        data=data,
        chunks=(100, 100),   # разбиваем на блоки 100x100
        compression='gzip',
        compression_opts=9   # уровень сжатия
    )

In [107]:
with h5py.File('test.h5', 'w') as f:
    data = np.random.rand(1000, 1000)
     # создание датасета без чанков и сжатия
    dset = f.create_dataset('BigData', data=data)

## Пример 4. Создать сжатые файлов из существующего, проверив размеры

In [108]:
size_orig = os.path.getsize('test.h5') 
size_comp = os.path.getsize('test_compress.h5')
print(f"Размер обычного файла: {size_orig}")
print(f"Размер сжатого файла: {size_comp}")

Размер обычного файла: 8002048
Размер сжатого файла: 7554701


## Пример 5. Запись двух DataFrame в один hdf5 файл с последующим считыванием и сравнением размеров

In [98]:
%%file rand_csv.py
import random
import time

#генератор случайной даты и числа
def rand_csv(length, pos, time_start, time_end, output): 
    csv_rows = []
    for i in range(length):
        stime = time.mktime(time.strptime(time_start, '%m/%d/%Y %I:%M %p'))
        etime = time.mktime(time.strptime(time_end, '%m/%d/%Y %I:%M %p'))
        ptime = stime + random.random() * (etime - stime)
        csv_rows.append([time.strftime('%y%m%d%H%M%S', time.localtime(ptime)), random.randint(1, 9999999), pos])
    output.put(csv_rows)

Overwriting rand_csv.py


In [99]:


output = mp.Queue()

#cоздаем процессы
processes = [mp.Process(target=rand_csv.rand_csv, args=(random.randint(50000,500000), x, "1/1/2008 1:30 PM", "1/1/2025 4:50 AM", output)) for x in range(4)] #создаем 4 процесса
for p in processes:
    p.start()

results = [output.get() for p in processes] #сохраняем результат

for p in processes:
    p.join()

pandas_result = []
for i in results:
    pandas_result = pandas_result + i #форматируем для удобства

#формируем датафрейм
resultdata = pd.DataFrame(pandas_result, columns=["datetime", "rand_number", "processor_number"]) #создаем датафрейм
resultdata['datetime'] = pd.to_datetime(resultdata['datetime'], format='%y%m%d%H%M%S')
resultdata = resultdata.sort_values('datetime') #сортируем по дате
resultdata.to_csv("result_data.csv")
resultdata


Unnamed: 0,datetime,rand_number,processor_number
387003,2008-01-01 13:30:47,3642956,3
275703,2008-01-01 13:31:19,5767002,2
497574,2008-01-01 13:44:33,6169858,3
615809,2008-01-01 13:46:25,2767291,3
641381,2008-01-01 13:47:47,2997874,3
...,...,...,...
704834,2025-01-01 03:19:12,751868,3
168548,2025-01-01 03:20:50,4310959,2
171204,2025-01-01 03:30:14,4665905,2
474538,2025-01-01 03:36:47,7665799,3


In [100]:
another_resultdata = pd.read_csv("recipes_sample.csv")
another_resultdata

Unnamed: 0,name,id,minutes,contributor_id,submitted,n_steps,description,n_ingredients
0,george s at the cove black bean soup,44123,90,35193,2002-10-25,,an original recipe created by chef scott meska...,18.0
1,healthy for them yogurt popsicles,67664,10,91970,2003-07-26,,my children and their friends ask for my homem...,
2,i can t believe it s spinach,38798,30,1533,2002-08-29,,"these were so go, it surprised even me.",8.0
3,italian gut busters,35173,45,22724,2002-07-27,,my sister-in-law made these for us at a family...,
4,love is in the air beef fondue sauces,84797,25,4470,2004-02-23,4.0,i think a fondue is a very romantic casual din...,
...,...,...,...,...,...,...,...,...
29995,zurie s holey rustic olive and cheddar bread,267661,80,200862,2007-11-25,16.0,this is based on a french recipe but i changed...,10.0
29996,zwetschgenkuchen bavarian plum cake,386977,240,177443,2009-08-24,,"this is a traditional fresh plum cake, thought...",11.0
29997,zwiebelkuchen southwest german onion cake,103312,75,161745,2004-11-03,,this is a traditional late summer early fall s...,
29998,zydeco soup,486161,60,227978,2012-08-29,,this is a delicious soup that i originally fou...,


In [101]:
#запись двух DataFrame в hdf5 файл с разными ключами, уровнем сжатия 9 и использование библиотеки bzip2 для сжатия
resultdata.to_hdf('data.h5', key='df', mode='w', complevel=9, complib='bzip2') 
another_resultdata.to_hdf('data.h5', key="df2", mode='a', complevel=9, complib='bzip2')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['name', 'submitted', 'description'], dtype='object')]

  another_resultdata.to_hdf('data.h5', key="df2", mode='a', complevel=9, complib='bzip2')


In [102]:
pd.read_hdf('data.hdf5', 'df') 

Unnamed: 0,datetime,rand_number,processor_number
686228,2008-01-01 13:32:59,4894857,2
654779,2008-01-01 13:42:56,9603697,2
876053,2008-01-01 13:43:18,7365188,2
104317,2008-01-01 13:44:48,4714268,3
875416,2008-01-01 13:44:54,7579588,2
...,...,...,...
1029136,2025-01-01 04:25:17,6559749,2
675882,2025-01-01 04:29:19,8318919,2
247334,2025-01-01 04:41:44,5327785,0
475190,2025-01-01 04:43:06,2161706,0


In [103]:
pd.read_hdf('data.h5', 'df2') 

Unnamed: 0,name,id,minutes,contributor_id,submitted,n_steps,description,n_ingredients
0,george s at the cove black bean soup,44123,90,35193,2002-10-25,,an original recipe created by chef scott meska...,18.0
1,healthy for them yogurt popsicles,67664,10,91970,2003-07-26,,my children and their friends ask for my homem...,
2,i can t believe it s spinach,38798,30,1533,2002-08-29,,"these were so go, it surprised even me.",8.0
3,italian gut busters,35173,45,22724,2002-07-27,,my sister-in-law made these for us at a family...,
4,love is in the air beef fondue sauces,84797,25,4470,2004-02-23,4.0,i think a fondue is a very romantic casual din...,
...,...,...,...,...,...,...,...,...
29995,zurie s holey rustic olive and cheddar bread,267661,80,200862,2007-11-25,16.0,this is based on a french recipe but i changed...,10.0
29996,zwetschgenkuchen bavarian plum cake,386977,240,177443,2009-08-24,,"this is a traditional fresh plum cake, thought...",11.0
29997,zwiebelkuchen southwest german onion cake,103312,75,161745,2004-11-03,,this is a traditional late summer early fall s...,
29998,zydeco soup,486161,60,227978,2012-08-29,,this is a delicious soup that i originally fou...,


In [104]:
print("Размер суммы файлов csv: ")
print(os.path.getsize('result_data.csv') + os.path.getsize('recipes_sample.csv'))
print("Размер одного файла h5: ")
print(os.path.getsize('data.h5'))

Размер суммы файлов csv: 
36538971
Размер одного файла h5: 
19128557
