## Dataset generation by chunk

1. Imports
2. Load linear regression parameters from json file
3. Generate dataset with chosen chunks (number of iterations)


## --- Notes ---
+ Do not assign the result of the generate_dataset() method to a variable, because it doesn't maintain anything in memory
+ You must also specify a path and a saver (possibly an implementation of Result_Saver) that will save the generated files to disk
+ Choose an appropriate number of chunks to balance memory and CPU usage


## 1. Chunks = 50
+ The JSON params have n_points = 50, so 50 iterations will generate 1 tuple at a time
+ The process will use little memory, but will have high CPU usage and a long generation time (see the verbose generation output for details)

In [None]:
import sys
import os
# Adding dgpy package to python path, otherwise it will fail. You don't have to do that if it's installed as a package
sys.path.append(os.path.join(os.getcwd(), "../"))

import dgpy
from dgpy.model.lr_params import LR_Params
from dgpy.persistence.json_loader import Json_Params_Loader
from dgpy.persistence.csv_saver import Csv_Results_Saver 
from dgpy.generator.lr_generator import LR_Generator, LR_Chunk_Generator

# Input parameters file and output directory, both resolved against the current working directory
filename = os.path.join(os.getcwd(), "data_tests/params.json")
dirpath = os.path.join(os.getcwd(), "data_tests_chunk")

# Load the linear regression parameters from the JSON file.
# 'lr' is passed explicitly here; omitting param_type falls back to the same 'lr' default.
j_loader = Json_Params_Loader(filename)
params = j_loader.load('lr')

# Report which file was loaded, followed by one "name: value" line per parameter attribute
print("filename: {}".format(filename))
attrs = vars(params)
print("".join("%s: %s\n" % (key, value) for key, value in attrs.items()))

# Number of iterations the generation is split into
chunks = 50

# Nothing is assigned from generate_dataset(): results are handed to the saver instead
generator = LR_Chunk_Generator()
saver = Csv_Results_Saver(dirpath)
generator.generate_dataset(params, saver, chunks=chunks, verbose=True)
# Other generation parameters are available: see "dataset_generation.ipynb" for
# optimization levels and "saving_results.ipynb" for compression types

# Open dirpath to inspect the results

## 2. Chunks = 1
+ The JSON params have n_points = 50, so 1 iteration will generate 50 tuples at a time
+ The process will use more memory, but will have low CPU usage and a short generation time (see the verbose generation output for details)

In [None]:
import sys
import os
# Adding dgpy package to python path, otherwise it will fail. You don't have to do that if it's installed as a package
sys.path.append(os.path.join(os.getcwd(), "../"))

import dgpy
from dgpy.model.lr_params import LR_Params
from dgpy.persistence.json_loader import Json_Params_Loader
from dgpy.persistence.csv_saver import Csv_Results_Saver 
from dgpy.generator.lr_generator import LR_Generator, LR_Chunk_Generator

# Input parameters file and output directory, both resolved against the current working directory
filename = os.path.join(os.getcwd(), "data_tests/params.json")
dirpath = os.path.join(os.getcwd(), "data_tests_chunk")

# Load the linear regression parameters from the JSON file.
# 'lr' is passed explicitly here; omitting param_type falls back to the same 'lr' default.
j_loader = Json_Params_Loader(filename)
params = j_loader.load('lr')

# Report which file was loaded, followed by one "name: value" line per parameter attribute
print("filename: {}".format(filename))
attrs = vars(params)
print("".join("%s: %s\n" % (key, value) for key, value in attrs.items()))

# A single iteration: the whole dataset is generated in one pass
chunks = 1

# Nothing is assigned from generate_dataset(): results are handed to the saver instead
generator = LR_Chunk_Generator()
saver = Csv_Results_Saver(dirpath)
generator.generate_dataset(params, saver, chunks=chunks, verbose=True)
# Other generation parameters are available: see "dataset_generation.ipynb" for
# optimization levels and "saving_results.ipynb" for compression types

# Open dirpath to inspect the results