## Default dataset generation

1. Imports
2. Load the linear regression parameters from a JSON file
3. Generate the dataset using the loaded parameters and the default optimization level (float64)


In [1]:
import sys
import os
# Add the parent directory to the Python path so that `import dgpy` succeeds; this is not needed if dgpy is installed as a package
sys.path.append(os.path.join(os.getcwd(), "../"))

import dgpy
from dgpy.model.lr_params import LR_Params
from dgpy.persistence.json_loader import Json_Params_Loader
from dgpy.persistence.csv_saver import Csv_Results_Saver 
from dgpy.generator.lr_generator import LR_Generator, LR_Chunk_Generator

# Build the parameter-file path one component at a time so that os.path.join
# inserts the platform separator everywhere (a literal "data_tests/params.json"
# produces mixed "\" and "/" separators on Windows).
filename = os.path.join(os.getcwd(), "data_tests", "params.json")

# Load the linear-regression parameters from the JSON file.
j_loader = Json_Params_Loader(filename)
params = j_loader.load('lr')  # 'lr' is also the default param_type when none is given

# Show which file was read, then every attribute of the loaded params object
# (one "name: value" line each, followed by a blank line).
print("filename: {}".format(filename))
attrs = vars(params)
for attr_name, attr_value in attrs.items():
    print("%s: %s" % (attr_name, attr_value))
print()

# Generate the dataset with the default optimization level (float64).
generator = LR_Generator()
results = generator.generate_dataset(params, verbose=True, opt_level='float64')

# Print the generated data, its summary matrices and the generation details.
print("features:\n{}\n".format(results.features))
print("response:\n{}\n".format(results.response))
print("correlation matrix:\n{}\n".format(results.corr_matrix))
print("covariance matrix:\n{}\n".format(results.cov_matrix))
print("user: {}\n".format(results.gen_details.user))
print("os name: {}".format(results.gen_details.platform_name))

filename: C:\Users\Carlo\Documents\IngegneriaInformatica\Tesi_Generatore_Dati\notebooks\data_tests/params.json
seed: None
n_variables: 5
error_variance: 5
n_points: 50
related_vars: 3
n_scales: 1
requested_mean: 0
means: [2.2, -2.3, 0.5, 0.5, 0.8]
betas: 0   -0.8
1   -1.7
2   -1.8
3   -1.2
4    0.4
5    1.0
dtype: float64

Start time: 2020-03-09 15:10:17.770758
End time: 2020-03-09 15:10:17.788231
Generating process time: 0:00:00.017473
Total memory usage of [Y, X]: 2.59 KB

features:
           0         1         2         3         4
0   3.464297 -6.121162  1.757608  4.011335 -3.262073
1   2.675088 -1.865316 -4.412174  6.617649 -1.286290
2   3.900886  0.670990  3.578012  3.878957  1.718781
3  -0.086602 -5.031485  2.630794 -0.333440  1.125096
4   2.292395 -2.683920  2.250003  5.369378 -0.821339
5   0.701874  1.660772  0.628826  1.455202 -0.211543
6   1.877915 -5.105923 -2.308188 -0.016930  4.355617
7   3.011670  0.069840  0.834158 -2.447608  0.712216
8   1.840013 -7.944728 -0.395366 

## Generating results with different optimization levels

The steps are the same as for the default dataset generation, except that we choose the optimization level explicitly:
+ "float64" ----> default optimization level; you don't have to specify it
+ "float32" ----> data will be downcast to float32, roughly half of the default memory usage
+ "float16" ----> data will be downcast to float16, roughly one quarter of the default memory usage
### --- Choose the optimization level that fits your needs ---

## 1. Float32

In [2]:
import sys
import os
# Add the parent directory to the Python path so that `import dgpy` succeeds; this is not needed if dgpy is installed as a package
sys.path.append(os.path.join(os.getcwd(), "../"))

import dgpy
from dgpy.model.lr_params import LR_Params
from dgpy.persistence.json_loader import Json_Params_Loader
from dgpy.persistence.csv_saver import Csv_Results_Saver 
from dgpy.generator.lr_generator import LR_Generator, LR_Chunk_Generator

# Build the parameter-file path one component at a time so that os.path.join
# inserts the platform separator everywhere (a literal "data_tests/params.json"
# produces mixed "\" and "/" separators on Windows).
filename = os.path.join(os.getcwd(), "data_tests", "params.json")

# Load the linear-regression parameters from the JSON file.
j_loader = Json_Params_Loader(filename)
params = j_loader.load('lr')  # 'lr' is also the default param_type when none is given

# Show which file was read, then every attribute of the loaded params object
# (one "name: value" line each, followed by a blank line).
print("filename: {}".format(filename))
attrs = vars(params)
for attr_name, attr_value in attrs.items():
    print("%s: %s" % (attr_name, attr_value))
print()

# Generate the dataset, requesting the float32 optimization level.
generator = LR_Generator()
results = generator.generate_dataset(params, verbose=True, opt_level='float32')

# Print the generated data, its summary matrices and the generation details.
print("features:\n{}\n".format(results.features))
print("response:\n{}\n".format(results.response))
print("correlation matrix:\n{}\n".format(results.corr_matrix))
print("covariance matrix:\n{}\n".format(results.cov_matrix))
print("user: {}\n".format(results.gen_details.user))
print("os name: {}".format(results.gen_details.platform_name))

filename: C:\Users\Carlo\Documents\IngegneriaInformatica\Tesi_Generatore_Dati\notebooks\data_tests/params.json
seed: None
n_variables: 5
error_variance: 5
n_points: 50
related_vars: 3
n_scales: 1
requested_mean: 0
means: [-0.9, 0.9, 0.6, -2.9, 2.0]
betas: 0    1.23
1    1.50
2    0.50
3    1.70
4   -0.50
5   -1.40
dtype: float64

Generating dataset with optimization level: float32
Start time: 2020-03-09 15:13:34.359579
End time: 2020-03-09 15:13:34.376533
Generating process time: 0:00:00.016954
Total memory usage of [Y, X]: 1.42 KB

features:
           0         1         2         3         4
0   3.412931 -1.201188 -3.552259  3.600067  5.401393
1   1.654899 -1.299228  1.828603  1.753605  4.419460
2  -2.017637  6.595312  0.476216 -5.194803  6.991974
3   2.774656 -0.282298 -3.406054  1.520403 -0.026855
4  -3.944570  2.343744 -0.838563 -1.134253  2.186585
5   0.881280  4.164274 -0.341908 -5.449080  2.408399
6  -4.961142  0.888144  1.252955  1.671307  0.916314
7  -3.120556 -1.506905 -6.1

## 2. Float16

In [3]:
import sys
import os
# Add the parent directory to the Python path so that `import dgpy` succeeds; this is not needed if dgpy is installed as a package
sys.path.append(os.path.join(os.getcwd(), "../"))

import dgpy
from dgpy.model.lr_params import LR_Params
from dgpy.persistence.json_loader import Json_Params_Loader
from dgpy.persistence.csv_saver import Csv_Results_Saver 
from dgpy.generator.lr_generator import LR_Generator, LR_Chunk_Generator

# Build the parameter-file path one component at a time so that os.path.join
# inserts the platform separator everywhere (a literal "data_tests/params.json"
# produces mixed "\" and "/" separators on Windows).
filename = os.path.join(os.getcwd(), "data_tests", "params.json")

# Load the linear-regression parameters from the JSON file.
j_loader = Json_Params_Loader(filename)
params = j_loader.load('lr')  # 'lr' is also the default param_type when none is given

# Show which file was read, then every attribute of the loaded params object
# (one "name: value" line each, followed by a blank line).
print("filename: {}".format(filename))
attrs = vars(params)
for attr_name, attr_value in attrs.items():
    print("%s: %s" % (attr_name, attr_value))
print()

# Generate the dataset, requesting the float16 optimization level.
generator = LR_Generator()
results = generator.generate_dataset(params, verbose=True, opt_level='float16')

# Print the generated data, its summary matrices and the generation details.
print("features:\n{}\n".format(results.features))
print("response:\n{}\n".format(results.response))
print("correlation matrix:\n{}\n".format(results.corr_matrix))
print("covariance matrix:\n{}\n".format(results.cov_matrix))
print("user: {}\n".format(results.gen_details.user))
print("os name: {}".format(results.gen_details.platform_name))

filename: c:\Users\Carlo\Documents\IngegneriaInformatica\Tesi_Generatore_Dati\notebooks\data_tests/params.json
seed: None
n_variables: 5
error_variance: 5
n_points: 50
related_vars: 3
n_scales: 1
requested_mean: 0
means: [-0.1, 2.5, -2.4, 2.2, -2.7]
betas: 0   -11.48
1     1.60
2     2.00
3    -0.60
4     0.40
5    -1.60
dtype: float64

Generating dataset with optimization level: float16
Start time: 2020-03-09 15:29:27.371249
End time: 2020-03-09 15:29:27.376236
Generating process time: 0:00:00.004987
Total memory usage of [Y, X]: 856.0 Bytes

features:
           0         1         2         3         4
0  -4.199219  3.900391 -1.745117  3.511719 -6.558594
1  -0.898438 -4.035156 -1.220703  7.789062 -2.712891
2   1.518555  4.101562 -2.783203  1.250977 -1.000977
3   3.097656  1.233398  0.399170  0.680176  0.265381
4  -1.769531  2.527344 -0.754395  2.970703 -1.031250
5   2.867188  2.599609 -2.972656  1.819336 -1.839844
6   5.464844  1.203125  0.385498 -2.351562 -6.941406
7   1.525391  2.