## Project HARDy: 
File generation from yNot module
    * DH 2020-04-20
    
Generating large quantities of math-based files to compare with the rest of our project. At the simplest level, our data classifier should certianly be able to distinguish between Linear and exponential or sinusoidal data. 

ynot.py has a variety of functions to generate data, and so we'll loop through those to make large datasets. I figure that it may make more sense to work in a jupyter notebook to make the files, so that's what this will be for...

In [2]:
import os
import os.path
import time
import datetime

import numpy as np
import pandas as pd

import ynot

In [3]:
"""
Figure out file navigation, to a local_data folder where we will dump files.
"""
local_folder = "../local_data/"
check_path = os.path.isdir(local_folder)

if not check_path:
    os.mkdir(local_folder)

In [6]:
"""
Generate folder structure
"""
print(datetime.datetime.now())

today = datetime.datetime.now()
datestr = str(today.year) + '-' + str(today.month) +'-'+ str(today.day)

i=0
save_folder = datestr + '_' + str(i).zfill(4)
while os.path.isdir(os.path.join(local_folder,save_folder)):
    i += 1
    save_folder = datestr + '_' + str(i).zfill(4)
else:
    os.mkdir(os.path.join(local_folder,save_folder))
    
save_as = os.path.join(local_folder,save_folder)
print("Saving files in:   " + str(save_as))

2020-04-24 16:07:47.433930
Saving files in:   ../local_data/2020-4-24_0001


In [10]:
"""

"""
Category_A = "linear/"
Category_B = "sin/"
A_folder = os.path.join(save_as, Category_A)
B_folder = os.path.join(save_as, Category_B)

length = 256 # Points per file
n_files = 500

In [11]:
# "Linear Data"
i=0
size=0
timing = time.perf_counter()
n_loop = int(np.sqrt(n_files))
if not os.path.exists(A_folder):
    os.mkdir(A_folder)
for m in np.linspace(-1,10,n_loop):
    for b in np.linspace(-1,2,n_loop):
        data_save = A_folder + datestr +'_linear_'+ str(i).zfill(4)+ '.csv'
        data_frame = ynot.generate_linear(length=length, m=m, b=b)
        data_frame.to_csv(data_save, index=False)
        size += os.path.getsize(data_save)
        i+=1
        
if size > 1000000000:
    sizestr = str(size/1000000000) + ' GBytes! WOW!'
elif size > 100000:
    sizestr = str(size/1000000) + ' MBytes'
elif size > 1000:
    sizestr = str(size/1000) + ' kBytes'
else:
    sizestr = str(size) + ' bytes'
    
print('finished! made '+ str(i) + ' files...' )  
print('All told we used  '+ sizestr)
timing = time.perf_counter()-timing
print('took ' + str(timing)[0:5] + ' seconds!')

finished! made 484 files...
All told we used  4.791942 MBytes
took 2.854 seconds!


In [14]:
# "sin() Data"
i=0
size=0
timing = time.perf_counter()

n_loops = int(n_files**0.333)
if not os.path.exists(B_folder):
    os.mkdir(B_folder)
for A in np.linspace(0.1,5,n_loops):
    for f in np.linspace(1,10,n_loops):
        for th in np.linspace(0,1,n_loops):
            data_save = B_folder + datestr +'_linear_'+ str(i).zfill(4)+ '.csv'
            data_frame = ynot.generate_sin(length=length,A=A, f=f, theta=th)
            data_frame.to_csv(data_save, index=False)
            size += os.path.getsize(data_save)
            i+=1
            
if size > 1000000000:
    sizestr = str(size/1000000000) + ' GBytes! WOW!'
elif size > 100000:
    sizestr = str(size/1000000) + ' MBytes'
elif size > 1000:
    sizestr = str(size/1000) + ' kBytes'
else:
    sizestr = str(size) + ' bytes'
    
print('finished! made '+ str(i) + ' files...' )  
print('All told we used  '+ sizestr)
timing = time.perf_counter()-timing
print('took ' + str(timing)[0:5] + ' seconds!')

finished! made 343 files...
All told we used  3.459619 MBytes
took 1.723 seconds!


'0010'