# Notes:

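This notebook computes a large number of data-quality metrics comparing each generated dataset (every CSV file in the generated-data folder) against the real dataset, and saves one result object per generated file in the results folder.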

# Set parameters

In [1]:
# Real (reference) dataset that every generated dataset is compared against.
real_data_loc = "real_data/final_df--czech.csv"

# Folder scanned for generated datasets; each CSV file in it is evaluated.
gen_data_folder = "generated_data/"

# Folder where one result object per generated dataset is saved.
results_folder = "results/"

# Setup

In [2]:
import os
import time


from mylib.metrics import compute_all_metrics

# Setup dfs

In [3]:
# Reference timestamps for the progress log: `start_time` stays fixed for the
# whole run, while `new_time` is advanced after each processed file.
start_time = new_time = time.time()

In [4]:
# Show everything in the generated-data folder. The listing is unfiltered, so
# hidden files and checkpoint folders appear here as well as the CSV datasets.
os.listdir(gen_data_folder)

['.DS_Store',
 'bf.csv',
 'dg.csv',
 'tf-v.csv',
 '.ipynb_checkpoints',
 'bf-nc.csv',
 'tg.csv',
 'bf-nd.csv']

In [5]:
# Make sure the results folder exists before any result is saved.
# The path itself is tested (rather than looking for its name in the current
# directory listing), so nested paths such as "out/results/" and absolute
# paths are handled correctly.
if os.path.isdir(results_folder):
    # Folder already present: show what it contains, since existing results
    # are skipped by the processing loop.
    display(os.listdir(results_folder))
else:
    # makedirs also creates any missing parent folders.
    os.makedirs(results_folder, exist_ok=True)

['.DS_Store']

In [6]:
import pandas as pd

In [7]:
# Compute and save the metrics for every generated dataset.
#
# For each CSV file in `gen_data_folder` (processed in sorted order):
#   * skip it when a matching ".result" file already exists in `results_folder`,
#   * otherwise compare it against the real dataset and save the result object.
# A failure on one file is recorded in `failed_on` and does not stop the run.

# (file name, exception) pairs for datasets whose processing raised.
failed_on = []

for fname in sorted(os.listdir(gen_data_folder)):

    # Match on the real extension so hidden files, checkpoint folders and names
    # that merely contain ".csv" (e.g. "x.csv.bak") are ignored.
    stem, ext = os.path.splitext(fname)
    if ext != ".csv":
        continue

    # Only the extension is swapped; a ".csv" elsewhere in the name is kept.
    result_fname = stem + ".result"
    result_loc = os.path.join(results_folder, result_fname)
    print()

    # Direct existence check instead of re-listing the folder on every pass.
    # NOTE(review): a partially written result left by an interrupted save would
    # also be skipped here - confirm how `save` behaves on failure.
    if os.path.exists(result_loc):
        print("Skipping, result already exists for:", result_fname)
        continue

    print("Begining to process:", fname)

    try:
        gen_data_loc = os.path.join(gen_data_folder, fname)

        full_result = compute_all_metrics(real_data_loc, gen_data_loc)
        full_result.save(result_loc)

    except Exception as e:
        # Deliberate best-effort: keep the error for later inspection and move
        # on to the next dataset.
        failed_on.append((fname, e))
        print(f"Failed to update: {fname}. ({e})")

    # Progress log: total elapsed time and time spent on this file.
    prev_time = new_time
    new_time = time.time()
    print(f"Took {new_time - start_time: .3f} secs since notebook start. ({new_time - prev_time: .3f} since last update) ")


Begining to process: bf-nc.csv
Took  628.879 secs since notebook start. ( 628.879 since last update) 

Begining to process: bf-nd.csv
Took  1327.797 secs since notebook start. ( 698.918 since last update) 

Begining to process: bf.csv
Took  1910.508 secs since notebook start. ( 582.711 since last update) 

Begining to process: dg.csv
Took  2645.860 secs since notebook start. ( 735.352 since last update) 

Begining to process: tf-v.csv
Took  3372.008 secs since notebook start. ( 726.148 since last update) 

Begining to process: tg.csv
Took  4061.203 secs since notebook start. ( 689.195 since last update) 
