### Create Dataset

In [1]:
import os
# Step one directory up, then into src/preprocessing, so that the relative
# "data/..." paths used later in this notebook resolve from that directory.
# NOTE: both calls are relative to the current working directory, so running
# this cell a second time in the same kernel moves somewhere else (or fails).
os.chdir("..")
os.chdir("src/preprocessing")

In [2]:
from concurrent import futures
from argparse import ArgumentParser
import logging
from tqdm import tqdm
import glob
import pandas as pd
import pickle

from deepsvg.svglib.svg import SVG
from deepsvg.difflib.tensor import SVGTensor

In [3]:
def save_tensor_as_pkl(tensor_data, file_path):
    """Pickle ``tensor_data`` to ``file_path`` wrapped in a small dictionary.

    The file holds ``{'tensors': tensor_data, 'fillings': [1]}``.

    Args:
        tensor_data: Object stored under the ``'tensors'`` key; it must be
            picklable.
        file_path: Destination path, opened in binary write mode (an existing
            file is overwritten).

    Raises:
        OSError: If ``file_path`` cannot be opened for writing.
    """
    # 'fillings' is a fixed one-element list; a per-row variant
    # ([1] * tensor_data.shape[0]) was left disabled by the original author.
    tensor_dict = {'tensors': tensor_data, 'fillings': [1]}
    # The context manager closes the handle even when pickle.dump raises.
    with open(file_path, 'wb') as output:
        pickle.dump(tensor_dict, output)

In [11]:
def preprocess_svg(svg_file, output_folder, tensor_folder, meta_data, tensor_data):
    """Simplify one SVG, save it with its tensor, and record its statistics.

    The SVG is loaded, run through the fill/normalize/zoom/canonicalize/
    simplify pipeline, written to ``output_folder`` as ``<name>.svg`` and its
    tensor is pickled to ``tensor_folder`` as ``<name>.pkl``, where ``<name>``
    is the input file name without its extension.

    Args:
        svg_file: Path of the SVG file to load.
        output_folder: Directory receiving the simplified SVG.
        tensor_folder: Directory receiving the pickled tensor.
        meta_data: Dict updated in place; gets one entry keyed by ``<name>``
            with the path-group length statistics.
        tensor_data: Dict updated in place; gets one entry keyed by ``<name>``
            with the SVG tensor.

    Raises:
        ValueError: If ``svg.svg_path_groups`` is empty when the statistics
            are computed (``max()`` of an empty list).
    """
    filename = os.path.splitext(os.path.basename(svg_file))[0]

    svg = SVG.load_svg(svg_file)
    svg.fill_(False)
    svg.normalize()
    svg.zoom(0.9)
    svg.canonicalize()
    svg = svg.simplify_heuristic()

    svg.save_svg(os.path.join(output_folder, f"{filename}.svg"))

    # Convert once and reuse the result for both the pickle and the in-memory
    # record instead of running the conversion twice.
    # NOTE(review): assumes to_tensor() is a pure conversion whose result does
    # not change between calls - confirm against deepsvg.
    svg_tensor = svg.to_tensor()
    save_tensor_as_pkl(svg_tensor, os.path.join(tensor_folder, f"{filename}.pkl"))

    len_groups = [path_group.total_len() for path_group in svg.svg_path_groups]

    meta_data[filename] = {
        "id": filename,
        "total_len": sum(len_groups),
        "nb_groups": len(len_groups),
        "len_groups": len_groups,
        "max_len_group": max(len_groups)
    }

    tensor_data[filename] = {
        "id": filename,
        "tensor": svg_tensor
    }

In [12]:
def main(args):
    """Preprocess every ``*.svg`` in ``args.data_folder`` and write summary CSVs.

    Each file is handed to ``preprocess_svg`` on a thread pool. When all tasks
    have finished, the collected metadata is written to
    ``args.output_meta_file`` and the collected tensors to the tensor CSV.

    Args:
        args: Namespace providing ``data_folder``, ``output_folder``,
            ``tensor_folder``, ``output_meta_file`` and ``workers``. An
            optional ``output_tensor_file`` attribute overrides the tensor CSV
            path, which otherwise is ``'data/svg_tensor_data.csv'``.
    """
    svg_files = glob.glob(os.path.join(args.data_folder, "*.svg"))
    meta_data = {}
    tensor_data = {}

    with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        with tqdm(total=len(svg_files)) as pbar:
            # Remember which file each future belongs to so that a failure
            # can be reported by name.
            pending = {
                executor.submit(preprocess_svg, svg_file, args.output_folder,
                                args.tensor_folder, meta_data, tensor_data): svg_file
                for svg_file in svg_files
            }

            for finished in futures.as_completed(pending):
                # An exception raised in a worker is stored on its future and
                # stays invisible unless it is retrieved; log it so a file
                # missing from the CSVs can be traced. The run continues.
                error = finished.exception()
                if error is not None:
                    logging.warning("Preprocessing failed for %s: %r",
                                    pending[finished], error)
                pbar.update(1)

    df = pd.DataFrame(list(meta_data.values()))
    df.to_csv(args.output_meta_file, index=False)

    # NOTE(review): the 'tensor' column holds tensor objects, so the CSV
    # presumably stores their text representation rather than the raw values -
    # confirm this file is only meant for inspection.
    tensor_csv = getattr(args, "output_tensor_file", 'data/svg_tensor_data.csv')
    df_tensor = pd.DataFrame(list(tensor_data.values()))
    df_tensor.to_csv(tensor_csv, index=False)

    logging.info("SVG Preprocessing complete.")

In [13]:
# Emit INFO-level records so the completion message logged by main() shows up.
logging.basicConfig(level=logging.INFO)

# Options for the preprocessing run; every path defaults to a location under
# the relative "data" directory.
parser = ArgumentParser()
parser.add_argument(
    "--data_folder",
    default=os.path.join("data", "svgs"),
)
parser.add_argument(
    "--output_folder",
    default=os.path.join("data", "svgs_simplified"),
)
parser.add_argument(
    "--tensor_folder",
    default=os.path.join("data", "svgs_tensors"),
)
parser.add_argument(
    "--output_meta_file",
    default=os.path.join("data", "svg_meta.csv"),
)
# Size of the thread pool used by main().
parser.add_argument("--workers", type=int, default=4)

_StoreAction(option_strings=['--workers'], dest='workers', nargs=None, const=None, default=4, type=<class 'int'>, choices=None, help=None, metavar=None)

In [14]:
# Parse an explicit token list rather than the kernel's own command line.
args = parser.parse_args([
    "--data_folder", "data/svgs/",
    "--output_folder", "data/svgs_simplified/",
    "--tensor_folder", "data/svgs_tensors/",
    "--output_meta_file", "data/svg_meta.csv",
])

In [15]:
# Create the folder for simplified SVGs; exist_ok avoids the gap between a
# separate existence check and the creation, and makes the cell re-runnable.
os.makedirs(args.output_folder, exist_ok=True)

In [16]:
# Create the folder for pickled tensors; exist_ok avoids the gap between a
# separate existence check and the creation, and makes the cell re-runnable.
os.makedirs(args.tensor_folder, exist_ok=True)

In [17]:
# Run the preprocessing pass over args.data_folder with the parsed options.
main(args)

100%|████████████████████████████████████████████████████████████████████████████████| 419/419 [00:47<00:00,  8.91it/s]
INFO:root:SVG Preprocessing complete.


In [11]:
#svg = SVG.load_svg('data/svgs/Bayer.svg')

In [12]:
#SVGTensor.from_data(svg.to_tensor())

In [13]:
#save_tensor_as_pkl(SVGTensor.from_data(svg.to_tensor()), 'test.pkl')

In [6]:
# Spot-check a single pickle from the tensor folder.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open('./data/svgs_tensors/Alfi.pkl', 'rb') as f:
    data = pickle.load(f)

# Bare expression so the notebook displays the loaded dictionary.
data

{'tensors': tensor([[ 0.0000, -1.0000, -1.0000,  ..., -1.0000, 11.2039,  6.1470],
         [ 1.0000, -1.0000, -1.0000,  ..., -1.0000, 11.2039,  6.1470],
         [ 2.0000, -1.0000, -1.0000,  ...,  6.4180, 16.5470,  7.0207],
         ...,
         [ 2.0000, -1.0000, -1.0000,  ..., 10.8654,  7.0355, 10.8319],
         [ 2.0000, -1.0000, -1.0000,  ..., 11.7787,  7.5396, 12.2521],
         [ 1.0000, -1.0000, -1.0000,  ..., -1.0000,  7.5396, 12.2521]]),
 'fillings': [1]}

In [13]:
# Tabular view of the loaded tensor with columns 1 through 5 left out.
pd.DataFrame(data['tensors'].numpy()).drop(columns=[1, 2, 3, 4, 5])

Unnamed: 0,0,6,7,8,9,10,11,12,13
0,0.0,0.000000,0.000000,-1.000000,-1.000000,-1.000000,-1.000000,11.203851,6.146964
1,1.0,11.203851,6.146964,-1.000000,-1.000000,-1.000000,-1.000000,11.203851,6.146964
2,2.0,11.203851,6.146964,12.840675,6.225605,14.800165,6.418000,16.546961,7.020661
3,2.0,16.546961,7.020661,18.293758,7.623322,19.827860,8.636251,20.613907,10.355960
4,2.0,20.613907,10.355960,21.813976,12.981462,20.365112,14.844210,18.178247,16.056341
...,...,...,...,...,...,...,...,...,...
91,1.0,7.539608,12.252069,-1.000000,-1.000000,-1.000000,-1.000000,7.539608,12.252069
92,2.0,7.539608,12.252069,7.408699,12.598938,7.369522,12.996221,7.146880,13.292677
93,2.0,7.146880,13.292677,5.869326,14.993793,5.320243,10.865399,7.035508,10.831860
94,2.0,7.035508,10.831860,7.537752,10.822039,7.371574,11.778666,7.539608,12.252069
