In [None]:
import glob
import os
import shutil
import tempfile
from pathlib import Path

import pandas as pd

#### Provide storage account parameters here

###### storage_conn_string "Storage account connection string"
###### src_container "Container where data is stored"
###### dst_container "Container where results should be uploaded"

In [None]:
# Storage account settings.
# Each value can be supplied through an environment variable so that secrets
# such as the connection string do not have to be saved inside the notebook.
# When the variable is not set the value falls back to the same empty
# placeholder as before, which can still be edited in place.
storage_conn_string = os.environ.get("STORAGE_CONN_STRING", "")  # storage account connection string
src_container = os.environ.get("SRC_CONTAINER", "")  # container where data is stored
dst_container = os.environ.get("DST_CONTAINER", "")  # container where results should be uploaded

# Import functions from other notebooks

In [None]:
# Execute the Data-utils helper notebook. IPython's %run executes an .ipynb in
# this notebook's namespace, so the names it defines become usable below.
%run "Data-utils.ipynb"

In [None]:
# Execute the FR-Utils helper notebook in this notebook's namespace
# (presumably the source of the fr_* / train_fr_model helpers used later — confirm).
%run "FR-Utils.ipynb"

In [None]:
# Execute the file-utils helper notebook in this notebook's namespace so its
# definitions are available to the cells that follow.
%run "file-utils.ipynb"

In [None]:
# Execute the blob-storage helper notebook in this notebook's namespace.
# NOTE(review): the trailing words are written without `$name` / `{name}`
# expansion, so IPython treats them as literal text rather than the variables'
# values. The helper presumably reads storage_conn_string, src_container and
# dst_container from the shared namespace instead — confirm.
%run "AzureBlobStorageLib.ipynb" storage_conn_string src_container dst_container

# Data Preparation

#### Steps include

1. Downloading data from Blob Storage
2. Converting files of all formats to PDF
3. Splitting multi-page PDFs into single-page PDF files

In [None]:
# Prepare a clean local working directory for downloaded and generated files.
# An existing "../data/" tree is deleted first, so every run starts empty.
# (Alternative: create a tempfile.TemporaryDirectory() and use its .name
# as data_dir instead of a fixed path.)
data_dir = "../data/"
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Local folder that receives the files downloaded from blob storage.
raw_files = os.path.join(data_dir, "rawFiles")
os.makedirs(raw_files, exist_ok=True)

# download2local is not defined in this notebook; it is expected to come from
# one of the helper notebooks executed above.
download2local(raw_files)

In [None]:
# Stage 1: convert every downloaded file, whatever its type, to PDF.
raw_pdf = os.path.join(data_dir, "allPdf")
os.makedirs(raw_pdf, exist_ok=True)
convert2pdf(src=raw_files, pdfdst=raw_pdf)
print("Input files are stored at:", raw_pdf)

# Stage 2: split the PDFs into one-page documents.
pdf_1p = os.path.join(data_dir, "1p-pdf")
os.makedirs(pdf_1p, exist_ok=True)
pdf_split(src=raw_pdf, dst=pdf_1p)
print("Processed files are stored at:", pdf_1p)

#### Get initial Parameters

In [None]:
# Count the single-page PDFs available before any clustering takes place;
# the iteration loop later compares its progress against this number.
pdf_pattern = os.path.join(pdf_1p, "*.pdf")
fls = glob.glob(pdf_pattern)
initial_file_cnt = len(fls)
print(initial_file_cnt)

# Folder that collects the output of every iteration.
results_dir = os.path.join(data_dir, "Results")
os.makedirs(results_dir, exist_ok=True)

In [None]:
# generate SAS signature for storage container
# fr_get_sas_url is not defined in this notebook; it is expected to come from
# one of the helper notebooks executed with %run above.
sas_url = fr_get_sas_url(dst_container)
# Uncomment the next line to supply a SAS URL by hand instead.
#sas_url = ""
# NOTE(review): the bare expression below renders the SAS URL in the cell
# output. If the URL embeds an access token (presumably it does — confirm),
# clear this output before sharing or committing the notebook.
sas_url 

In [None]:
# Uncomment the call below if you want to clean up the destination blob container
#deleteContainerData()

# FR Template identification process

In [None]:
# Iterative template identification.
#
# Each iteration:
#   1. samples training files from the remaining single-page PDFs,
#   2. uploads the sample to blob storage and trains an unsupervised FR model,
#   3. runs inference with that model over the remaining PDFs,
#   4. segregates the files into cluster folders (segregate_data presumably
#      removes clustered files from pdf_1p — confirm),
#   5. uploads the iteration results and decides whether to run again.
#
# The loop ends when an iteration clusters too small a share of the initial
# files, when the data set is too small to justify another pass, or when no
# model file was produced for the iteration.

# Tunable thresholds (previously inline literals).
MIN_MOVED_PERCENT = 5        # stop when an iteration clusters less than this % of the initial files
MIN_FILES_FOR_REPEAT = 500   # data sets smaller than this get a single iteration only
INFERENCE_THREADS = 10       # thread_cnt passed to fr_model_inference

# The cluster folder is shared by all iterations, so create it once.
clust_dir = os.path.join(results_dir, "clusters")
Path(clust_dir).mkdir(parents=True, exist_ok=True)

# `iteration` doubles as the loop flag: 0 means "stop".
# With no input files there is nothing to train on, and the progress ratio
# below would divide by zero, so the loop is skipped entirely.
if initial_file_cnt > 0:
    iteration = 1
else:
    print("No PDF files found in", pdf_1p, "- skipping template identification")
    iteration = 0

while iteration:

    ########################################################
    # create directory for current iteration
    ########################################################
    iter_fld = "I" + str(iteration)
    iter_dir = os.path.join(results_dir, iter_fld)
    Path(iter_dir).mkdir(parents=True, exist_ok=True)

    ########################################################
    # sample files for training
    ########################################################
    train_fld = "trainset"
    train_dir = os.path.join(iter_dir, train_fld)
    Path(train_dir).mkdir(parents=True, exist_ok=True)
    sample_training_data(src_fld=pdf_1p, dst_fld=train_dir)

    ########################################################
    # upload the files to blob
    ########################################################
    blob_path = os.path.join(iter_fld, train_fld)
    upload2blob(local_path=train_dir, container_path=blob_path)

    ########################################################
    # train FR unsupervised model
    ########################################################
    model_file = iter_fld + "-model-details.json"
    # The training call receives the blob folder with forward slashes,
    # whatever the local path separator is.
    train_fr_model(sas_url=sas_url, folder_path=blob_path.replace("\\", "/"), model_file=model_file)

    ########################################################
    # if model is created infer using the model
    ########################################################
    # The check uses model_file relative to the current working directory.
    # Without that file there is nothing to run inference with, so stop
    # instead of passing a non-existent model path downstream.
    if not os.path.exists(model_file):
        print("No model details file found for iteration", iter_fld, "- stopping")
        iteration = 0
        break

    iter_model_file = os.path.join(iter_dir, model_file)
    shutil.copyfile(model_file, iter_model_file)

    infer_fld = "fr-json"
    infer_dir = os.path.join(iter_dir, infer_fld)
    Path(infer_dir).mkdir(parents=True, exist_ok=True)

    # Start FR inferencing
    fr_model_inference(src_dir=pdf_1p, json_dir=infer_dir, model_file=iter_model_file,
                       thread_cnt=INFERENCE_THREADS)

    ########################################################
    # Segregate files to clusters
    ########################################################
    cluster_file = os.path.join(iter_dir, iter_fld + "-clusters.csv")

    files_clustered = segregate_data(src_dir=pdf_1p, result_dir=infer_dir, cluster_dir=clust_dir,
                                     prefix=iter_fld, cluster_file=cluster_file)

    print("Identified clusters for:", files_clustered, "files")

    ########################################################
    # Upload iteration results to blob storage
    ########################################################
    upload2blob(local_path=iter_dir, container_path=iter_fld)  # train data, model details and clusters
    upload2blob(local_path=clust_dir, container_path="clusters")  # files segregated into clusters

    ########################################################
    # decide on next iteration
    ########################################################
    # Progress is measured against the file count taken before the first iteration.
    moved_percent = files_clustered * 100 / initial_file_cnt

    if moved_percent < MIN_MOVED_PERCENT or initial_file_cnt < MIN_FILES_FOR_REPEAT:
        iteration = 0
    else:
        iteration = iteration + 1
