# Processing of unpacked ENC Data

Assumed directory structure:
```
.
├── 00_raw_data
├── 01_unpacked_data
├── 02_processed_data
├── 03_marked_for_qa
├── 04_qa_reviewed
├── 05_final_for_export
├── 01_unpacking.ipynb
├── 02_processing.ipynb
├── 02_standardize.ipynb
├── 03_deduplicate.ipynb
├── 04_postprocessing.ipynb
└── ...
```

In [None]:
# Bucket the unpacked file is downloaded from and the processed file is uploaded to.
bucket_name = "BUCKETNAME"
# Name of the raw data file, without the "unpacked_" prefix.
data_file_name = "data.csv"

## Imports

In [None]:
import pandas as pd
from itables import show
import re

## Fixed parameters

In [None]:
# Folder the processed file is written to and uploaded under.
upload_path = "02_processed_data"
# Folder the unpacked input file is read from.
file_path = "01_unpacked_data"

## Init Google I/O

In [None]:
from google.cloud import storage
# No arguments are passed, so the library's default project/credential
# lookup applies.
client = storage.Client()
# Handle to the bucket named by `bucket_name`; used below for both the
# download and the upload.
bucket = client.bucket(bucket_name)

## Load Data

In [None]:
# Blob name of the unpacked file; the same string is reused below as the
# local download path.
unpacked_blob_name = f"{file_path}/unpacked_{data_file_name}"

In [None]:
import os

# download_to_filename writes to a local path and does not create missing
# parent directories, so make sure the local folder exists first.
_download_dir = os.path.dirname(unpacked_blob_name)
if _download_dir:
    os.makedirs(_download_dir, exist_ok=True)

# The blob name doubles as the local file path.
bucket.blob(unpacked_blob_name).download_to_filename(unpacked_blob_name)
print(f"Downloaded {unpacked_blob_name}")

In [None]:
# Read the pipe-delimited file; its first column becomes the index.
unpacked_data = pd.read_csv(
    unpacked_blob_name,
    sep="|",
    index_col=0,
)

## Pre-process Data

In [None]:
# Display the unpacked data with itables' show() for inspection before processing.
show(unpacked_data)

### Process Names

In [None]:
from aroa_etl.enc.processing import process_unpacked_data

In [None]:
# Run the ENC processing step on the unpacked table. The listed columns are
# handed over as `skip_columns` (presumably left untouched by the
# processing -- confirm in aroa_etl.enc.processing).
processed_data = process_unpacked_data(
    unpacked_data,
    skip_columns=[
        "updated_at",
        "user_id",
        "workflow_id",
        "created_at",
        "document_id",
        "id",
    ],
)

In [None]:
# Display the processed data with itables' show() to check the result before it is saved.
show(processed_data)

## Upload processed data

In [None]:
# Path of the processed file; the same string is used below as the local
# output path and as the destination blob name.
processed_file_name = f"{upload_path}/normalised_enc_{data_file_name}"

In [None]:
import os

# to_csv fails when the target directory is missing, so create the local
# output folder first.
_output_dir = os.path.dirname(processed_file_name)
if _output_dir:
    os.makedirs(_output_dir, exist_ok=True)

# Written pipe-delimited with the index included, matching how the unpacked
# file is read (sep="|", index_col=0).
processed_data.to_csv(processed_file_name, sep="|")

In [None]:
# The local file path doubles as the destination blob name in the bucket.
blob = bucket.blob(processed_file_name)
# Upload the CSV written in the previous cell.
blob.upload_from_filename(processed_file_name)