# Unpacking of Raw ENC Data

Assumed directory structure:
```
.
├── 00_raw_data
├── 01_unpacked_data
├── 02_processed_data
├── 03_marked_for_qa
├── 04_qa_reviewed
├── 05_final_for_export
├── 01_unpacking.ipynb
├── 02_processing.ipynb
├── 02_standardize.ipynb
├── 03_deduplicate.ipynb
├── 04_postprocessing.ipynb
└── ...
```

In [None]:
# File name of the raw data, given without its "raw_" prefix; the prefix is
# added when the blob name is built further down.
data_file_name = "data.csv" # Name of raw data file without "raw_" prefix
# Name of the storage bucket this notebook downloads from and uploads to.
bucket_name = "enc-bucket_name" # TODO: placeholder value, set the real bucket name before running

## Imports

In [None]:
import pandas as pd
from itables import show
import re

## Fixed Parameters

In [None]:
# Folder prefixes used both as bucket object prefixes and as local folders:
# raw inputs are read from `file_path`, unpacked outputs go to `upload_path`.
file_path, upload_path = "00_raw_data", "01_unpacked_data"

## Init GCloud I/O

In [None]:
from google.cloud import storage

# A single client and bucket handle are created once here and reused by the
# download and upload cells further down.
client = storage.Client()
bucket = client.bucket(bucket_name=bucket_name)

## Download Raw Data

In [None]:
# Object name of the raw file inside the bucket. The same relative path is
# reused as the local file name when downloading and reading the data.
raw_blob_name = f"{file_path}/raw_{data_file_name}"

In [None]:
from pathlib import Path

# The blob name doubles as the local target path, so its parent folder must
# exist before the download starts. Create it when missing (e.g. on a fresh
# checkout, where empty folders are absent) instead of failing with
# FileNotFoundError.
Path(raw_blob_name).parent.mkdir(parents=True, exist_ok=True)
bucket.blob(raw_blob_name).download_to_filename(raw_blob_name)
print(f"Downloaded {raw_blob_name}")

In [None]:
# Load the downloaded raw CSV into a DataFrame using pandas' default parsing options.
raw_data = pd.read_csv(raw_blob_name)

## Unpack

In [None]:
from aroa_etl.enc.unpacking import unpack

In [None]:
# Render the raw table with itables to inspect it before unpacking.
show(raw_data)

In [None]:
def _is_category_column(col):
    """Return the regex match if the column name contains "category", else None."""
    return re.search(r"(category)", col)


# NOTE(review): `unpack` is a project helper whose code is not visible here.
# Presumably it expands the "json_data" column and additionally splits the
# values of the columns selected by `additional_splits_on` on `split_re`;
# confirm against aroa_etl.enc.unpacking.
unpacked_data = unpack(
    raw_data,
    "json_data",
    additional_splits_on=_is_category_column,
    split_re=r"[\|;,\s]",  # matches a pipe, semicolon, comma or whitespace character
)

In [None]:
# Render the unpacked table with itables to check the result before it is saved.
show(unpacked_data)

## Upload Unpacked Data

In [None]:
# Relative path of the unpacked file. It is used as the local output file and
# as the object name when uploading to the bucket.
unpacked_file_name = f"{upload_path}/unpacked_{data_file_name}"

In [None]:
from pathlib import Path

# pandas does not create missing folders, so make sure the local output
# folder exists (e.g. on a fresh checkout) instead of failing with OSError.
Path(unpacked_file_name).parent.mkdir(parents=True, exist_ok=True)
# The file is pipe-separated; the DataFrame index is written as the first
# column because `index` is left at its pandas default.
unpacked_data.to_csv(unpacked_file_name, sep="|")

In [None]:
# The local relative path is reused as the object name, so the bucket mirrors
# the local folder layout.
blob = bucket.blob(unpacked_file_name)
blob.upload_from_filename(unpacked_file_name)
# Confirm completion, matching the message printed after the download step.
print(f"Uploaded {unpacked_file_name}")