# Runs a Deduplication Routine for the ... ENC Job

Assumed directory structure:
```
.
├── 00_raw_data
├── 01_unpacked_data
├── 02_processed_data
├── 03_marked_for_qa
├── 04_qa_reviewed
├── 05_final_for_export
├── 01_unpacking.ipynb
├── 02_processing.ipynb
├── 02_standardize.ipynb
├── 03_deduplicate.ipynb
├── 04_postprocessing.ipynb
└── ...
```

kanban link: 
...

In [None]:
# Name of the Cloud Storage bucket this notebook downloads from and uploads to.
bucket_name = "enc-bucket_name"
# File name of the raw data, given without the "normalised_enc_" prefix.
data_file_name = "data.csv"

## Imports

In [None]:
from pathlib import Path

import pandas as pd
from itables import show

In [None]:
from aroa_etl.enc.deduplication import ENC_Deduplicater

## Load Data

In [None]:
# Folder that receives the deduplication result (local path and bucket prefix).
upload_path = "03_marked_for_qa"
# Folder that holds the processed input file (local path and bucket prefix).
file_path = "02_processed_data"

In [None]:
from google.cloud import storage

In [None]:
# Create a Cloud Storage client with default arguments and get a handle to the
# bucket named by `bucket_name`.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [None]:
# Object name of the processed file in the bucket; the same relative path is
# reused as the local download destination.
processed_blob_name = f"{file_path}/normalised_enc_{data_file_name}"
blob = bucket.blob(processed_blob_name)
# Make sure the local target folder exists: the download writes to this path
# and fails if the folder is missing.
Path(processed_blob_name).parent.mkdir(parents=True, exist_ok=True)
# Download the file to a destination
blob.download_to_filename(processed_blob_name)

In [None]:
# Parse the downloaded pipe-separated file into a DataFrame and display it.
processed_df = pd.read_csv(processed_blob_name, sep="|")
show(processed_df)

## Define the Deduplication Routine

In [None]:
# Template placeholders: replace each `...` with the column names for this job.
# Each list is handed to the deduplicater method of the matching name
# (on_person_cols, on_date_cols, on_other_cols, on_other_strict_cols).
person_cols = [...]
date_cols = [...]
other_cols = [...]
other_strict_cols = [...]

In [None]:
# Build the deduplicater on the processed data.
deduplicater = ENC_Deduplicater(
    processed_df,
    "subject_ids",  # presumably the name of the ID column - confirm against ENC_Deduplicater
    # columns to be copied from "raw enc" rows to the new deduplication result rows;
    # replace the `...` placeholder with the remaining column names
    metadata_columns=["workflow_id", ...],
)
# Register each column group with the deduplicater.
deduplicater.on_person_cols(person_cols)
deduplicater.on_date_cols(date_cols)
deduplicater.on_other_cols(other_cols)
deduplicater.on_other_strict_cols(other_strict_cols)

## Execute Job

In [None]:
# Run the configured deduplication and display the result.
deduplication_result = deduplicater.run()
show(deduplication_result)

## Save and Upload Result

In [None]:
# Path of the result CSV; used both as the local file path and as the object
# name in the bucket.
deduplicated_blob_name = f"{upload_path}/marked_enc_{data_file_name}"

In [None]:
# Write the result locally, once as a pickle and once as a pipe-separated CSV.
local_csv_path = Path(deduplicated_blob_name)
# Make sure the local output folder exists before writing into it.
local_csv_path.parent.mkdir(parents=True, exist_ok=True)
# Derive the pickle path by swapping the file extension. A plain
# str.replace(".csv", ".pickle") leaves the name unchanged when it contains no
# ".csv" (the CSV would then overwrite the pickle) and rewrites ".csv" anywhere
# in the path.
deduplication_result.to_pickle(str(local_csv_path.with_suffix(".pickle")))
deduplication_result.to_csv(deduplicated_blob_name, sep="|")

In [None]:
# Re-create the client and bucket handle (the same calls as in the Load Data
# section) before uploading.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [None]:
# Upload the local CSV to the bucket; the object name equals the local path.
# NOTE(review): only the CSV is uploaded here; the pickle saved next to it is
# not uploaded in the cells shown - confirm that this is intended.
blob = bucket.blob(deduplicated_blob_name)
blob.upload_from_filename(deduplicated_blob_name)

## Debug Result

In [None]:
# Optional: display the matcher statistics as a table (disabled by default).
# show(deduplicater.matcher.stats())
# Presumably renders the matcher statistics as a chart - confirm against the matcher class.
deduplicater.matcher.stats_chart()

In [None]:
# In case too few columns are displayed
# import  itables.options as opt
# opt.maxBytes = "10MB"

In [None]:
# Display whatever the matcher reports via show_unmatched().
show(deduplicater.matcher.show_unmatched())

In [None]:
# Display whatever the matcher reports via show_matched().
show(deduplicater.matcher.show_matched())