# End-to-End Data Cleaning Pipeline with Raha and Baran (Minimal and Sequential)
We build an end-to-end data cleaning pipeline with our configuration-free error detection and correction systems, Raha and Baran.

In [1]:
import pandas
import IPython.display

import raha

## Error Detection with Raha

### 1. Instantiating the Detection Class
We first instantiate the `Detection` class.

In [2]:
# Create the detector from Raha's `dask_version.detection_parallel` module.
app_1 = raha.dask_version.detection_parallel.Detection()

# Show the logs while the pipeline runs.
app_1.VERBOSE = True

# Labeling budget: the number of tuples to label in the sampling loop.
app_1.LABELING_BUDGET = 20

### 2. Instantiating the Dataset
We next load and instantiate the dataset object.

In [3]:
# Describe the dataset: its name plus the paths of the dirty and clean CSV files.
dataset_dictionary = dict(
    name="tax",
    path="../datasets/tax/dirty.csv",
    clean_path="../datasets/tax/clean.csv",
)

# Build the dataset object and preview the first rows of its dataframe.
d = app_1.initialize_dataset(dataset_dictionary)
d.dataframe.head()

Unnamed: 0,f_name,l_name,gender,area_code,phone,city,state,zip,marital_status,has_child,salary,rate,single_exemp,married_exemp,child_exemp
0,Pengyuan,Zendler,F,508,744-9007,SWAMPSCOTT,MA,1907,M,N,90000,5.3,0,7150,0
1,Nik,Tacic,M,702,517-7658,LAS VEGAS,NV,89140,M,N,90000,0.0,0,0,0
2,Hovav,Punter,M,501,304-9763,HASTY,AR,72640,S,N,50000,7.0,20,0,0
3,Xiangning,Vanneste,F,862,651-6469,BRIGANTINE,NJ,8203,M,Y,55000,1.9519792,0,2000,1500
4,Belen,Niccum,F,920,287-1889,FLORENCE,WI,54121,S,Y,85000,5.9232907,700,0,400


### 3. Running Error Detection Strategies
Raha runs (all or the promising) error detection strategies on the dataset. This step can take a while because every strategy has to be run on the dataset.

In [4]:
# Run Raha's error detection strategies on the dataset; this step can take a while.
app_1.run_strategies(d)

3743 cells are detected by ["PVD", ["l_name", "E"]].
27473 cells are detected by ["PVD", ["city", "K"]].
3786 cells are detected by ["PVD", ["single_exemp", "8"]].
199813 cells are detected by ["PVD", ["rate", "."]].
4347 cells are detected by ["PVD", ["l_name", "V"]].
27718 cells are detected by ["PVD", ["f_name", "g"]].

11069 cells are detected by ["PVD", ["l_name", "p"]].107027 cells are detected by ["PVD", ["phone", "8"]].
47015 cells are detected by ["PVD", ["city", "D"]].
19366 cells are detected by ["PVD", ["state", "O"]].
4223 cells are detected by ["PVD", ["f_name", "V"]].
23531 cells are detected by ["PVD", ["single_exemp", "2"]].
400000 cells are detected by ["RVD", ["has_child", "l_name"]].
13616 cells are detected by ["PVD", ["single_exemp", "7"]].
9260 cells are detected by ["RVD", ["phone", "salary"]].
113770 cells are detected by ["PVD", ["f_name", "a"]].
399600 cells are detected by ["RVD", ["child_exemp", "married_exemp"]].
400000 cells are detected by ["RVD", ["mari

Process ForkPoolWorker-21:
Process ForkPoolWorker-24:
Process ForkPoolWorker-23:
Process ForkPoolWorker-20:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-11:
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._arg

KeyboardInterrupt: 

  File "/usr/lib/python3.10/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 365, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


### 4. Generating Features
Raha then generates a feature vector for each data cell based on the output of error detection strategies. 

In [None]:
# Generate a feature vector for each data cell from the outputs of the detection strategies.
app_1.generate_features(d)

### 5. Building Clusters
Raha next builds a hierarchical clustering model for our clustering-based sampling approach.

In [None]:
# Build the hierarchical clustering model used by the clustering-based tuple sampling.
app_1.build_clusters(d)

### 6. Interactive Tuple Sampling and Labeling
Raha then iteratively samples a tuple. We should label data cells of each sampled tuple.

In [None]:
# Keep sampling tuples until the labeling budget is reached.
while len(d.labeled_tuples) < app_1.LABELING_BUDGET:
    app_1.sample_tuple(d)

    # When ground truth is available, label the tuple from it instead of asking the user.
    if d.has_ground_truth:
        app_1.label_with_ground_truth(d)
        continue

    # Otherwise show the sampled tuple and ask the user about each of its cells.
    print("Label the dirty cells in the following sampled tuple.")
    sampled_row = d.dataframe.iloc[d.sampled_tuple, :]
    sampled_tuple = pandas.DataFrame(data=[sampled_row], columns=d.dataframe.columns)
    IPython.display.display(sampled_tuple)

    prompt = "What is the correction for value '{}'? Type in the same value if it is not erronous.\n"
    for column_index in range(d.dataframe.shape[1]):
        cell = (d.sampled_tuple, column_index)
        value = d.dataframe.iloc[cell]
        correction = input(prompt.format(value))
        # A cell is labeled 1 when the typed correction differs from its current value, else 0.
        if value != correction:
            user_label = 1
        else:
            user_label = 0
        d.labeled_cells[cell] = [user_label, correction]

    # Record the tuple as labeled so it counts toward the budget.
    d.labeled_tuples[d.sampled_tuple] = 1

### 7. Propagating User Labels
Raha then propagates each user label through its cluster.

In [None]:
# Propagate each user label through its cluster.
app_1.propagate_labels(d)

### 8. Predicting Labels of Data Cells
Raha then trains and applies one classifier per data column to predict the labels of the remaining data cells.

In [None]:
# Train and apply one classifier per data column to predict the labels of the remaining cells.
app_1.predict_labels(d)

### 9. Storing Results
Raha can also store the error detection results.

In [None]:
# Store the error detection results.
app_1.store_results(d)

### 10. Evaluating the Error Detection Task
We can finally evaluate our error detection task.

In [None]:
# The first three values of the evaluation result are reported as precision, recall, and F1.
detection_scores = d.get_data_cleaning_evaluation(d.detected_cells)[:3]
p, r, f = detection_scores
detection_report = "Raha's performance on {}:\nPrecision = {:.2f}\nRecall = {:.2f}\nF1 = {:.2f}"
print(detection_report.format(d.name, p, r, f))

## Error Correction with Baran

### 1. Instantiating the Correction Class
We first instantiate the `Correction` class.

In [None]:
# Create the corrector from Raha's `dask_version.correction_parallel` module.
app_2 = raha.dask_version.correction_parallel.Correction()

# Show the logs while the pipeline runs.
app_2.VERBOSE = True

# Labeling budget: the number of tuples to label.
app_2.LABELING_BUDGET = 20

### 2. Initializing the Dataset Object
We next initialize the dataset object.

In [None]:
# Pass the dataset object from the detection stage to Baran; `d` is rebound to the returned object.
d = app_2.initialize_dataset(d)
# Preview the first rows of the dataframe.
d.dataframe.head()

### 3. Initializing the Error Corrector Models
Baran initializes the error corrector models.

In [None]:
# Initialize Baran's error corrector models.
app_2.initialize_models(d)

### 4. Interactive Tuple Sampling, Labeling, Model updating, Feature Generation, and Correction Prediction
Baran then iteratively samples a tuple. We should label data cells of each sampled tuple. It then updates the models accordingly and generates a feature vector for each pair of a data error and a correction candidate. Finally, it trains and applies a classifier to each data column to predict the final correction of each data error. Since we already labeled tuples for Raha, we use the same labeled tuples and do not label new tuples here.

In [None]:
# while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
#     app_2.sample_tuple(d)
#     if d.has_ground_truth:
#         app_2.label_with_ground_truth(d)
#     else:
#         print("Label the dirty cells in the following sampled tuple.")
#         sampled_tuple = pandas.DataFrame(data=[d.dataframe.iloc[d.sampled_tuple, :]], columns=d.dataframe.columns)
#         IPython.display.display(sampled_tuple)
#         for j in range(d.dataframe.shape[1]):
#             cell = (d.sampled_tuple, j)
#             value = d.dataframe.iloc[cell]
#             correction = input("What is the correction for value '{}'? Type in the same value if it is not erronous.\n".format(value))
#             user_label = 1 if value != correction else 0
#             d.labeled_cells[cell] = [user_label, correction]
#         d.labeled_tuples[d.sampled_tuple] = 1
#     app_2.update_models(d)
#     app_2.predict_corrections(d)

# No new tuples are labeled here: each tuple already labeled for Raha is set as the
# current sampled tuple, then the models are updated and corrections are predicted.
for labeled_tuple_index in d.labeled_tuples:
    d.sampled_tuple = labeled_tuple_index
    app_2.update_models(d)
    app_2.predict_corrections(d)

### 5. Storing Results
Baran can also store the error correction results.

In [None]:
# Store the error correction results.
app_2.store_results(d)

### 6. Evaluating the Error Correction Task
We can finally evaluate our error correction task.

In [None]:
# The last three values of the evaluation result are reported as precision, recall, and F1.
correction_scores = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
p, r, f = correction_scores
correction_report = "Baran's performance on {}:\nPrecision = {:.2f}\nRecall = {:.2f}\nF1 = {:.2f}"
print(correction_report.format(d.name, p, r, f))