#### Import Parent Module

In [None]:
import os
import sys
# Put the parent directory on the import path so that modules located one
# level above this notebook can be imported below.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from jupyter_notebook import load_parameters

In [2]:
import pandas as pd
import os


#### Read Luigi Parameters

In [None]:
# Fetch the parameters for this run through the project's `load_parameters` helper.
pars = load_parameters()

# Location of the CSV to read and of the file the processed result is written to.
input_path, output_file = pars.get('input'), pars.get('output')

# Column holding the code snippets, and column that receives the processed snippets.
input_col, output_col = pars.get('input_col'), pars.get('output_col')

In [None]:
# Load the ';'-separated input file. Malformed rows are skipped with a warning
# instead of aborting the load. `on_bad_lines='warn'` (pandas >= 1.3) replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0, where passing it raises a TypeError.
dataset = pd.read_csv(input_path, encoding='ISO-8859-1', on_bad_lines='warn', sep=";")
print('Amount of entries in the dataset = %d' % len(dataset))

#### Process Terminal-Like Code
The goal of this function is to strip the prompt tags and printed output from notebook- or terminal-like code, keeping only the code that was entered.

Example:
```python
In [11]: df
df.append()
Out[11]: 
A B C D 
2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
2000-01-04 0.83935 0.15993 0.95911 -1.12959

In [12]: df[(df.values > 1.5).any(1)]
Out[12]: 
A B C D 
2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
2000-01-06 0.7167 -0.2671 1.36029 1.7425
```

Should extract only:
```python
df
df.append()
df[(df.values > 1.5).any(1)]
```


In [2]:
import re

# Matches an input prompt such as "In [11]:" and captures the text after it up
# to whichever comes first: an "Out[..]" marker, the next "In [..]:" prompt, or
# the end of the text. The terminator is a lookahead so that a following
# "In [..]:" prompt is still available to start the next match.
# Raw string: "\[" is not a valid escape sequence in a normal string literal.
_TERMINAL_CELL_RE = re.compile(
    r'In \[.+?\]:(.*?)(?=Out\[.+?\]|In \[.+?\]:|\Z)',
    re.DOTALL,
)


def preprocessTerminalLikeCode(code):
    """Extract the code entered at notebook/terminal-style ``In [n]:`` prompts.

    Args:
        code: Text that may contain ``In [n]:`` / ``Out[n]`` markers.

    Returns:
        A list with one string per ``In [n]:`` prompt found, in order of
        appearance, with leading whitespace removed. Each string ends where
        the matching ``Out[n]`` marker, the next ``In [n]:`` prompt, or the
        end of ``code`` begins, so cells that produced no output are returned
        as separate entries instead of being merged into the following cell.
        The list is empty when ``code`` contains no ``In [n]:`` prompt.
    """
    return [cell.lstrip() for cell in _TERMINAL_CELL_RE.findall(code)]

#### Remove any non-parseable python line

The goal of this function is to infer which lines are valid Python code by checking whether each line can be parsed.

For example, in the following code:
```python
df
df.append()

A B C D 
2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
2000-01-04 0.83935 0.15993 0.95911 -1.12959

df[(df.values > 1.5).any(1)]
 
A B C D 
2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
2000-01-06 0.7167 -0.2671 1.36029 1.7425
```

The function extracts only the valid python code lines:
```python
df
df.append()
df[(df.values > 1.5).any(1)]
```

In [3]:
import ast

def preprocessOutputCode(code):
    """Keep only the lines of ``code`` that parse as Python on their own.

    Every line is handed to ``ast.parse`` in isolation; lines the parser
    rejects (typically printed output such as table rows) are dropped.

    Note: because lines are checked one at a time, lines that are only valid
    as part of a multi-line construct (a ``def f():`` header, an indented
    body line) are rejected as well. Empty lines parse successfully and are
    kept.

    Args:
        code: Text to filter, with lines separated by ``'\\n'``.

    Returns:
        The accepted lines, in their original order, joined with ``'\\n'``.
    """
    kept = []
    for line in code.split('\n'):
        try:
            ast.parse(line)
        except (SyntaxError, ValueError, MemoryError, RecursionError):
            # Parser rejected the line: treat it as non-code and drop it.
            # Only parser failures are caught, so KeyboardInterrupt and
            # SystemExit still propagate and a long run can be stopped.
            continue
        kept.append(line)
    return '\n'.join(kept)
            


#### Pre-process the dataset

Use the methods above to preprocess the code stored in the `input_col` column of the dataset and write the result to the `output_col` column of the output dataframe.


In [4]:
import ast

def preprocessCode(code):
    """Clean one code snippet.

    First the snippet is passed to ``preprocessTerminalLikeCode``; if that
    yields any pieces they are concatenated and replace the snippet,
    otherwise the snippet is used unchanged. The result is then filtered
    through ``preprocessOutputCode`` and returned.
    """
    cells = preprocessTerminalLikeCode(code)
    # Nothing extracted: fall back to the snippet exactly as given.
    source = ''.join(cells) if cells else code
    return preprocessOutputCode(source)
    
def preprocessPost(post, col, out_col=None):
    """Preprocess every code snippet stored in one row.

    Args:
        post: Row being processed (a mapping-like object indexed by column
            name); it is updated in place and returned.
        col: Name of the column holding the string representation of a list
            of code snippets.
        out_col: Name of the column that receives the processed snippets.
            Defaults to the notebook-level ``output_col`` when omitted.

    Returns:
        ``post`` with the list of processed snippets stored under the output
        column.
    """
    raw = post[col]
    # ast.literal_eval raises on non-string input, e.g. the NaN that read_csv
    # produces for an empty cell; such rows are treated as having no snippets.
    # NOTE(review): assumes "no snippets" is the desired result for missing
    # cells - confirm against downstream consumers.
    codes = ast.literal_eval(raw) if isinstance(raw, str) else []
    target = output_col if out_col is None else out_col
    post[target] = [preprocessCode(code) for code in codes]
    return post


# Run the per-row preprocessing over the whole dataset; `input_col` is
# forwarded to `preprocessPost` as its `col` argument.
processed_dataset = dataset.apply(
    preprocessPost,
    axis='columns',
    args=(input_col,),
)

In [None]:
# Write the processed rows out with the same encoding and ';' separator that
# were used to read the input; the row index is not written.
processed_dataset.to_csv(
    output_file,
    sep=';',
    encoding='ISO-8859-1',
    doublequote=True,
    index=False,
)