In [None]:
import os
import sys
# Resolve the directory two levels above the working directory and put it on
# the import path (presumably so the project-local import below resolves).
# The membership test keeps re-running this cell from adding duplicates.
module_path = os.path.abspath("../..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from jupyter_notebook import load_parameters 

In [None]:
import pandas as pd

# Notebook parameters. NOTE(review): `load_parameters` comes from the
# project-local `jupyter_notebook` module; it is assumed to return a dict-like
# object (only `.get` is used on it here) - confirm.
pars = load_parameters()

# Paths the dataset is read from and written to.
input_file, output_file = pars.get('input'), pars.get('output')

# Column holding the raw text, and column that receives the cleaned code.
input_col, output_col = pars.get('input_col'), pars.get('output_col')

# When truthy, a CSV copy of the result is written in addition to the pickle.
debug = pars.get('debug')

In [None]:
# Load the pickled dataset found at `input_file`.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so `input_file` must only ever point at a trusted source.
dataset = pd.read_pickle(input_file)

#### Remove non-parseable Python lines

The goal of this function is to keep only valid Python code, using parseability as the criterion: lines that prevent the snippet from parsing are discarded.

For example, given the following snippet:
```python
df
df.append()

A B C D 
2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
2000-01-04 0.83935 0.15993 0.95911 -1.12959

df[(df.values > 1.5).any(1)]
 
A B C D 
2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
2000-01-06 0.7167 -0.2671 1.36029 1.7425
```

The function keeps only the lines that parse as valid Python:
```python
df
df.append()
df[(df.values > 1.5).any(1)]
```

In [None]:
import ast
import os

def preprocessParseableCodeInteractively(code):
    """Strip the lines that keep ``code`` from parsing as Python.

    The text is handed to ``ast.parse`` repeatedly. Each time a
    ``SyntaxError`` is raised, the line it points at is removed and the
    remaining lines are parsed again, until the text parses.

    Args:
        code (str): Source text, possibly mixed with non-Python lines.
            LF, CRLF and lone CR are all accepted as line breaks,
            independently of the platform the notebook runs on.

    Returns:
        str: ``code`` unchanged if it parses as-is; otherwise the surviving
        lines joined with ``os.linesep``. An empty string is returned when
        no parseable text can be isolated: the error cannot be mapped to a
        line, the parser rejects the text outright (e.g. null bytes), or the
        attempt budget runs out.
    """
    def split_lines(text):
        # Use the same line boundaries as the Python parser, so that
        # SyntaxError.lineno indexes into this list on every platform.
        return text.replace('\r\n', '\n').replace('\r', '\n').split('\n')

    # One line is removed per failed parse, so one attempt per line would be
    # enough; the factor of 3 is only a safety margin against endless loops.
    max_attempts = len(split_lines(code)) * 3
    attempts = 0

    while attempts < max_attempts:
        attempts += 1
        try:
            ast.parse(code)
        except SyntaxError as error:
            bad_lineno = error.lineno
        except ValueError:
            # Older Python versions report null bytes in the source as
            # ValueError; there is no line to blame, so give up on this text
            # instead of aborting the whole run.
            return ''
        else:
            return code

        lines = split_lines(code)
        # Some syntax errors carry no line number, or one outside the text;
        # without a line to remove there is nothing more to try.
        if bad_lineno is None or not 1 <= bad_lineno <= len(lines):
            return ''

        # Drop the offending line (lineno is 1-based) and try again.
        del lines[bad_lineno - 1]
        code = os.linesep.join(lines)

    return ''

In [None]:
# test ="""
# df
# df.append()

# A B C D
# 2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
# 2000-01-04 0.83935 0.15993 0.95911 -1.12959

# df[(df.values > 1.5).any(1)]

# A B C D
# 2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
# 2000-01-06 0.7167 -0.2671 1.36029 1.7425
# """

# preprocessParseableCodeInteractively(test.replace('\n', os.linesep))

In [None]:
# test2 = """
# Date,Open,High,Low,Close,Volume,Adj Close
# 2011-10-19,27.37,27.47,27.01,27.13,42880000,27.13
# 2011-10-18,26.94,27.40,26.80,27.31,52487900,27.31
# 2011-10-17,27.11,27.42,26.85,26.98,39433400,26.98
# 2011-10-14,27.31,27.50,27.02,27.27,50947700,27.27

# ....

# #!/usr/bin/env python
# from pandas import *
# df = read_csv('table.csv')

# for i, row in enumerate(df.values):
#     date = df.index[i]
#     open, high, low, close, adjclose = row
#     #now perform analysis on open/close based on date, etc..

# """

# print(preprocessParseableCodeInteractively(test2.replace('\n', os.linesep)))

In [None]:
# test3 = """bigdata = data1.append(data2)
# Exception: Index cannot contain duplicate values!
#     meta  particle  ratio   area    type    
# 0   2     part10    1.348   0.8365  touching
# 1   2     part18    1.558   0.8244  single  
# 2   2     part2     1.893   0.894   single  
# 3   2     part37    0.6695  1.005   single  
# ....clip...
# 36  2     part23    1.051   0.8781  single  
# 37  2     part3     80.54   0.9714  nuclei  
# 38  2     part34    1.071   0.9337  single  
#     meta  particle  ratio    area    type    
# 0   3     part10    0.4756   1.025   single  
# 1   3     part18    0.04387  1.232   dusts   
# 2   3     part2     1.132    0.8927  single  
# ...clip...
# 46  3     part46    13.71    1.001   nuclei  
# 47  3     part3     0.7439   0.9038  single  
# 48  3     part34    0.4349   0.9956  single 
# """

# preprocessParseableCodeInteractively(test3.replace('\n', os.linesep))

In [None]:
def removeBlankLines(fullcode):
    """Return ``fullcode`` without its empty or whitespace-only lines.

    The text is split on ``os.linesep``; the lines that still contain
    something after stripping are joined back with ``os.linesep``.
    """
    kept = []
    for candidate in fullcode.split(os.linesep):
        # A line that strips down to nothing is blank and gets dropped.
        if candidate.strip():
            kept.append(candidate)
    return os.linesep.join(kept)


In [None]:
def preprocessPost(post):
    """Fill ``post[output_col]`` with the parseable part of ``post[input_col]``.

    The input value is converted to ``str``, stripped of blank lines with
    ``removeBlankLines`` and then reduced to its parseable lines with
    ``preprocessParseableCodeInteractively``. ``post`` is updated in place
    and also returned.

    NOTE(review): because of ``str(...)``, a missing value is turned into
    text such as ``'nan'`` or ``'None'`` before being processed - confirm
    that the input column never holds missing values.
    """
    raw_text = str(post[input_col])
    post[output_col] = preprocessParseableCodeInteractively(removeBlankLines(raw_text))
    return post

# Run preprocessPost on every row (axis=1) and collect the returned rows.
processed_dataset = dataset.apply(preprocessPost, axis=1)

In [None]:
# Persist the processed dataset as a pickle at the configured output path.
processed_dataset.to_pickle(output_file)

# In debug mode, also write a CSV copy next to the pickle.
if debug:
    processed_dataset.to_csv(
        output_file + ".csv",
        sep=",",
        doublequote=True,
        index=False,
        # NOTE(review): ISO-8859-1 cannot represent every Unicode character,
        # so this export may fail on text outside that range - confirm.
        encoding='ISO-8859-1',
    )