In [2]:
import os
import sys
# Put the directory two levels above the working directory on sys.path
# (presumably so the project-local import just below resolves).
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)
    
from jupyter_notebook import load_parameters 

In [None]:
import pandas as pd

# Fetch the parameter mapping this notebook is driven by.
pars = load_parameters()

# Paths of the pickle that is read and of the pickle that is written.
input_file, output_file = pars.get('input'), pars.get('output')

# Column holding the code to clean, and column receiving the cleaned code.
input_col, output_col = pars.get('input_col'), pars.get('output_col')

# When truthy, a CSV copy of the result is written next to the pickle.
debug = pars.get('debug')

In [8]:
# Load the dataset from the pickle file named by the 'input' parameter.
# NOTE(review): unpickling can execute arbitrary code -- only point
# input_file at files from a trusted source.
dataset = pd.read_pickle(input_file)

#### Remove any non-parseable python line

The goal of this function is to keep only the lines that are valid Python, using parseability as the test: any line that stops the snippet from parsing is dropped.

For example, given the following snippet:
```python
df
df.append()

A B C D 
2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
2000-01-04 0.83935 0.15993 0.95911 -1.12959

df[(df.values > 1.5).any(1)]
 
A B C D 
2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
2000-01-06 0.7167 -0.2671 1.36029 1.7425
```

The function extracts only the valid python code lines:
```python
df
df.append()
df[(df.values > 1.5).any(1)]
```

In [46]:
import ast
import os

def preprocessParseableCodeInteractively(code):
    r"""Drop the lines that prevent ``code`` from parsing as Python.

    The text is handed to ``ast.parse``. Each time a ``SyntaxError`` is
    raised, the line the error points at is removed and parsing is retried,
    until what is left parses.

    Lines are located the way the Python parser counts them: ``\n``,
    ``\r\n`` and a lone ``\r`` each end a line. The result therefore does
    not depend on ``os.linesep`` of the machine running the notebook, and
    the terminators of the lines that are kept are left untouched.

    Args:
        code (str): Text that may mix Python statements with non-Python
            lines (tabular output, error messages, ...).

    Returns:
        str: The text that remains once every offending line has been
        removed. An empty string is returned when nothing parseable can be
        produced: the parser reports no usable line number, the reported
        line lies outside the text, or the text cannot be compiled at all.
    """
    import re  # function-scope import: the fix needs no other cell to change

    # A capturing group makes re.split return
    # [line, terminator, line, terminator, ..., line].
    pieces = re.split(r'(\r\n|\r|\n)', code)
    lines = pieces[0::2]
    seps = pieces[1::2]

    # Every failed attempt removes exactly one line, so len(lines) + 1
    # attempts are always enough to either succeed or give up.
    for _ in range(len(lines) + 1):
        candidate = ''.join(text + sep for text, sep in zip(lines, seps + ['']))
        try:
            ast.parse(candidate)
        except SyntaxError as err:
            # lineno is 1-based and may be missing altogether.
            bad_idx = None if err.lineno is None else err.lineno - 1
        except ValueError:
            # Older interpreters reject null bytes with ValueError rather
            # than SyntaxError; such text cannot be repaired line by line.
            return ''
        else:
            return candidate

        if bad_idx is None or not 0 <= bad_idx < len(lines):
            # The error cannot be mapped onto a line of the text.
            return ''

        # Remove the offending line together with one terminator: the one
        # after it, or the one before it when it is the last line.
        del lines[bad_idx]
        if seps:
            del seps[min(bad_idx, len(seps) - 1)]

    return ''
            
    


In [86]:
# test ="""
# df
# df.append()

# A B C D
# 2000-01-03 -0.59885 -0.18141 -0.68828 -0.77572
# 2000-01-04 0.83935 0.15993 0.95911 -1.12959

# df[(df.values > 1.5).any(1)]

# A B C D
# 2000-01-05 2.8021 -0.1086 -1.62114 -0.2017
# 2000-01-06 0.7167 -0.2671 1.36029 1.7425
# """

# preprocessParseableCodeInteractively(test.replace('\n', os.linesep))

'\r\ndf\r\ndf.append()\r\n\r\n\r\ndf[(df.values > 1.5).any(1)]\r\n\r\n'

In [87]:
# test2 = """
# Date,Open,High,Low,Close,Volume,Adj Close
# 2011-10-19,27.37,27.47,27.01,27.13,42880000,27.13
# 2011-10-18,26.94,27.40,26.80,27.31,52487900,27.31
# 2011-10-17,27.11,27.42,26.85,26.98,39433400,26.98
# 2011-10-14,27.31,27.50,27.02,27.27,50947700,27.27

# ....

# #!/usr/bin/env python
# from pandas import *
# df = read_csv('table.csv')

# for i, row in enumerate(df.values):
#     date = df.index[i]
#     open, high, low, close, adjclose = row
#     #now perform analysis on open/close based on date, etc..

# """

# print(preprocessParseableCodeInteractively(test2.replace('\n', os.linesep)))


2011-10-19,27.37,27.47,27.01,27.13,42880000,27.13
2011-10-18,26.94,27.40,26.80,27.31,52487900,27.31
2011-10-17,27.11,27.42,26.85,26.98,39433400,26.98
2011-10-14,27.31,27.50,27.02,27.27,50947700,27.27


#!/usr/bin/env python
from pandas import *
df = read_csv('table.csv')

for i, row in enumerate(df.values):
    date = df.index[i]
    open, high, low, close, adjclose = row
    #now perform analysis on open/close based on date, etc..




In [7]:
# test3 = """bigdata = data1.append(data2)
# Exception: Index cannot contain duplicate values!
#     meta  particle  ratio   area    type    
# 0   2     part10    1.348   0.8365  touching
# 1   2     part18    1.558   0.8244  single  
# 2   2     part2     1.893   0.894   single  
# 3   2     part37    0.6695  1.005   single  
# ....clip...
# 36  2     part23    1.051   0.8781  single  
# 37  2     part3     80.54   0.9714  nuclei  
# 38  2     part34    1.071   0.9337  single  
#     meta  particle  ratio    area    type    
# 0   3     part10    0.4756   1.025   single  
# 1   3     part18    0.04387  1.232   dusts   
# 2   3     part2     1.132    0.8927  single  
# ...clip...
# 46  3     part46    13.71    1.001   nuclei  
# 47  3     part3     0.7439   0.9038  single  
# 48  3     part34    0.4349   0.9956  single 
# """

# preprocessParseableCodeInteractively(test3.replace('\n', os.linesep))

'bigdata = data1.append(data2)\r\n'

In [11]:
def removeBlankLines(fullcode):
    """Return ``fullcode`` without its empty or whitespace-only lines.

    The text is split on ``os.linesep`` and the remaining lines are joined
    back with ``os.linesep``.
    """
    kept = []
    for row in fullcode.split(os.linesep):
        if row.strip():
            kept.append(row)
    return os.linesep.join(kept)

# test3 = """bigdata = data1.append(data2)
# Exception: Index cannot contain duplicate values!
#     meta  particle  ratio   area    type    
# 0   2     part10    1.348   0.8365  touching
# 1   2     part18    1.558   0.8244  single  


# ....clip...
# 36  2     part23    1.051   0.8781  single  




# ...clip...
# """

# print(removeBlankLines(test3.replace('\n', os.linesep)))

bigdata = data1.append(data2)
Exception: Index cannot contain duplicate values!
    meta  particle  ratio   area    type    
0   2     part10    1.348   0.8365  touching
1   2     part18    1.558   0.8244  single  
....clip...
36  2     part23    1.051   0.8781  single  
...clip...


In [15]:
# import pandas as pd
# dataset = pd.read_csv(os.path.join('../../data/stack-overflow/pandas-preprocessedcode-dataset-part2.csv'), encoding='ISO-8859-1', error_bad_lines=False, sep=",")


# input_col = 'PreprocessedCode2'
# output_col = 'PreprocessedCode3'


In [None]:
def preprocessPost(post):
    """Clean the code held in one row and store the result on that row.

    The value in column ``input_col`` is converted to ``str``, stripped of
    blank lines, and reduced to its parseable lines; the outcome is written
    to column ``output_col`` of the same ``post`` object.

    Args:
        post: A row supporting item access by column name.

    Returns:
        The same ``post`` object, with ``output_col`` set.
    """
    raw_code = str(post[input_col])
    without_blanks = removeBlankLines(raw_code)
    post[output_col] = preprocessParseableCodeInteractively(without_blanks)
    return post

# processed_dataset = dataset[:0].apply(preprocessPost, axis=1)
# dataset[:1].apply(preprocessPost, axis=1)
# Run preprocessPost on every row (axis=1) and keep the result in
# processed_dataset.
processed_dataset = dataset.apply(preprocessPost, axis=1)

In [11]:
# Write the processed dataset as a pickle to the configured output path.
processed_dataset.to_pickle(output_file)

# When the debug parameter is set, also write a CSV copy to the same path
# with ".csv" appended.
# NOTE(review): ISO-8859-1 cannot encode every Unicode character -- confirm
# the data is Latin-1 safe, otherwise to_csv may raise UnicodeEncodeError.
if debug:
    processed_dataset.to_csv(output_file + ".csv", encoding='ISO-8859-1', sep=",", doublequote=True, index=False)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,...,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount,CodeList,Code,Processed
0,27,27,7779260.0,2.0,<p>When I try to merge two dataframes by rows ...,,7,,2011-10-15T08:21:17.460,4.0,...,601314.0,,1,25,<python><pandas>,append two data frame with pandas,25479.0,"['bigdata = data1.append(data2)\r\r\n', 'Excep...",['bigdata = data1.append(data2)\r\r\nException...,[bigdata = data1.append(data2)\r\r\n]
1,31,31,11617194.0,7.0,<p>I want to perform my own complex operations...,,3,,2011-10-20T14:46:14.633,108.0,...,1005409.0,,1,187,<python><performance><for-loop><pandas>,What is the most efficient way to loop through...,195364.0,"['Date,Open,High,Low,Close,Volume,Adj Close\r\...","[""Date,Open,High,Low,Close,Volume,Adj Close\r\...","[2011-10-19,27.37,27.47,27.01,27.13,42880000,2..."
2,49,49,8916746.0,2.0,<p>I have a dataframe <code>df</code> in panda...,,0,,2012-01-18T19:41:27.017,12.0,...,248237.0,,1,26,<python><csv><numpy><tab-delimited><pandas>,selecting across multiple columns with python ...,19805.0,"['df_greater_than10 = df[df[""colA""] > 10]\r\r\n']","['df_greater_than10 = df[df[""colA""] > 10]\r\r\n']","[df_greater_than10 = df[df[""colA""] > 10]\r\r\n]"
3,54,54,8997908.0,3.0,"<p>I recently came across the <a href=""http://...",,16,,2012-01-24T17:59:53.850,58.0,...,345660.0,,1,134,<python><r><join><data.table><pandas>,Why are pandas merges in python faster than da...,16173.0,[],[''],[]
4,73,73,9557319.0,8.0,<p>I used Enthought's python distribution as a...,2013-12-04T20:55:59.357,5,,2012-03-04T14:25:36.287,6.0,...,402468.0,,1,19,<python><numpy><scipy><enthought><pandas>,Open source Enthought Python alternative,6585.0,[],[''],[]
5,80,80,9620832.0,2.0,"<p>I stumbled across <a href=""http://pandas.py...",,2,,2012-03-06T17:01:47.107,5.0,...,1252759.0,,1,21,<python><pandas>,Simple cross-tabulation in pandas,9493.0,"['AB,100.00\r\r\nAB,200.00\r\r\nAC,150.00\r\r\...","['AB,100.00\r\r\nAB,200.00\r\r\nAC,150.00\r\r\...","[AB,100.00\r\r\nAB,200.00\r\r\nAC,150.00\r\r\n..."
6,91,91,9652858.0,3.0,<p>I'm new to python and pandas. I'm trying t...,,0,,2012-03-11T06:00:56.347,3.0,...,914308.0,,1,38,<python><pandas><tsv>,How to I load a tsv file into a Pandas DataFrame?,32040.0,['>>> df1 = DataFrame(csv.reader(open(\'c:/~/t...,['>>> df1 = DataFrame(csv.reader(open(\'c:/~/t...,[\r\r\n]
7,103,103,9762084.0,6.0,<p>I have manipulated some data using pandas a...,,0,,2012-03-18T12:53:06.683,13.0,...,939715.0,,1,34,<python><pandas>,Pandas convert dataframe to array of tuples,36433.0,['In [182]: data_set\r\r\nOut[182]: \r\r\n in...,['In [182]: data_set\r\r\nOut[182]: \r\r\n in...,"[In [182]: data_set\r\r\n(datetime.date(2012,2..."
8,107,107,9772031.0,2.0,"<p>I'm a beginning pandas user, and after stud...",,0,,2012-03-18T22:34:26.183,2.0,...,566942.0,,1,11,<python><pandas>,Add indexed column to DataFrame with pandas,8189.0,['df\r\r\n\r\r\n A ...,['df\r\r\n\r\r\n A ...,[df\r\r\n\r\r\ndf2\r\r\n\r\r\n]
9,109,109,9794891.0,2.0,<p>I want to perform a join/merge/append opera...,,0,,2012-03-20T13:36:09.263,6.0,...,566942.0,,1,17,<python><pandas>,join or merge with overwrite in pandas,6054.0,[],[''],[]
