# Dataprocessing for getting only relavant data

### Data processing on train data first

In [1]:
import pandas as pd

with open('buggy_dataset/bugfixes_train.pickle', 'rb') as handle:
    df = pd.read_pickle(handle)  # Uses Pandas' built-in method

print(df.head())
print(df.describe())
print(df.info())

                                             after_merge  \
1790   def plot(result_pickle_file_path, show, plot_s...   
2433       def stream_logs(self):\n        """Stream ...   
26618      def addRecentProjectFile(self, projectFile...   
26622      def addSfmAugmentation(self, withMVS=False...   
28217      def load_pymathics_doc(self):\n        if ...   

                                            before_merge  \
1790   def plot(result_dict_file, show, plot_save_fil...   
2433       def stream_logs(self):\n        """Stream ...   
26618      def addRecentProjectFile(self, projectFile...   
26622      def addSfmAugmentation(self, withMVS=False...   
28217      def load_pymathics_doc(self):\n        if ...   

                                               filename  \
1790   rqalpha/mod/rqalpha_mod_sys_analyser/__init__.py   
2433                                 binderhub/build.py   
26618                                meshroom/ui/app.py   
26622                     meshroom/ui/reco

In [2]:
df1 = df[["before_merge", "after_merge", "traceback_type", "full_traceback"]]
print(df1.head())
    

                                            before_merge  \
1790   def plot(result_dict_file, show, plot_save_fil...   
2433       def stream_logs(self):\n        """Stream ...   
26618      def addRecentProjectFile(self, projectFile...   
26622      def addSfmAugmentation(self, withMVS=False...   
28217      def load_pymathics_doc(self):\n        if ...   

                                             after_merge     traceback_type  \
1790   def plot(result_pickle_file_path, show, plot_s...          TypeError   
2433       def stream_logs(self):\n        """Stream ...  FileNotFoundError   
26618      def addRecentProjectFile(self, projectFile...            OSError   
26622      def addSfmAugmentation(self, withMVS=False...       RuntimeError   
28217      def load_pymathics_doc(self):\n        if ...           KeyError   

                                          full_traceback  
1790   Traceback (most recent call last):\nFile "c:\p...  
2433   / # jupyter-repo2docker https://github.

In [3]:
import ast
import json
import difflib
import textwrap

# AST functions
def get_ast_json(code):
    """Convert Python code to an AST representation in JSON format."""
    if not isinstance(code, str) or not code.strip():
        print(f"Skipping empty or invalid code: {repr(code)}")
        return []
    try:
        # Remove all leading whitespace (including newlines) from the start
        code = code.lstrip()
        if not code:
            print(f"Code became empty after stripping: {repr(code)}")
            return []
        # Dedent to normalize any remaining indentation
        code = textwrap.dedent(code)
        tree = ast.parse(code)
        nodes = [ast.dump(node) for node in ast.walk(tree)]
        return nodes
    except SyntaxError as e:
        print(f"SyntaxError in code: {repr(code)}\nError: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error in code: {repr(code)}\nError: {e}")
        return []

def generate_ast_diff_json(old_code, new_code):
    """Generate a structured JSON diff between old and new ASTs."""
    old_ast = get_ast_json(old_code)
    new_ast = get_ast_json(new_code)

    # Compute AST differences
    diff = list(difflib.unified_diff(old_ast, new_ast, lineterm=""))

    old_ast_json = json.dumps(old_ast, indent=4)
    new_ast_json = json.dumps(new_ast, indent=4)
    return diff, old_ast_json, new_ast_json


In [4]:
data = df1.to_dict(orient='records')
for i, record in enumerate(data):
    try:
        old_code = record["before_merge"]
        new_code = record["after_merge"]
        record["ast_diff"], record["old_ast_json"], record["new_ast_json"] = generate_ast_diff_json(old_code, new_code)
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break





In [5]:
import pickle

# Save to a .pkl file
with open("train_data_postprocessed.pkl", "wb") as f:
    pickle.dump(data, f)

###  To read the train data after processing and checks

In [6]:
import pandas as pd

with open('train_data_postprocessed.pkl', 'rb') as handle:
    df = pd.read_pickle(handle)  # Uses Pandas' built-in method

# Convert to DataFrame
df = pd.DataFrame(df)

print(df.head())
print("/n" + "-" * 50 + "/n")
print(df.describe())
print("/n" + "-" * 50 + "/n")
print(df.info())

                                        before_merge  \
0  def plot(result_dict_file, show, plot_save_fil...   
1      def stream_logs(self):\n        """Stream ...   
2      def addRecentProjectFile(self, projectFile...   
3      def addSfmAugmentation(self, withMVS=False...   
4      def load_pymathics_doc(self):\n        if ...   

                                         after_merge     traceback_type  \
0  def plot(result_pickle_file_path, show, plot_s...          TypeError   
1      def stream_logs(self):\n        """Stream ...  FileNotFoundError   
2      def addRecentProjectFile(self, projectFile...            OSError   
3      def addSfmAugmentation(self, withMVS=False...       RuntimeError   
4      def load_pymathics_doc(self):\n        if ...           KeyError   

                                      full_traceback  \
0  Traceback (most recent call last):\nFile "c:\p...   
1  / # jupyter-repo2docker https://github.com/yuv...   
2  [2020-05-23 16:12:48,660][ERROR] Tracebac

In [7]:
# Checking each record in the DataFrame so there are no errors

for i, record in enumerate(df.to_dict(orient='records')):
    try:
        assert "before_merge" in record
        assert "after_merge" in record
        assert "traceback_type" in record
        assert "full_traceback" in record
        assert "ast_diff" in record
        assert "old_ast_json" in record
        assert "new_ast_json" in record
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break
print("All records processed successfully!")

All records processed successfully!


In [8]:
# Checking that there are no null values in the DataFrame
assert not df.isnull().values.any()
print("No null values in the DataFrame!")

No null values in the DataFrame!


### Extending this for valid and test pickle files in buggy dataset

In [9]:
with open('buggy_dataset/bugfixes_test.pickle', 'rb') as handle:
    df99 = pd.read_pickle(handle)  # Uses Pandas' built-in method

df100 = df99[["before_merge", "after_merge", "traceback_type", "full_traceback"]]

data = df100.to_dict(orient='records')
for i, record in enumerate(data):
    try:
        old_code = record["before_merge"]
        new_code = record["after_merge"]
        record["ast_diff"], record["old_ast_json"], record["new_ast_json"] = generate_ast_diff_json(old_code, new_code)
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break

# Save to a .pkl file
with open("test_data_postprocessed.pkl", "wb") as f:
    pickle.dump(data, f)
    

SyntaxError in code: "@classmethod\n    def _load_model_state(cls, checkpoint: Dict[str, Any], *cls_args, **cls_kwargs):\n        cls_spec = inspect.getfullargspec(cls.__init__)\n        cls_init_args_name = inspect.signature(cls).parameters.keys()\n        # pass in the values we saved automatically\n        if cls.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint:\n            model_args = {}\n\n            # add some back compatibility, the actual one shall be last\n            for hparam_key in CHECKPOINT_PAST_HPARAMS_KEYS + (cls.CHECKPOINT_HYPER_PARAMS_KEY,):\n                if hparam_key in checkpoint:\n                    model_args.update(checkpoint[hparam_key])\n\n            model_args = _convert_loaded_hparams(model_args, checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_TYPE))\n\n            args_name = checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_NAME)\n\n            if args_name == 'kwargs':\n                # in case the class cannot take any extra argument filter only the possible\n   



SyntaxError in code: '@command.command("export.file")\n    def file(self, fmt: str, f: flow.Flow, path: mitmproxy.types.Path) -> None:\n        """\n            Export a flow to path.\n        """\n        if fmt not in formats:\n            raise exceptions.CommandError("No such export format: %s" % fmt)\n        func = formats[fmt]  # type: typing.Any\n        v = func(f)\n        with open(path, "wb") as fp:\n            if isinstance(v, bytes):\n                fp.write(v)\n            else:\n                fp.write(v.encode("utf-8"))\n'
Error: unexpected indent (<unknown>, line 2)
SyntaxError in code: '@command.command("export.file")\n    def file(self, fmt: str, f: flow.Flow, path: mitmproxy.types.Path) -> None:\n        """\n            Export a flow to path.\n        """\n        if fmt not in formats:\n            raise exceptions.CommandError("No such export format: %s" % fmt)\n        func = formats[fmt]  # type: typing.Any\n        v = func(f)\n        try:\n            wi



SyntaxError in code: '@classmethod\n    def get_deps_from_req(cls, req, resolver=None):\n        # type: (Requirement, Optional["Resolver"]) -> Tuple[Set[str], Dict[str, Dict[str, Union[str, bool, List[str]]]]]\n        from .vendor.requirementslib.models.utils import _requirement_to_str_lowercase_name\n        from .vendor.requirementslib.models.requirements import Requirement\n        from requirementslib.utils import is_installable_dir\n        # TODO: this is way too complex, refactor this\n        constraints = set()  # type: Set[str]\n        locked_deps = dict()  # type: Dict[str, Dict[str, Union[str, bool, List[str]]]]\n        if (req.is_file_or_url or req.is_vcs) and not req.is_wheel:\n            # for local packages with setup.py files and potential direct url deps:\n            if req.is_vcs:\n                req_list, lockfile = get_vcs_deps(reqs=[req])\n                req = next(iter(req for req in req_list if req is not None), req_list)\n                entry = lockfil

In [10]:
with open('buggy_dataset/bugfixes_valid.pickle', 'rb') as handle:
    df990 = pd.read_pickle(handle)  # Uses Pandas' built-in method

df1000 = df990[["before_merge", "after_merge", "traceback_type", "full_traceback"]]

data = df1000.to_dict(orient='records')
for i, record in enumerate(data):
    try:
        old_code = record["before_merge"]
        new_code = record["after_merge"]
        record["ast_diff"], record["old_ast_json"], record["new_ast_json"] = generate_ast_diff_json(old_code, new_code)
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break

# Save to a .pkl file
with open("valid_data_postprocessed.pkl", "wb") as f:
    pickle.dump(data, f)
    



### Checks on test pickle file

In [11]:
import pandas as pd

with open('test_data_postprocessed.pkl', 'rb') as handle:
    df = pd.read_pickle(handle)  # Uses Pandas' built-in method

# Convert to DataFrame
df = pd.DataFrame(df)

print(df.head())
print("/n" + "-" * 50 + "/n")
print(df.describe())
print("/n" + "-" * 50 + "/n")
print(df.info())

                                        before_merge  \
0  def remove_lb_backend_address_pool_address(cmd...   
1      def split_action(arguments):\n        clas...   
2      def parse_series(self, data, **kwargs):\n ...   
3      def __init__(self, **kwargs):\n        # S...   
4      def dump_checkpoint(self, weights_only: bo...   

                                         after_merge  \
0  def remove_lb_backend_address_pool_address(cmd...   
1      def split_action(arguments):\n        clas...   
2      def parse_series(self, data, **kwargs):\n ...   
3      def __init__(self, **kwargs):\n        # S...   
4      def dump_checkpoint(self, weights_only: bo...   

                  traceback_type  \
0                 AttributeError   
1                 AttributeError   
2                 AttributeError   
3  pygmt.exceptions.GMTCLibError   
4                 AttributeError   

                                      full_traceback  \
0  john@Azure:~$ az network lb address-pool addre... 

In [12]:
# Checking each record in the DataFrame so there are no errors

for i, record in enumerate(df.to_dict(orient='records')):
    try:
        assert "before_merge" in record
        assert "after_merge" in record
        assert "traceback_type" in record
        assert "full_traceback" in record
        assert "ast_diff" in record
        assert "old_ast_json" in record
        assert "new_ast_json" in record
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break
print("All records processed successfully!")

All records processed successfully!


In [13]:
# Checking that there are no null values in the DataFrame
assert not df.isnull().values.any()
print("No null values in the DataFrame!")

No null values in the DataFrame!


### Checks on valid pickle file

In [14]:
import pandas as pd

with open('valid_data_postprocessed.pkl', 'rb') as handle:
    df = pd.read_pickle(handle)  # Uses Pandas' built-in method

# Convert to DataFrame
df = pd.DataFrame(df)

print(df.head())
print("/n" + "-" * 50 + "/n")
print(df.describe())
print("/n" + "-" * 50 + "/n")
print(df.info())

                                        before_merge  \
0  def hough_line_peaks(hspace, angles, dists, mi...   
1  def find_contours(array, level,\n             ...   
2  def _assemble_contours(points_iterator):\n    ...   
3  def file_or_url_context(resource_name):\n    "...   
4  def file_or_url_context(resource_name):\n    "...   

                                         after_merge traceback_type  \
0  def hough_line_peaks(hspace, angles, dists, mi...     IndexError   
1  def find_contours(array, level,\n             ...       KeyError   
2  def _assemble_contours(segments):\n    current...       KeyError   
3  def file_or_url_context(resource_name):\n    "...        OSError   
4  def file_or_url_context(resource_name):\n    "...        OSError   

                                      full_traceback  \
0  Traceback (most recent call last):\nFile "<ipy...   
1  ----------------------------------------------...   
2  ----------------------------------------------...   
3  Traceback

In [15]:
# Checking each record in the DataFrame so there are no errors

for i, record in enumerate(df.to_dict(orient='records')):
    try:
        assert "before_merge" in record
        assert "after_merge" in record
        assert "traceback_type" in record
        assert "full_traceback" in record
        assert "ast_diff" in record
        assert "old_ast_json" in record
        assert "new_ast_json" in record
    except Exception as e:
        print(f"Error processing record {i}: {e}")
        print(f"Record: {record}")
        break
print("All records processed successfully!")

All records processed successfully!


In [16]:
# Checking that there are no null values in the DataFrame
assert not df.isnull().values.any()
print("No null values in the DataFrame!")

No null values in the DataFrame!
