# In [23]:
import os
import xml.etree.ElementTree as ET
import logging
from subprocess import *

# In [29]:
import sys
from subprocess import call
from pathlib import Path

def syscmd(cmd, encoding=''):
    """
    Run a shell command, wait for it to finish, and return its output.

    stderr is merged into stdout, so the returned text contains both streams.

    :param cmd: command line, interpreted by the system shell.
    :param encoding: if non-empty, the captured bytes are decoded with this
        codec and a ``str`` is returned; otherwise raw ``bytes`` are returned.
    :return: the captured output when it is longer than one byte, else the
        command's integer return code (an output of zero or one byte is
        treated as "no output").
    """
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT,
              close_fds=True)
    # communicate() drains stdout while waiting for the child. Calling
    # wait() first and read() afterwards deadlocks as soon as the child
    # writes more than the OS pipe buffer can hold.
    output, _ = p.communicate()
    if len(output) > 1:
        return output.decode(encoding) if encoding else output
    return p.returncode
class Source_Loader():
    """
    Read one source file, run it through a clean-up pipeline and write the
    result to ``test.cpp``.

    The pipeline (see ``preprocess``) replaces non-ASCII characters, drops
    ``//`` comments and blank lines, and removes functions that cppcheck
    reports as unused.
    """

    def __init__(self, dir):
        """
        :param dir: path of the source file to process. Despite the name this
            is a file path: ``run`` opens it for reading.
        """
        self.dir = dir
        self.compiler = 'g++'
        self.tokenizer = self._ensure_tokenizer_exists()

    def remove_non_ascii(self, code):
        """
        Replace every non-ASCII character in ``code`` with a single space.

        Empty input is returned unchanged (``""`` stays ``""``) so that the
        next pipeline stage still receives a string.
        """
        if not code:
            return code
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in code)

    def remove_apostrophe(self, code):
        """
        Remove apostrophes that sit directly between two digits
        (e.g. ``1'000`` becomes ``1000``); all other characters are kept.
        """
        kept = []
        last = len(code) - 1
        for index, ch in enumerate(code):
            # Only an interior apostrophe has both neighbours. Without the
            # bounds check index 0 would look at the last character and the
            # final index would raise IndexError.
            if (ch == '\'' and 0 < index < last
                    and code[index - 1].isdigit()
                    and code[index + 1].isdigit()):
                continue
            kept.append(ch)
        return ''.join(kept)

    def remove_external_includes(self, code):
        """
        Drop every line that contains one of the ``exclude`` substrings.

        The exclusion list is currently empty, so the code is returned
        unchanged.
        """
        exclude = []
        return "\n".join(
            line for line in code.split("\n")
            if not any(restricted in line for restricted in exclude)
        )

    def remove_unused_fcalls(self, code):
        """
        Detect and remove unused functions using the cppcheck tool.

        Dependency: cppcheck

        The code is written to ``self.dir``, cppcheck is run on that file and
        its XML report (``./log/log.xml``) is parsed. For each
        ``unusedFunction`` entry, lines are removed from the reported line up
        to the line where the braces balance again. Empty lines are dropped
        from the result as well.

        :param code: source text to analyse.
        :return: the reduced code, or ``code`` unchanged when the report is
            missing, unparsable or contains no entries.
        """
        import shlex

        source_path = self.dir
        # NOTE(review): this overwrites the file at self.dir with the
        # preprocessed code, because cppcheck needs a file on disk.
        self.dump_source(source_path, code)
        log_path = "./log/log.xml"
        # The shell redirect below fails if the log directory is missing.
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        # Run cppcheck; paths are quoted so spaces or shell metacharacters in
        # a file name cannot break (or inject into) the command line.
        cmd = "cppcheck --enable=all --xml --xml-version=2 {} 2>{}"\
            .format(shlex.quote(str(source_path)), shlex.quote(log_path))
        syscmd(cmd)

        try:
            root = ET.parse(log_path).getroot()
        except (OSError, ET.ParseError) as e:
            logging.critical(e)
            return code

        errors = root.find("errors")
        # Explicit checks instead of Element truthiness, which is deprecated
        # and is False for an element that merely has no children.
        if errors is None or len(errors) == 0:
            return code

        lines = code.split("\n")
        remove_lines = set()

        for error in errors.findall("error"):
            if error.get('id') != "unusedFunction":
                continue

            location = error.find('location')
            if location is None:
                continue
            try:
                start = int(location.get('line')) - 1
            except (TypeError, ValueError):
                continue
            if start < 0:
                continue

            # Walk forward from the reported line, removing lines until the
            # braces opened by the function have all been closed again.
            depth = 0
            seen_the_end = False
            for index in range(start, len(lines)):
                remove_lines.add(index)
                for ch in lines[index]:
                    if ch == "{":
                        depth += 1
                    elif ch == "}":
                        depth -= 1
                        seen_the_end = True
                if depth == 0 and seen_the_end:
                    break

        return "\n".join(
            line for idx, line in enumerate(lines)
            if idx not in remove_lines and len(line) > 0
        )

    def remove_unused_code(self, code):
        """
        Strip whitespace around each line and drop blank lines and ``//``
        comments, including comments continued onto following lines with a
        trailing backslash.

        Lines starting with one of the ``excluded`` prefixes (and not
        containing ``*/``) are dropped as well; that list is currently empty.
        """
        excluded = []

        res = []
        in_comment_block = False

        for raw_line in code.split("\n"):
            line = raw_line.strip()

            if in_comment_block:
                # This line is the continuation of a // comment; the comment
                # goes on only if this line ends with a backslash too.
                in_comment_block = line.endswith("\\")
                continue

            if not line:
                continue

            if line.startswith("//"):
                # A single trailing backslash continues the comment.
                in_comment_block = line.endswith("\\")
                continue

            if any(line.startswith(s) and "*/" not in line for s in excluded):
                continue

            res.append(line)

        return "\n".join(res)

    def preprocess(self, code):
        """Run ``code`` through every clean-up stage in order and return it."""
        pipeline = [
            self.remove_non_ascii,
            self.remove_external_includes,
            self.remove_unused_code,
            self.remove_unused_fcalls
        ]
        for func in pipeline:
            code = func(code)

        return code

    def _ensure_tokenizer_exists(self):
        """
        Build the tokenizer executable with ``self.compiler`` if it is not
        present yet.

        :return: path of the tokenizer executable, relative to the working
            directory at call time.
        """
        tokenizer_dir = "../tokenizer" + "/src"
        tokenizer_exe = "tokenizer"
        tokenizer_path = tokenizer_dir + "/" + tokenizer_exe

        if not os.path.exists(tokenizer_path):
            current_path = Path.cwd()
            os.chdir(tokenizer_dir)
            try:
                # We are inside tokenizer_dir now, so the output name must be
                # relative to it; tokenizer_path is relative to the old cwd.
                syscmd("{} *.cpp *.h -o {}".format(self.compiler, tokenizer_exe))
            finally:
                # Restore the working directory even if the build raises.
                os.chdir(current_path)

        return tokenizer_path

    @staticmethod
    def dump_source(path: str, code: str):
        """ Write solution to file """
        with open(path, "w") as f:
            f.write(code)

    def run(self):
        """
        Read the file at ``self.dir``, strip and drop blank lines, preprocess
        the result and write it to ``test.cpp`` in the working directory.
        """
        with open(self.dir, "r") as code_file:
            stripped = (line.strip() for line in code_file)
            code = '\n'.join(line for line in stripped if line)

        code = self.preprocess(code)

        with open("test.cpp", "w") as f:
            f.write(code)
            

# In [30]:
# Process every .cpp file found one level below input_dir
# (layout: input_dir/<folder>/<file>.cpp).
input_dir = "../test/"
for folder in os.listdir(input_dir):
    folder_path = os.path.join(input_dir, folder)
    # Skip anything that is not a directory (e.g. .DS_Store or other stray
    # files); calling os.listdir on a file raises NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        if file.endswith(".cpp"):
            filepath = os.path.join(folder_path, file)
            Source_Loader(filepath).run()
                
