## Time to break it down! - POC

In [6]:
import re

# Opening block tag: captures the text between '<' and '>' when the last
# captured character is not '/', so '<NAME/>' is not treated as an opener.
block_tag_begin_regex = r'<(.*[^\/])>'
# Closing block tag: captures the text between '<' and '/>'.
block_tag_end_regex = r'<(.*)\/>'
# Comment signature: captures everything between '/*' and '*/' (greedy).
comment_regex = r'\/\*(.*)\*\/'

# NOTE: `block_region` is not assigned in this cell; it has to exist already
# when the cell runs.
code = block_region
# One list entry per source line; the list index is used as the line number.
delimited_line_region = code.split('\n')

def get_bounds(regex_match):
    """Return the ``(start, end)`` offsets of the whole regex match.

    Args:
        regex_match: an ``re.Match`` produced by e.g. ``re.search``.

    Returns:
        A 2-tuple ``(start, end)`` of character offsets into the searched
        string (``end`` is exclusive).

    Raises:
        TypeError: if ``regex_match`` is not an ``re.Match`` (for example
            ``None`` from a failed search).
    """
    # isinstance instead of an exact type() comparison; TypeError is still an
    # Exception, so existing broad handlers keep working.
    if not isinstance(regex_match, re.Match):
        raise TypeError(f'unsupported type - {type(regex_match)}')
    return regex_match.span()

# Walk the region line by line and report which tag signatures occur on each
# line, together with the character bounds of every hit.
for line_num, line in enumerate(delimited_line_region):
    print(f'Line num: {line_num} - Line: {line}')

    # Run the three searches up front; they are independent of each other.
    block_tag_begin = re.search(block_tag_begin_regex, line)
    block_tag_end = re.search(block_tag_end_regex, line)
    tag_detect = re.search(comment_regex, line)

    # Report the hits in a fixed order: begin, end, comment.
    for _label, _hit in (
        ('block begin detected', block_tag_begin),
        ('block end detected', block_tag_end),
        ('comment signature detected', tag_detect),
    ):
        if not _hit:
            continue
        print(_label)
        bounds = get_bounds(_hit)
        print(f'Bounds: {bounds}')

Line num: 0 - Line: 
Line num: 1 - Line:     /*<BLAH>*/
block begin detected
Bounds: (6, 12)
comment signature detected
Bounds: (4, 14)
Line num: 2 - Line: fdsajkfds;lkfs
Line num: 3 - Line: /*<BLAH/>*/
block end detected
Bounds: (2, 9)
comment signature detected
Bounds: (0, 11)
Line num: 4 - Line: /*<FOOBAR>*/
block begin detected
Bounds: (2, 10)
comment signature detected
Bounds: (0, 12)
Line num: 5 - Line: alsdjflaskj
Line num: 6 - Line: alsdfalks;d
Line num: 7 - Line: /*<FOOBAR/>*/
block end detected
Bounds: (2, 11)
comment signature detected
Bounds: (0, 13)
Line num: 8 - Line: /*<BLAH>*/
block begin detected
Bounds: (2, 8)
comment signature detected
Bounds: (0, 10)
Line num: 9 - Line: alsdfaksdjf
Line num: 10 - Line: /*<BLAH/>*/
block end detected
Bounds: (2, 9)
comment signature detected
Bounds: (0, 11)
Line num: 11 - Line: asdhfkjasdhf
Line num: 12 - Line: 


## Class approach 

In [3]:
from collections import deque
from typing import List

class Tag:
    """A tag located on a single line through a regex match.

    The class attribute ``regex`` is the pattern used by :meth:`scan`; here it
    captures the text between ``/*`` and ``*/``.  Subclasses override it.
    """

    regex = r'\/\*(.*)\*\/'

    def __init__(self, match, line_num):
        """Store the match, its line number and the details of group 1.

        Args:
            match: an ``re.Match`` with at least one capture group.
            line_num: index of the line on which the match was found.
        """
        self.match = match
        self.line_num = line_num

        # Group 1 is the captured tag text; start/end are its offsets
        # within the searched line.
        self.name = match.group(1)
        self.start, self.end = match.span(1)

        # The complete line that was searched.
        self.string = match.string

    @classmethod
    def scan(cls, line):
        """Search ``line`` with this class's ``regex``; return the match or None."""
        pattern = re.compile(cls.regex)
        return pattern.search(line)

class BlockTagBegin(Tag):
    """Tag whose pattern matches an opening block marker such as ``<NAME>``.

    The capture group is the text between ``<`` and ``>``; its last character
    must not be ``/``, so ``<NAME/>`` does not match.
    """
    regex = r'<(.*[^\/])>'
    
class BlockTagEnd(Tag):
    """Tag whose pattern matches a closing block marker such as ``<NAME/>``.

    The capture group is the text between ``<`` and ``/>``.
    """
    regex = r'<(.*)\/>'
    
class BlockTag:
    """A begin tag and an end tag with the same name, treated as one block.

    ``start_tag`` and ``end_tag`` name the Tag classes used to detect the two
    halves of a block.
    """

    start_tag = BlockTagBegin
    end_tag = BlockTagEnd

    def __init__(self, start: Tag, end: Tag):
        """Pair ``start`` with ``end``.

        Raises:
            Exception: if the two tags do not have the same ``name``.
        """
        names_agree = start.name == end.name
        if not names_agree:
            raise Exception('names not equal')

        self.start, self.end = start, end
        # Both halves share the name, so take it from the opening tag.
        self.name = start.name
        
class Tagger:
    """Accumulates the :class:`Tag` objects found while scanning lines."""

    def __init__(self):
        # Every Tag found so far, in the order the lines were scanned.
        self.tags = []

    def scan(self, line, line_num):
        """Search ``line`` with ``Tag.regex`` and record a Tag for each hit.

        ``Tag.regex`` may be a single pattern string or a list/tuple of
        pattern strings; each pattern contributes at most one Tag per line
        (``re.search`` returns only the first match).

        Args:
            line: the text of one line.
            line_num: index of that line, stored on the created Tag.

        Raises:
            TypeError: if ``Tag.regex`` is neither a string nor a list/tuple.
        """
        # isinstance rather than exact type() equality, so subclasses of
        # str/list are accepted as well.
        if isinstance(Tag.regex, str):
            regexes = [Tag.regex]
        elif isinstance(Tag.regex, (list, tuple)):
            regexes = Tag.regex
        else:
            raise TypeError(f'not supported type - {type(Tag.regex)}')

        for regex in regexes:
            match = re.search(regex, line)
            if match:
                self.tags.append(Tag(match, line_num))


class BlockTagger:
    """Splits scanned tags into block markers and other tags, then pairs the markers.

    Attributes (all plain lists):
        tags: BlockTagBegin / BlockTagEnd instances created by :meth:`scan`.
        block_tags: BlockTag pairs appended by :meth:`get_blocks`.
        nonblock_tags: incoming tags whose line matched neither block pattern.
    """

    def __init__(self):
        self.tags = []
        self.block_tags = []
        self.nonblock_tags = []

    def scan(self, tags: List[Tag]):
        """Classify each tag by re-scanning the line it was found on.

        A line may yield a begin marker, an end marker, or both; a tag whose
        line yields neither is stored in ``nonblock_tags`` unchanged.
        """
        for candidate in tags:
            source_line = candidate.match.string

            begin_hit = BlockTag.start_tag.scan(source_line)
            if begin_hit:
                self.tags.append(BlockTagBegin(begin_hit, candidate.line_num))

            end_hit = BlockTag.end_tag.scan(source_line)
            if end_hit:
                self.tags.append(BlockTagEnd(end_hit, candidate.line_num))

            if not begin_hit and not end_hit:
                self.nonblock_tags.append(candidate)

    def get_blocks(self):
        """Pair begin and end markers of the same name into BlockTag objects.

        ``self.tags`` is sorted in place by ``(name, line_num)``; each run of
        equally named markers is then handed to
        :meth:`_connect_current_tag_block`, which appends the pairs to
        ``self.block_tags``.

        Raises:
            Exception: if a marker is neither a BlockTagBegin nor a
                BlockTagEnd, or if a name has unbalanced begin/end counts.
        """
        self.tags.sort(key=lambda item : (item.name, item.line_num))
        print([(item.name, item.line_num, item.start, item.end) for item in self.tags])

        pending_begin = deque()
        pending_end = deque()
        # Route each marker to its queue by exact type.
        queue_for_type = {BlockTagBegin: pending_begin, BlockTagEnd: pending_end}
        last_name = None

        for tag in self.tags:
            # A change of name closes the previous group of markers.
            if tag.name != last_name:
                self._connect_current_tag_block(pending_begin, pending_end)

            queue = queue_for_type.get(type(tag))
            if queue is None:
                raise Exception('incorrect type')
            queue.append(tag)

            last_name = tag.name

        # Flush the final group.
        self._connect_current_tag_block(pending_begin, pending_end)

    def _connect_current_tag_block(self, begin_tags, end_tags):
        """Drain both deques, pairing the i-th begin with the i-th end.

        Raises:
            Exception: if the two deques differ in length; the message names
                the last tag of the longer deque.
        """
        begin_count, end_count = len(begin_tags), len(end_tags)
        if begin_count > end_count:
            broken_tag = begin_tags[-1]
            raise Exception(f'Ending tag missing for {broken_tag.name} on line num: {broken_tag.line_num}')
        if begin_count < end_count:
            broken_tag = end_tags[-1]
            raise Exception(f'Beginning tag missing for {broken_tag.name} on line num: {broken_tag.line_num}')

        # Equal lengths from here on, so emptying one empties the other.
        while begin_tags:
            block_tag = BlockTag(begin_tags.popleft(), end_tags.popleft())
            print(f'Connect - Block_tag_name: {block_tag.name} - Line nums: {block_tag.start.line_num}:{block_tag.end.line_num}')
            self.block_tags.append(block_tag)

In [4]:
def get_line_tags(nonblock_tags):
    """Return the tags whose match begins at column 0 of their line.

    Tags matching further into the line are only reported as possible query
    tags and are not returned.  A diagnostic line is printed for every tag.
    """
    print('Checking whether its a query tag or line tag')

    at_line_start = []
    for tag in nonblock_tags:
        if tag.match.start() != 0:
            print('maybe this is just a query tag')
        else:
            at_line_start.append(tag)
            print(f'this is a line tag - {tag.name}')

        print(f'Line number: {tag.line_num} - Indexes: {tag.match.start()}:{tag.match.end()}')

    return at_line_start

def get_tag_keys(line_tags, block_tags):
    """Collect the distinct tag names used by line tags and block tags.

    Args:
        line_tags: iterable of objects with a ``name`` attribute.
        block_tags: iterable of objects with a ``name`` attribute.

    Returns:
        The set of all names found.  The set is also printed.
    """
    keys = {tag.name for group in (line_tags, block_tags) for tag in group}
    print(f'Found keys: {keys}\n')
    # Previously the set was computed and printed but never handed back.
    return keys
    
def render_code(run_tags, line_tags, delimited_line_region):
    """Enable or disable tagged lines according to ``run_tags``.

    For every line tag the tag text is removed from its line, then:

    * tag name in ``run_tags``: the rest of the line is kept, and a ``--``
      directly after the tag is dropped (the line becomes active);
    * otherwise: the rest of the line is left commented out, adding ``--``
      when it is not already there.

    Args:
        run_tags: iterable of tag names to enable.
        line_tags: tags providing ``match``, ``string``, ``name`` and
            ``line_num``.
        delimited_line_region: list of lines; it is modified in place.

    Returns:
        The rendered lines joined with newlines (also printed).
    """
    selected_keys = set(run_tags)
    sql_comment_prefix = '--'

    print(delimited_line_region)
    for tag in line_tags:
        line = delimited_line_region[tag.line_num]
        tag_end = tag.match.end()
        print(f'{tag.string[tag.match.start():tag_end]}')

        # Is the text right after the tag already an SQL comment?
        comment_detected = line.startswith(sql_comment_prefix, tag_end)
        if comment_detected:
            print('-- detected after')

        if tag.name in selected_keys:
            # Enabled: drop the tag and, if present, the comment prefix.
            skip = len(sql_comment_prefix) if comment_detected else 0
            rendered_line = line[tag_end + skip:]
        elif comment_detected:
            # Disabled and already commented: keep the existing prefix.
            rendered_line = line[tag_end:]
        else:
            # Disabled and not yet commented: comment it out.
            rendered_line = sql_comment_prefix + line[tag_end:]

        delimited_line_region[tag.line_num] = rendered_line

    rendered = '\n'.join(delimited_line_region)
    print('Final output:')
    print(rendered)
    return rendered

def get_block_tag_overlap_ranges(block_tags):
    """Group the line ranges of block tags by tag name.

    Returns a dict mapping each name to a list of ``[start_line, end_line]``
    pairs, in the order the block tags were given.  Each block and the final
    dict are printed.
    """
    ranges_by_name = {}

    for tag in block_tags:
        ranges = ranges_by_name.setdefault(tag.name, [])

        print(f'BLOCK TAG: {tag.name}')
        print(tag.start.line_num)
        print(tag.end.line_num)
        print('\n')

        ranges.append([tag.start.line_num, tag.end.line_num])

    print(ranges_by_name)
    return ranges_by_name

def check_for_interval_overlap(interval_dict):
    """Report whether any two intervals share a line number.

    ``interval_dict`` maps a key to a list of inclusive ``[start, end]``
    pairs.  Intervals are compared across all keys and within the same key.
    Returns True (after printing a message) at the first interval that
    touches an already used line, otherwise False.
    """
    # If blah is enabled, then we need to ensure NOS is disabled and output disabled intervals
    # If blah and nos is enabled, then merge both intervals

    used_lines = set()

    for intervals in interval_dict.values():
        for start, end in intervals:
            covered = range(start, end + 1)
            if not used_lines.isdisjoint(covered):
                print(f'overlap detected for lines - {start}:{end} - Tags are restricted per line num')
                return True
            used_lines.update(covered)

    return False

def get_interval_dict(tags, block_tags):
    """Build a name -> list of ``[first_line, last_line]`` lookup.

    Each entry of ``tags`` contributes a one-line interval
    ``[line_num, line_num]``; each entry of ``block_tags`` contributes
    ``[start.line_num, end.line_num]``.  Single-line tags are added first.
    """
    intervals_by_name = {}

    for tag in tags:
        intervals_by_name.setdefault(tag.name, []).append([tag.line_num, tag.line_num])

    for btag in block_tags:
        span = [btag.start.line_num, btag.end.line_num]
        intervals_by_name.setdefault(btag.name, []).append(span)

    return intervals_by_name

def main(block_region, run_tags: List):
    """Scan ``block_region`` for tags, classify them and print the rendered code.

    Steps: split the text into lines, collect tags from every line that is
    not an SQL ``--`` comment line, separate block begin/end markers from the
    other tags, pair the markers into blocks, pick out line tags, print the
    tag names and line intervals, run the overlap check, and finally render
    the lines for the tag names in ``run_tags``.

    Args:
        block_region: the source text to process.
        run_tags: tag names to enable when rendering.

    Returns:
        None; all results are printed.
    """
    # TODO - Needs to ensure #/* */ are ignored vs /* #bladfhdsfds */ along with limiting tag length to 128 characters

    delimited_line_region = block_region.split('\n')

    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning.  Compiled once, outside the loop.
    IGNORE_SQL_COMMENTED_LINES = re.compile(r'^[\s]*--')
    tagger = Tagger()
    for line_num, line in enumerate(delimited_line_region):
        # Lines that are already SQL comments are not scanned for tags.
        if IGNORE_SQL_COMMENTED_LINES.search(line):
            continue

        print(f'Line num: {line_num} - Line: {line}')
        tagger.scan(line, line_num)
    print('\n')
    print(f'Tags:\n{tagger.tags}\n')

    block_tagger = BlockTagger()
    block_tagger.scan(tagger.tags)
    print(f'Block signature tags(begin/end):\n{block_tagger.tags}\n')

    block_tagger.get_blocks()
    print(f'Block tags (as a whole):\n{block_tagger.block_tags}\n')
    print(f'Non-block tags found after classifying:\n{block_tagger.nonblock_tags}\n')

    for block_tag in block_tagger.block_tags:
        print(f'Block_tag_name: {block_tag.name} - Line nums: {block_tag.start.line_num}:{block_tag.end.line_num}')

    line_tags = get_line_tags(block_tagger.nonblock_tags)
    # Called for its printed diagnostics; the returned lookup is not used here.
    get_block_tag_overlap_ranges(block_tagger.block_tags)

    get_tag_keys(line_tags, block_tagger.block_tags)

    interval_dict = get_interval_dict(tags=line_tags, block_tags=block_tagger.block_tags)
    print(f'Interval Dict: {interval_dict}')
    # The check prints a message on overlap; its result is not acted upon.
    check_for_interval_overlap(interval_dict)
    render_code(run_tags, line_tags, delimited_line_region)

In [7]:
# Sample input: three tagged blocks (BLAH, FOOBAR, BLAH again) wrapped in
# /*<NAME>*/ ... /*<NAME/>*/ markers, plus some untagged lines.
block_region = """
    /*<BLAH>*/
fdsajkfds;lkfs
/*<BLAH/>*/
/*<FOOBAR>*/
alsdjflaskj
alsdfalks;d
/*<FOOBAR/>*/
/*<BLAH>*/
alsdfaksdjf
/*<BLAH/>*/
asdhfkjasdhf
"""    
# Run the pipeline with only the BLAH tag selected.
main(block_region, run_tags=['BLAH'])

Line num: 0 - Line: 
Line num: 1 - Line:     /*<BLAH>*/
Line num: 2 - Line: fdsajkfds;lkfs
Line num: 3 - Line: /*<BLAH/>*/
Line num: 4 - Line: /*<FOOBAR>*/
Line num: 5 - Line: alsdjflaskj
Line num: 6 - Line: alsdfalks;d
Line num: 7 - Line: /*<FOOBAR/>*/
Line num: 8 - Line: /*<BLAH>*/
Line num: 9 - Line: alsdfaksdjf
Line num: 10 - Line: /*<BLAH/>*/
Line num: 11 - Line: asdhfkjasdhf
Line num: 12 - Line: 


Tags:
[<__main__.Tag object at 0x107155bd0>, <__main__.Tag object at 0x1071558d0>, <__main__.Tag object at 0x107155750>, <__main__.Tag object at 0x107155350>, <__main__.Tag object at 0x1071551d0>, <__main__.Tag object at 0x107151310>]

Block signature tags(begin/end):
[<__main__.BlockTagBegin object at 0x1071aa2d0>, <__main__.BlockTagEnd object at 0x1071aa290>, <__main__.BlockTagBegin object at 0x1071aa190>, <__main__.BlockTagEnd object at 0x1071aa310>, <__main__.BlockTagBegin object at 0x1071aa450>, <__main__.BlockTagEnd object at 0x1071aa5d0>]

[('BLAH', 1, 7, 11), ('BLAH', 3, 3, 7),