### Validate the regex

In [2]:
# We handle only the following constructs:
# the pipe |, grouping with parentheses ( ), the asterisk *, the plus +, the question mark ?, the dot ., and ranges [ ]

# Symbols accepted as literal operands, in order: 'A'-'Z', 'a'-'z', '0'-'9'.
alphabet_nums = (
    list(map(chr, range(ord("A"), ord("Z") + 1)))
    + list(map(chr, range(ord("a"), ord("z") + 1)))
    + list(map(str, range(10)))
)

def check_parentheses(regex, mode=1):
    """Return True when the brackets selected by *mode* are well formed.

    mode 1 checks round parentheses "(" / ")" and additionally rejects an
    empty group "()" or a "(" that is the last character.
    mode 2 checks square brackets "[" / "]" for balance only.

    Raises:
        ValueError: if *mode* is neither 1 nor 2.
    """
    if mode != 1 and mode != 2:
        raise ValueError("Invalid mode")

    opener, closer = ("(", ")") if mode == 1 else ("[", "]")

    if mode == 1:
        # A group must contain something: "(" may not end the regex or be
        # immediately followed by ")".
        final_index = len(regex) - 1
        for pos, symbol in enumerate(regex):
            if symbol != "(":
                continue
            if pos == final_index or regex[pos + 1] == ")":
                return False

    # Track nesting depth; a closer with nothing open is an error.
    depth = 0
    for symbol in regex:
        if symbol == opener:
            depth += 1
        elif symbol == closer:
            if depth == 0:
                return False
            depth -= 1
    return depth == 0

#[a-] -> Invalid input as the range is incomplete
#[a-c -> Invalid input as the brackets are not well formed
#[c-a] -> "Inverted Range or Non overlapping Ranges", You can SKIP this case and we won't test it. Assume any prompted range is correct


def check_range(regex):
    """Return True when every "[...]" range in *regex* is well formed.

    Rejected inputs:
      * unbalanced square brackets,
      * an empty range "[]",
      * a "-" directly after "[" or directly before "]" (incomplete range),
      * two consecutive "-", or a character sandwiched between two "-"
        (e.g. "a-b-c").
    Inverted ranges such as "[c-a]" are not checked.
    """
    if not check_parentheses(regex, 2):
        return False

    final_index = len(regex) - 1
    for start, opener in enumerate(regex):
        if opener != "[":
            continue

        # Scan the contents of this range up to its first "]".
        for pos in range(start + 1, len(regex)):
            current = regex[pos]
            previous = regex[pos - 1]

            if current == "-":
                # A dash needs a real character on both sides.
                if pos == final_index or regex[pos + 1] == "]" or previous == "[":
                    return False

            if current == "]":
                if pos == start + 1:
                    return False
                break

            # Reject "--" and a single character sitting between two dashes.
            if previous == "-" and (current == "-" or regex[pos + 1] == "-"):
                return False
    return True

def check_pipe(regex):
    """Return True when every "|" in *regex* has a valid operand on each side.

    A pipe may not be the first or last character. The character before it
    must be a letter/digit, ".", a closing ")" / "]" or a quantifier
    ("*", "+", "?"); the character after it must be a letter/digit, ".",
    or an opening "(" / "[".
    """
    allowed_before = alphabet_nums + ["*", "+", "?", ")", "]", "."]
    allowed_after = alphabet_nums + ["(", "[", "."]
    final_index = len(regex) - 1

    for pos, symbol in enumerate(regex):
        if symbol != "|":
            continue
        if pos in (0, final_index):
            return False
        if regex[pos - 1] not in allowed_before or regex[pos + 1] not in allowed_after:
            return False
    return True

def check_asterisk_plus_qm(regex):
    """Return True when every quantifier ("*", "+", "?") has something to repeat.

    A quantifier is invalid at the start of the regex, and otherwise must
    directly follow a letter/digit, ".", ")" or "]". Stacked quantifiers
    such as "a**" are therefore rejected.
    """
    quantifiers = ("*", "+", "?")
    quantifiable = alphabet_nums + [")", "]", "."]

    for pos, symbol in enumerate(regex):
        if symbol in quantifiers and (pos == 0 or regex[pos - 1] not in quantifiable):
            return False
    return True

def validate_regex(regex):
    """Run all syntax checks on *regex* and report the outcome.

    Returns False silently for an empty regex. Otherwise the checks run in
    order (parentheses, ranges, pipes, quantifiers); the first failing check
    prints its message and the function returns False. When everything
    passes, a success message is printed and True is returned.
    """
    if not regex:
        return False

    # (checker, message printed when the checker fails), in evaluation order.
    checks = (
        (check_parentheses, "Parentheses are invalid"),
        (check_range, "Range is invalid"),
        (check_pipe, "Used Pipe is invalid"),
        (check_asterisk_plus_qm, "Asterisk or/and Plus or/and Question mark is invalid"),
    )
    for check, failure_message in checks:
        if not check(regex):
            print(failure_message)
            return False

    print("The regex is valid")
    return True

    
# Smoke test: nested groups and alternation under a Kleene star; prints the
# validator's message followed by its boolean result.
print(validate_regex("a(a|(a)|(c)|(b|c))*d"))
    

The regex is valid
True


### Convert to NFA

In [30]:
# In this section we have a valid regex, to which we apply the shunting-yard algorithm to convert it to postfix notation.

# The shunting-yard algorithm handles operator precedence;
# its output is the regex in postfix notation.
# We then convert the postfix notation into the required NFA using Thompson's construction rules.

# First, concatenation (AND) must be made explicit in the regex: we use the & symbol, so ab is written as a&b.

def add_anding(regex):
    """Return *regex* with an explicit "&" wherever concatenation is implied.

    An "&" is inserted between two adjacent characters when the left one can
    end an operand (letter/digit, ".", ")", "]", "*", "+", "?") and the right
    one can start an operand (letter/digit, ".", "(", "[").

    Characters inside "[...]" are treated like any others, so "[ad]" becomes
    "[a&d]" while "[a-c]" is left untouched.
    NOTE(review): confirm the later range handling expects that "&".
    """
    can_end_operand = alphabet_nums + [")", "]", "*", "+", "?", "."]
    can_start_operand = alphabet_nums + ["(", "[", "."]

    pieces = []
    # Walk over adjacent pairs; the last character is appended afterwards.
    for current, following in zip(regex, regex[1:]):
        pieces.append(current)
        if current in can_end_operand and following in can_start_operand:
            pieces.append("&")
    pieces.extend(regex[-1:])
    return "".join(pieces)

#test the function

# print(add_anding("a(a|(a)|(c)|(b|c))*d"))
# print(add_anding("abcd"))
# print(add_anding("(a*d)(bc)|(cd)"))
# print(add_anding("a(a|b)*c.a"))

###################################################################

# We treat each range [...] as a single symbol.
# The symbol # represents a range:
# each range is extracted and replaced with a single #,
# while the original range text is kept in a list for later use in the NFA construction.
# The list has as many entries as there are # symbols in the new regex.

def extract_range(regex):
    """Replace every "[...]" range in *regex* with the placeholder "#".

    Returns:
        A tuple ``(new_regex, range_list)``: the rewritten regex, and the
        extracted ranges (brackets included) in order of appearance, one per
        "#" placeholder.

    A range ends at the first "]" after its "[". A "[" with no closing "]"
    is dropped from the output and scanning continues with the next
    character.
    """
    ranges = []
    rewritten = []
    cursor = 0
    total = len(regex)

    while cursor < total:
        symbol = regex[cursor]
        if symbol != "[":
            rewritten.append(symbol)
            cursor += 1
            continue

        closing = regex.find("]", cursor + 1)
        if closing != -1:
            ranges.append(regex[cursor:closing + 1])
            rewritten.append("#")
            cursor = closing
        # Step past the "]" (or past an unterminated "[").
        cursor += 1

    return "".join(rewritten), ranges

#test the extract range function

# test = "a[a-c][ad]d[dd]"

# new_regex, range_list = extract_range(add_anding(test))
# print(add_anding(test))
# print(new_regex)
# print(range_list)


###################################################################

#Shunting Yard Algorithm Steps:
# If the input symbol is a letter… append it directly to the output queue
# If the input symbol is an operator… if there exists an operator already on the top of the operator stack with higher or equal precedence than our current input symbol, remove the operator from the top of the operator stack and append it to the output queue. Do this until the current input symbol has a higher precedence than the symbol on the top of the operator stack, or the operator stack is empty.
# If the input symbol is an operator AND there is a left parenthesis on top of the stack… push the input symbol onto the stack on top of the left parenthesis.
# If the input symbol is an ( … append it to the operator stack
# If the input symbol is an ) … pop all operators from the operator stack and append them to the output queue until you find an ( . Then, you can remove both of those parentheses and continue with the algorithm.

#each # will be treated as a single symbol, we can get the actual range from the range_list 

# The pipe | is the union (OR) operator,
# the asterisk * is the Kleene star,
# the plus + is the one-or-more operator,
# the question mark ? is the zero-or-one operator, and the dot . matches any character (so we treat it as a single symbol).
# Operator precedence is: Kleene star = one or more = zero or one > AND > OR.

def shunting_yard(regex):
    """Convert an infix regex to postfix notation with the shunting-yard algorithm.

    *regex* is expected to be already validated, to use "&" for explicit
    concatenation and "#" as the placeholder for each "[...]" range.

    Operands (letters, digits, "#" and the any-character wildcard ".") are
    copied straight to the output. Operators are ordered by precedence:
    "*", "+", "?" (3) > "&" (2) > "|" (1). Any other character is ignored.

    Returns:
        The postfix expression as a string.

    Raises:
        IndexError: if a ")" has no matching "(" (cannot happen for a regex
            that passed validation).
    """
    precedence = {"*": 3, "+": 3, "?": 3, "&": 2, "|": 1, "(": 0}
    operators = ("*", "+", "?", "&", "|")
    # "." is a single symbol just like a letter; without it here the wildcard
    # would be silently dropped and the postfix would lose an operand.
    operands = set(alphabet_nums) | {"#", "."}

    output = []
    stack = []
    for symbol in regex:
        if symbol in operands:
            output.append(symbol)
        elif symbol in operators:
            # Flush operators that bind at least as tightly as this one;
            # "(" has precedence 0 and therefore stops the flush.
            while stack and precedence[stack[-1]] >= precedence[symbol]:
                output.append(stack.pop())
            stack.append(symbol)
        elif symbol == "(":
            stack.append(symbol)
        elif symbol == ")":
            while stack[-1] != "(":
                output.append(stack.pop())
            stack.pop()  # discard the matching "("

    # Whatever is left on the stack closes the expression.
    while stack:
        output.append(stack.pop())
    return "".join(output)

# Test the shunting-yard algorithm end to end: make concatenation explicit,
# replace ranges with "#", then convert the result to postfix notation.
# NOTE(review): add_anding runs before extract_range, so a range such as
# "[ad]" reaches range_list as "[a&d]" — confirm the NFA builder expects that.
test ="a(a|b)*b"
add_anding_test = add_anding(test)
new_regex, range_list = extract_range(add_anding_test)
print(add_anding_test)  # regex with explicit "&" concatenation
print(new_regex)  # same regex with every [...] replaced by "#"
print(range_list)  # extracted [...] ranges, in order of appearance
print(shunting_yard(new_regex))  # postfix form


a&(a|b)*&b
a&(a|b)*&b
[]
aab|*&b&
