### Validate the regex

In [2]:
#we will handle the following cases:
# the pipe |, grouping with parentheses ( ), the asterisk *, the plus +, ?, ., [] only

# Operand alphabet accepted by the validators: A-Z, a-z, then the digits 0-9.
alphabet_nums = (
    [chr(code) for code in range(ord("A"), ord("Z") + 1)]
    + [chr(code) for code in range(ord("a"), ord("z") + 1)]
    + [str(digit) for digit in range(10)]
)

def check_parentheses(regex, mode=1):
    """Return True when the brackets selected by *mode* are balanced in *regex*.

    mode 1 checks round parentheses and additionally rejects an empty group
    "()" as well as a "(" that is the very last character.
    mode 2 checks square brackets for balance only.

    Raises:
        ValueError: if *mode* is neither 1 nor 2.
    """
    if mode not in (1, 2):
        raise ValueError("Invalid mode")

    if mode == 1:
        opener, closer = "(", ")"
        final_index = len(regex) - 1
        # An opening parenthesis must be followed by something other than ")".
        for position, symbol in enumerate(regex):
            if symbol == "(" and (position == final_index or regex[position + 1] == ")"):
                return False
    else:
        opener, closer = "[", "]"

    # A running depth is enough: only one bracket kind is tracked per call.
    depth = 0
    for symbol in regex:
        if symbol == opener:
            depth += 1
        elif symbol == closer:
            if depth == 0:
                return False
            depth -= 1
    return depth == 0

#[a-] -> Invalid input as the range is incomplete
#[a-c -> Invalid input as the square brackets are not balanced
#[c-a] -> "Inverted Range or Non overlapping Ranges", You can SKIP this case and we won't test it. Assume any prompted range is correct


def check_range(regex):
    """Return True when every bracket class in *regex* is well formed.

    The check fails when square brackets are unbalanced, when a class is
    empty ("[]"), when a "-" directly follows "[" or directly precedes "]",
    or when hyphens are doubled or chained inside a class ("a--b", "a-b-c").
    Inverted ranges such as "[c-a]" are not checked.
    """
    if not check_parentheses(regex, 2):
        return False

    final_index = len(regex) - 1
    for start, opener in enumerate(regex):
        if opener != "[":
            continue
        # Walk the class body up to the first closing bracket.
        for pos in range(start + 1, len(regex)):
            current = regex[pos]
            previous = regex[pos - 1]
            if current == "-":
                # A hyphen needs a symbol on both sides.
                if pos == final_index or regex[pos + 1] == "]" or previous == "[":
                    return False
            if current == "]":
                if pos == start + 1:
                    return False
                break
            # Reject "--" and a second hyphen right after a completed range.
            if previous == "-" and (current == "-" or regex[pos + 1] == "-"):
                return False
    return True

def check_pipe(regex):
    """Return True when every "|" in *regex* sits between valid neighbours.

    A pipe may not be the first or last character. The character before it
    must be an alphanumeric symbol, ".", a closing bracket or a quantifier;
    the character after it must be an alphanumeric symbol, "." or an opening
    bracket.
    """
    may_precede = ["*", "+", "?", ")", "]", "."] + alphabet_nums
    may_follow = ["(", "[", "."] + alphabet_nums
    final_index = len(regex) - 1

    for position, symbol in enumerate(regex):
        if symbol != "|":
            continue
        if position in (0, final_index):
            return False
        if regex[position - 1] not in may_precede:
            return False
        if regex[position + 1] not in may_follow:
            return False
    return True

def check_asterisk_plus_qm(regex):
    """Return True when every "*", "+" and "?" follows something quantifiable.

    A quantifier is rejected when it is the first character or when the
    character before it is not an alphanumeric symbol, ".", ")" or "]".
    Stacked quantifiers such as "a**" are therefore rejected as well.
    """
    quantifiers = ("*", "+", "?")
    quantifiable = alphabet_nums + [")", "]", "."]

    for position, symbol in enumerate(regex):
        if symbol not in quantifiers:
            continue
        if position == 0 or regex[position - 1] not in quantifiable:
            return False
    return True

def validate_regex(regex):
    """Run every syntax check on *regex* and report the first failure.

    Returns False silently for an empty regex. Otherwise the checks run in a
    fixed order (parentheses, ranges, pipes, quantifiers); the first one that
    fails prints its message and makes the function return False. When all
    checks pass a confirmation is printed and True is returned.
    """
    if not regex:
        return False

    # (check, message printed when the check fails) in evaluation order.
    checks = (
        (check_parentheses, "Parentheses are invalid"),
        (check_range, "Range is invalid"),
        (check_pipe, "Used Pipe is invalid"),
        (check_asterisk_plus_qm, "Asterisk or/and Plus or/and Question mark is invalid"),
    )
    for check, complaint in checks:
        if not check(regex):
            print(complaint)
            return False

    print("The regex is valid")
    return True

    
# Smoke test: validate a sample regex with nested groups and alternation,
# then print the boolean returned by validate_regex.
print(validate_regex("a(a|(a)|(c)|(b|c))*d"))
    

The regex is valid
True


### Convert to NFA

In [28]:
import json


# in this section we have a valid regex for which we will apply Shunting-Yard Algorithm to convert it to postfix notation

#we will use shunting yard algorithm to handle operators precedence
# the output will be a postfix notation
# we will then change the postfix notation to the required NFA using Thompson's rules

# First we need an explicit concatenation ("and") operator in the regex: we use the & symbol, so ab is written as a&b

def add_anding(regex):
    """Return *regex* with an explicit "&" between concatenated operands.

    A "&" is inserted between two neighbouring characters when the left one
    can end an operand (alphanumeric, ".", ")", "]" or a quantifier) and the
    right one can start an operand (alphanumeric, ".", "(" or "[").

    Nothing is inserted inside a bracket class: "[ad]" must stay "[ad]" so
    that the class text extracted afterwards is not polluted with "&".
    """
    operand_chars = set(alphabet_nums)
    can_end_operand = operand_chars | {")", "]", "*", "+", "?", "."}
    can_start_operand = operand_chars | {"(", "[", "."}

    pieces = []
    inside_class = False
    final_index = len(regex) - 1
    for position, symbol in enumerate(regex):
        pieces.append(symbol)
        if symbol == "[":
            inside_class = True
        elif symbol == "]":
            # The closing bracket itself ends an operand, so it is handled
            # by the regular rule below.
            inside_class = False
        if inside_class or position == final_index:
            continue
        if symbol in can_end_operand and regex[position + 1] in can_start_operand:
            pieces.append("&")
    return "".join(pieces)

#test the function

# print(add_anding("a(a|(a)|(c)|(b|c))*d"))
# print(add_anding("abcd"))
# print(add_anding("(a*d)(bc)|(cd)"))
# print(add_anding("a(a|b)*c.a"))

###################################################################

#we will treat the range [] as a single symbol, 
# I will use the symbol # to represent the range,
# I will extract the range and replace it with a single symbol, 
#but I will keep the range in a list to use it later in the NFA construction
#the list will be the same length as the number of # in the new regex

def extract_range(regex):
    """Replace every bracket class in *regex* with the placeholder "#".

    Returns a tuple ``(new_regex, range_list)`` where ``new_regex`` is the
    regex with each "[...]" swapped for "#", and ``range_list`` holds the
    removed class texts (brackets included) in order of appearance.

    A "[" without a later "]" is dropped from the output and the characters
    after it are copied through unchanged.
    """
    captured = []
    pieces = []
    position = 0
    while position < len(regex):
        symbol = regex[position]
        if symbol != "[":
            pieces.append(symbol)
        else:
            closing = regex.find("]", position + 1)
            if closing != -1:
                captured.append(regex[position:closing + 1])
                pieces.append("#")
                # Resume scanning right after the closing bracket.
                position = closing
        position += 1
    return "".join(pieces), captured

#test the extract range function

# test = "a[a-c][ad]d[dd]"

# new_regex, range_list = extract_range(add_anding(test))
# print(add_anding(test))
# print(new_regex)
# print(range_list)


###################################################################

#Shunting Yard Algorithm Steps:
# If the input symbol is a letter… append it directly to the output queue
# If the input symbol is an operator… if there exists an operator already on the top of the operator stack with higher or equal precedence than our current input symbol, remove the operator from the top of the operator stack and append it to the output queue. Do this until the current input symbol has a higher precedence than the symbol on the top of the operator stack, or the operator stack is empty.
# If the input symbol is an operator AND there is a left parenthesis on top of the stack… push the input symbol onto the stack on top of the left parenthesis.
# If the input symbol is an ( … append it to the operator stack
# If the input symbol is an ) … pop all operators from the operator stack and append them to the output queue until you find an ( . Then, you can remove both of those parentheses and continue with the algorithm.

#each # will be treated as a single symbol, we can get the actual range from the range_list 

#the pipe | is the union/ or operator, 
#the asterisk * is the kleene star, 
#the plus + is the one or more operator, 
#the question mark ? is the zero or one operator, the dot . any character (therefore we will use it as a single symbol),
#the operators precedence is as follows: kleene star = one or more = zero or one > and > or

def shunting_yard(regex):
    """Convert an infix regex (explicit "&", "#" placeholders) to postfix.

    Operands (alphanumerics, "#" and ".") go straight to the output. An
    operator first pops every stacked operator of higher or equal precedence,
    then is pushed. "(" is pushed as a barrier and ")" pops back to it.
    Characters that are none of these are skipped.
    """
    precedence = {"*": 3, "+": 3, "?": 3, "&": 2, "|": 1, "(": 0}
    operands = alphabet_nums + ["#", "."]
    emitted = []
    pending = []

    for token in regex:
        if token in operands:
            emitted.append(token)
        elif token == "(":
            pending.append(token)
        elif token == ")":
            # Flush the operators of the group, then discard its "(".
            while pending[-1] != "(":
                emitted.append(pending.pop())
            pending.pop()
        elif token in precedence:
            rank = precedence[token]
            while pending and precedence[pending[-1]] >= rank:
                emitted.append(pending.pop())
            pending.append(token)

    # Whatever is still stacked is emitted from the top down.
    emitted.extend(reversed(pending))
    return "".join(emitted)

#test the shunting yard algorithm
# Prepare a sample input: first insert the explicit "&" operator, then swap
# bracket classes for "#" placeholders.
test ="a(a|b)*b"
add_anding_test = add_anding(test)
# new_regex is the placeholder form; range_list holds the extracted class texts in order.
new_regex, range_list = extract_range(add_anding_test)
# print(add_anding_test)
# print(new_regex)
# print(range_list)
# print(shunting_yard(new_regex))


# Now we have the postfix notation, we will use Thompson's rules to construct the NFA
#then we construct a Json object to represent the NFA
#with the following format
'''
{
    "startingState": "S0",
    "S0": {
        "isTerminatingState": false,
        "A": "S1",
        "B": "S0"
    },
    "S1": {
        "isTerminatingState": true,
        "A": "S1",
        "B": "S1",
        """epsilon"""": [
                  "S1",
                  "S4"
            ]
    }
}
'''

#we can get the required NFA from the postfix notation easily by traversing the postfix notation from left to right
# and applying the following rules:
# 1- if we have a letter, we create a new NFA with two states, one is the starting state and the other is the terminating state
# 2- if we have the and operator, we concatenate the NFAs
# 3- if we have the or operator, we union the NFAs
# 4- if we have the kleene star operator, we apply the kleene star operation
# 5- if we have the one or more operator, we apply the one or more operation
# 6- if we have the zero or one operator, we apply the zero or one operation
# 7- if we have the dot operator, we create a new NFA with two states, one is the starting state and the other is the terminating state
# 8- if we have the range operator, we create a new NFA with two states, one is the starting state and the other is the terminating state
# 9- if we have the pipe operator, we union the NFAs

#if the operator is unary, we apply the operation on the last NFA in the stack
#if the operator is binary, we apply the operation on the last two NFAs in the stack

#we will use a stack to store the NFAs
#we will use a counter to give each state a unique name

#we will use the following class to represent the NFA nodes 


# Define the constants
EPSILON = 'ε'
# Operand alphabet used while building the postfix form and the NFA.
# It must cover the same symbols the validation section accepts (A-Z, a-z,
# 0-9); leaving uppercase out makes later stages silently drop those symbols.
alphabet_nums = (
    [chr(i) for i in range(ord('A'), ord('Z') + 1)]
    + [chr(i) for i in range(ord('a'), ord('z') + 1)]
    + [str(i) for i in range(10)]
)

# Define the class for representing NFA nodes
class Node:
    """A single NFA state.

    Holds the state's name, whether it is accepting, its labelled
    transitions (symbol -> target state name, one target per symbol) and
    the list of state names reachable through epsilon moves.
    """

    def __init__(self, stateName, isTerminatingState=False):
        self.stateName = stateName
        self.isTerminatingState = isTerminatingState
        self.transitions = {}
        self.epsilon = []

    def _describe(self):
        # Shared text for str() and repr().
        fields = (
            f"stateName: {self.stateName}",
            f"transitions: {self.transitions}",
            f"epsilon: {self.epsilon}",
            f"isTerminatingState: {self.isTerminatingState}",
        )
        return ", ".join(fields)

    def __str__(self):
        return self._describe()

    def __repr__(self):
        return self._describe()

    def add_transition(self, symbol, stateName):
        """Point *symbol* at *stateName*, replacing any earlier target."""
        self.transitions[symbol] = stateName

    def add_epsilon(self, stateName):
        """Append *stateName* to the epsilon targets of this state."""
        self.epsilon.append(stateName)

# Define the class for representing NFA
class NFA:
    """An NFA fragment: a start state, an accept state and a state registry.

    ``states`` maps each state's name to the state object itself.
    """

    def __init__(self):
        self.startingState = None
        self.terminatingState = None
        self.states = {}

    def _describe(self):
        # Shared text for str() and repr().
        return (
            f"startingState: {self.startingState}, "
            f"terminatingState: {self.terminatingState}"
        )

    def __str__(self):
        return self._describe()

    def __repr__(self):
        return self._describe()

    def add_state(self, state):
        """Register *state* under its own ``stateName``."""
        self.states[state.stateName] = state

    def get_starting_state(self):
        """Return whatever was stored as the starting state."""
        return self.startingState

    def set_starting_state(self, stateName):
        """Store *stateName* as the starting state."""
        self.startingState = stateName

    def get_terminating_state(self):
        """Return whatever was stored as the terminating state."""
        return self.terminatingState

    def set_terminating_state(self, stateName):
        """Store *stateName* as the terminating state."""
        self.terminatingState = stateName

#Create a new NFA
def create_NFA(state1, state2):
    """Return a new NFA that starts at *state1* and terminates at *state2*.

    Both states are registered in the NFA, *state1* first.
    """
    automaton = NFA()
    automaton.set_starting_state(state1)
    automaton.set_terminating_state(state2)
    for node in (state1, state2):
        automaton.add_state(node)
    return automaton
# Define the function to create a new state
def create_state(isTerminatingState=False):
    """Return a new Node named "S<counter>" and advance the global counter.

    The module-level ``counter`` must be assigned before the first call.
    """
    global counter
    index = counter
    fresh = Node("S" + str(index), isTerminatingState)
    counter = index + 1
    return fresh

# Define the function to create a new two-state NFA with a single labelled transition (symbol, range, dot)
def create_single_state(symbol):
    """Return a two-state NFA whose only transition is labelled *symbol*.

    The first state created is the start; the second is created as
    terminating and is the target of the transition.
    """
    entry, accept = create_state(), create_state(isTerminatingState=True)
    entry.add_transition(symbol, accept.stateName)
    return create_NFA(entry, accept)

# Define the function to concatenate two NFAs
def concatenate_NFAs(nfa1, nfa2):
    """Chain *nfa2* after *nfa1* and return the (mutated) *nfa1*.

    The accept state of *nfa1* gets an epsilon move to the start of *nfa2*
    and stops being terminating; *nfa1* then adopts *nfa2*'s accept state
    and all of its states.
    """
    junction = nfa1.terminatingState
    junction.add_epsilon(nfa2.startingState.stateName)
    junction.isTerminatingState = False

    nfa1.terminatingState = nfa2.terminatingState
    for node in nfa2.states.values():
        nfa1.add_state(node)
    return nfa1

# Define the function to union two NFAs
def union_NFAs(nfa1, nfa2):
    """Return a new NFA accepting what either *nfa1* or *nfa2* accepts.

    A fresh start state branches by epsilon into both operands, and both
    operand accept states lead by epsilon into one fresh accept state.
    The operand accept states are unmarked as terminating, so the fresh
    accept state is the only terminating state of the result. Without that,
    a union that is later concatenated would keep stale accept states and
    accept inputs that stop in the middle of the pattern.
    """
    state1 = create_state()
    state1.add_epsilon(nfa1.startingState.stateName)
    state1.add_epsilon(nfa2.startingState.stateName)
    state2 = create_state(isTerminatingState=True)

    for branch in (nfa1, nfa2):
        old_accept = branch.terminatingState
        old_accept.add_epsilon(state2.stateName)
        # Only the fresh accept state may remain terminating.
        old_accept.isTerminatingState = False

    new_nfa = create_NFA(state1, state2)
    for branch in (nfa1, nfa2):
        for node in branch.states.values():
            new_nfa.add_state(node)
    return new_nfa

# Define the function to apply the kleene star operation on an NFA
def kleene_star_NFA(nfa):
    """Return a new NFA that matches zero or more repetitions of *nfa*.

    A fresh start state can enter *nfa* or skip to the fresh accept state;
    the old accept state can loop back to *nfa*'s start or leave to the
    fresh accept state, and stops being terminating.
    """
    entry = create_state()
    accept = create_state(isTerminatingState=True)
    inner_start_name = nfa.startingState.stateName
    inner_accept = nfa.terminatingState

    # Both the new entry and the old accept state get the same two moves:
    # into the inner fragment, and out to the new accept state.
    for source in (entry, inner_accept):
        source.add_epsilon(inner_start_name)
        source.add_epsilon(accept.stateName)
    inner_accept.isTerminatingState = False

    wrapped = create_NFA(entry, accept)
    for node in nfa.states.values():
        wrapped.add_state(node)
    return wrapped
# Define the function to apply the one or more operation on an NFA
def one_or_more_NFA(nfa):
    """Return a new NFA that matches one or more repetitions of *nfa*.

    The fresh start state can only enter *nfa* (no skip edge); the old
    accept state can loop back to *nfa*'s start or leave to the fresh accept
    state, and stops being terminating.
    """
    entry = create_state()
    accept = create_state(isTerminatingState=True)
    inner_start_name = nfa.startingState.stateName
    inner_accept = nfa.terminatingState

    entry.add_epsilon(inner_start_name)
    inner_accept.add_epsilon(inner_start_name)
    inner_accept.add_epsilon(accept.stateName)
    inner_accept.isTerminatingState = False

    wrapped = create_NFA(entry, accept)
    for node in nfa.states.values():
        wrapped.add_state(node)
    return wrapped

# Define the function to apply the zero or one operation on an NFA
def zero_or_one_NFA(nfa):
    """Return a new NFA that matches *nfa* at most once.

    The fresh start state can enter *nfa* or skip to the fresh accept state;
    the old accept state leads to the fresh accept state (no loop back) and
    stops being terminating.
    """
    entry = create_state()
    accept = create_state(isTerminatingState=True)
    inner_accept = nfa.terminatingState

    entry.add_epsilon(nfa.startingState.stateName)
    entry.add_epsilon(accept.stateName)
    inner_accept.add_epsilon(accept.stateName)
    inner_accept.isTerminatingState = False

    wrapped = create_NFA(entry, accept)
    for node in nfa.states.values():
        wrapped.add_state(node)
    return wrapped
# Define the function to build the two-state NFA for the dot (any character) symbol
def dot_NFA():
    """Return a two-state NFA whose only transition is labelled "."."""
    # Same shape as any single-symbol fragment, with "." as the label.
    return create_single_state(".")

# Define the function to build the two-state NFA for the next extracted range (bracket class)
def range_NFA(range_list):
    """Return a two-state NFA labelled with the next entry of *range_list*.

    The first element is removed from *range_list* (the caller's list is
    mutated) and used verbatim as the transition label.
    """
    entry, accept = create_state(), create_state(isTerminatingState=True)
    label = range_list.pop(0)
    entry.add_transition(label, accept.stateName)
    return create_NFA(entry, accept)

# Define the function to construct the NFA from the postfix notation
def postfix_NFA(postfix, range_list):
    """Build an NFA from a postfix regex and return it.

    Operands push a fresh fragment onto a stack; unary operators rewrite the
    top fragment and binary operators combine the top two. Each "#" consumes
    the next entry of *range_list*, so that list is emptied as a side
    effect. Unrecognised characters are skipped.
    """
    unary_builders = {
        "*": kleene_star_NFA,
        "+": one_or_more_NFA,
        "?": zero_or_one_NFA,
    }
    binary_builders = {
        "&": concatenate_NFAs,
        "|": union_NFAs,
    }

    fragments = []
    for token in postfix:
        if token in alphabet_nums:
            fragments.append(create_single_state(token))
        elif token in binary_builders:
            # The right operand was pushed last.
            right = fragments.pop()
            left = fragments.pop()
            fragments.append(binary_builders[token](left, right))
        elif token in unary_builders:
            fragments.append(unary_builders[token](fragments.pop()))
        elif token == ".":
            fragments.append(dot_NFA())
        elif token == "#":
            fragments.append(range_NFA(range_list))
    return fragments.pop()

    



# Define the function to convert the NFA to a JSON object
def nfa_to_json(nfa):
    """Serialise *nfa* to a JSON string indented by four spaces.

    The top-level object holds "startingState" plus one entry per state.
    Each state entry lists "isTerminatingState", then its labelled
    transitions, then its epsilon targets when it has any.
    """
    document = {"startingState": nfa.get_starting_state().stateName}
    for name, node in nfa.states.items():
        entry = {"isTerminatingState": node.isTerminatingState}
        entry.update(node.transitions)
        if node.epsilon:
            # NOTE(review): the key is spelled "eplison" (sic). It is kept
            # as-is because readers of the JSON may depend on it - confirm
            # before renaming it to "epsilon".
            entry["eplison"] = node.epsilon
        document[name] = entry
    return json.dumps(document, indent=4)


#test the nfa_to_json function
# End-to-end run on a sample regex: explicit concatenation, range
# extraction, postfix conversion, NFA construction, JSON export.
test ="a[a-z][b]"
add_anding_test = add_anding(test)
new_regex, range_list = extract_range(add_anding_test)
postfix = shunting_yard(new_regex)
# Show each intermediate form.
print(add_anding_test)
print(new_regex)
print(postfix)

# create_state reads and increments this global, so it must be set before
# any NFA is built; starting at 0 makes the first state "S0".
counter = 0

# range_list is consumed while the NFA is built (one entry per "#").
nfa = postfix_NFA(postfix, range_list)


print(nfa)
print(nfa.states)

#save the json object to a file
with open("nfa.json", "w") as f:
    f.write(nfa_to_json(nfa))
    


#test the postfix_NFA function
#test ="a(a|b)*b"



a&[a-z]&[b]
a&#&#
a#&#&
startingState: stateName: S0, transitions: {'a': 'S1'}, epsilon: [], isTerminatingState: False, terminatingState: stateName: S5, transitions: {}, epsilon: [], isTerminatingState: True
{'S0': stateName: S0, transitions: {'a': 'S1'}, epsilon: [], isTerminatingState: False, 'S1': stateName: S1, transitions: {}, epsilon: ['S2'], isTerminatingState: False, 'S2': stateName: S2, transitions: {'[a-z]': 'S3'}, epsilon: [], isTerminatingState: False, 'S3': stateName: S3, transitions: {}, epsilon: ['S4'], isTerminatingState: False, 'S4': stateName: S4, transitions: {'[b]': 'S5'}, epsilon: [], isTerminatingState: False, 'S5': stateName: S5, transitions: {}, epsilon: [], isTerminatingState: True}
