From 33cd2e04380349891a1b93eb8792e13a374ceb08 Mon Sep 17 00:00:00 2001 From: Seggan Date: Mon, 5 Dec 2022 18:10:41 -0500 Subject: [PATCH 1/5] Lexer --- .gitignore | 3 +- src/main.rs | 8 ++++- src/parsing.rs | 70 +++++++++++++++++++++++++++++++++++++++++++ src/parsing/lexer.rs | 62 ++++++++++++++++++++++++++++++++++++++ src/parsing/parser.rs | 6 ++++ 5 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 src/parsing.rs create mode 100644 src/parsing/lexer.rs create mode 100644 src/parsing/parser.rs diff --git a/.gitignore b/.gitignore index be2152e..b9012db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target /.idea -Cargo.lock \ No newline at end of file +Cargo.lock +test.txt \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index e7a11a9..41a84af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,9 @@ +mod parsing; + fn main() { - println!("Hello, world!"); + let args: Vec = std::env::args().collect(); + let file = std::fs::read_to_string(&args[1]).expect("File not found"); + let tokens = parsing::lexer::lex(&file); + println!("{:?}", tokens); + parsing::parser::parse(&tokens); } diff --git a/src/parsing.rs b/src/parsing.rs new file mode 100644 index 0000000..e9f87df --- /dev/null +++ b/src/parsing.rs @@ -0,0 +1,70 @@ +pub mod lexer; +pub mod parser; + +#[derive(Debug)] +pub enum Operator { + Plus, + Minus, + Multiply, + Divide, + Modulo, + Exponent, + + Equal, + NotEqual, + LessThan, + GreaterThan, + LessThanEqual, + GreaterThanEqual, + Or, + And, + Not, + Implies, + + Contains, + NotContains, + Subset, + StrictSubset, + Superset, + StrictSuperset, + Union, + Intersection, + Difference, + SymmetricDifference +} + +impl Operator { + pub fn from_char(c: &char) -> Option { + match c { + '+' => Some(Operator::Plus), + '-' => Some(Operator::Minus), + '*' => Some(Operator::Multiply), + '/' => Some(Operator::Divide), + '%' => Some(Operator::Modulo), + '^' => Some(Operator::Exponent), + + '=' => Some(Operator::Equal), + '≠' => Some(Operator::NotEqual), + '<' => Some(Operator::LessThan), + '>' => Some(Operator::GreaterThan), + '≤' => Some(Operator::LessThanEqual), + '≥' => Some(Operator::GreaterThanEqual), + '∨' => Some(Operator::Or), + '∧' => Some(Operator::And), + '¬' => Some(Operator::Not), + '⇒' => Some(Operator::Implies), + + '∈' => Some(Operator::Contains), + '∉' => Some(Operator::NotContains), + '⊆' => Some(Operator::Subset), + '⊂' => Some(Operator::StrictSubset), + '⊇' => Some(Operator::Superset), + '⊃' => Some(Operator::StrictSuperset), + '∪' => Some(Operator::Union), + '∩' => Some(Operator::Intersection), + '\\' => Some(Operator::Difference), + '∆' => Some(Operator::SymmetricDifference), + _ => None + } + } +} \ No newline at end of file diff --git a/src/parsing/lexer.rs b/src/parsing/lexer.rs new file mode 100644 index 0000000..5b0d29e --- /dev/null +++ b/src/parsing/lexer.rs @@ -0,0 +1,62 @@ +use super::Operator; + +#[derive(Debug)] +pub enum Token { + Operator(Operator), + Identifier(String), + Number(i64), + OpenBrace, + CloseBrace, + OpenParen, + CloseParen, + Quote +} + +pub fn lex(str: &String) -> Vec { + let iter: Vec = str.chars().collect(); + let mut tokens = Vec::new(); + let mut i = 0; + while let Some(next_char) = iter.get(i) { + i += 1; + match next_char { + '(' => tokens.push(Token::OpenParen), + ')' => tokens.push(Token::CloseParen), + '{' => tokens.push(Token::OpenBrace), + '}' => tokens.push(Token::CloseBrace), + '"' => tokens.push(Token::Quote), + ' ' => continue, + _ => { + if next_char.is_ascii_digit() { + let mut number = String::new(); + number.push(*next_char); + while let Some(next_char) = iter.get(i) { + if next_char.is_ascii_digit() { + number.push(*next_char); + i += 1; + } else { + break; + } + } + tokens.push(Token::Number(number.parse().unwrap())); + } else if next_char.is_ascii_alphabetic() { + let mut identifier = String::new(); + identifier.push(*next_char); + while let Some(next_char) = iter.get(i) { + if next_char.is_ascii_alphanumeric() { + identifier.push(*next_char); + i += 1; + } else { + break; + } + } + tokens.push(Token::Identifier(identifier)); + } else { + if let Some(operator) = Operator::from_char(next_char) { + tokens.push(Token::Operator(operator)); + } + } + } + } + } + tokens +} \ No newline at end of file diff --git a/src/parsing/parser.rs b/src/parsing/parser.rs new file mode 100644 index 0000000..0e62786 --- /dev/null +++ b/src/parsing/parser.rs @@ -0,0 +1,6 @@ +use super::lexer; +use super::Operator; + +pub fn parse(tokens: &[lexer::Token]) { + +} \ No newline at end of file From e0459a322379b1e98a6b5bca298138618ec09bb6 Mon Sep 17 00:00:00 2001 From: Seggan Date: Mon, 5 Dec 2022 21:22:22 -0500 Subject: [PATCH 2/5] Half done manual parser --- src/main.rs | 2 ++ src/parsing.rs | 10 +++++++++- src/parsing/ast.rs | 10 ++++++++++ src/parsing/parser.rs | 32 +++++++++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 src/parsing/ast.rs diff --git a/src/main.rs b/src/main.rs index 41a84af..d0444ee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,5 @@ +extern crate core; + mod parsing; fn main() { diff --git a/src/parsing.rs b/src/parsing.rs index e9f87df..2586244 100644 --- a/src/parsing.rs +++ b/src/parsing.rs @@ -1,7 +1,8 @@ pub mod lexer; pub mod parser; +pub mod ast; -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum Operator { Plus, Minus, @@ -67,4 +68,11 @@ impl Operator { _ => None } } + + fn arity(&self) -> i32 { + match self { + Operator::Not => 1, + _ => 2 + } + } } \ No newline at end of file diff --git a/src/parsing/ast.rs b/src/parsing/ast.rs new file mode 100644 index 0000000..a18d730 --- /dev/null +++ b/src/parsing/ast.rs @@ -0,0 +1,10 @@ +use std::fmt::Debug; +use super::Operator; + +#[derive(Debug)] +pub enum AstNode { + Number(i64), + Variable(String), + UnaryExpression(Operator, Box), + BinaryExpression(Box, Operator, Box) +} \ No newline at end of file diff --git a/src/parsing/parser.rs b/src/parsing/parser.rs index 0e62786..53dd9c5 100644 --- a/src/parsing/parser.rs +++ b/src/parsing/parser.rs @@ -1,6 +1,32 @@ -use super::lexer; -use super::Operator; +use super::lexer::Token; +use super::ast::AstNode; -pub fn parse(tokens: &[lexer::Token]) { +pub fn parse(tokens: &[Token]) -> AstNode { + parse_expression(tokens, &mut 0) +} +fn parse_expression(tokens: &[Token], index: &mut usize) -> AstNode { + let next = &tokens[*index]; + return match next { + Token::Number(number) => AstNode::Number(*number), + Token::Identifier(variable) => AstNode::Variable(variable.clone()), + Token::Operator(operator) => { + // we have a unary operator + if operator.arity() == 1 { + AstNode::UnaryExpression(*operator, Box::new(parse_expression(tokens, index))) + } else { + panic!("Invalid binary operator"); + } + } + Token::OpenBrace => todo!(), + Token::CloseBrace => panic!("Invalid closing brace"), + Token::OpenParen => { + *index += 1; + let expression = parse_expression(tokens, index); + *index += 1; + expression + }, + Token::CloseParen => panic!("Invalid closing parenthesis"), + Token::Quote => todo!() + } } \ No newline at end of file From 639dc1a208660640de3b14115f87d10ce1c307a1 Mon Sep 17 00:00:00 2001 From: Seggan Date: Tue, 6 Dec 2022 13:24:53 -0500 Subject: [PATCH 3/5] Rudimentary full parser --- src/main.rs | 3 ++- src/parsing/ast.rs | 11 +++++++-- src/parsing/parser.rs | 52 +++++++++++++++++++++++++------------------ 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/main.rs b/src/main.rs index d0444ee..6b17a15 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,5 +7,6 @@ fn main() { let file = std::fs::read_to_string(&args[1]).expect("File not found"); let tokens = parsing::lexer::lex(&file); println!("{:?}", tokens); - parsing::parser::parse(&tokens); + let parsed = parsing::parser::parse(&tokens); + println!("{:?}", parsed); } diff --git a/src/parsing/ast.rs b/src/parsing/ast.rs index a18d730..6277dfa 100644 --- a/src/parsing/ast.rs +++ b/src/parsing/ast.rs @@ -5,6 +5,13 @@ use super::Operator; pub enum AstNode { Number(i64), Variable(String), - UnaryExpression(Operator, Box), - BinaryExpression(Box, Operator, Box) + UnaryExpression { + operator: Operator, + operand: Box + }, + BinaryExpression { + left: Box, + operator: Operator, + right: Box + } } \ No newline at end of file diff --git a/src/parsing/parser.rs b/src/parsing/parser.rs index 53dd9c5..190d3d2 100644 --- a/src/parsing/parser.rs +++ b/src/parsing/parser.rs @@ -1,32 +1,40 @@ use super::lexer::Token; use super::ast::AstNode; -pub fn parse(tokens: &[Token]) -> AstNode { +pub fn parse(tokens: &Vec) -> Option { parse_expression(tokens, &mut 0) } -fn parse_expression(tokens: &[Token], index: &mut usize) -> AstNode { - let next = &tokens[*index]; - return match next { - Token::Number(number) => AstNode::Number(*number), - Token::Identifier(variable) => AstNode::Variable(variable.clone()), - Token::Operator(operator) => { - // we have a unary operator - if operator.arity() == 1 { - AstNode::UnaryExpression(*operator, Box::new(parse_expression(tokens, index))) - } else { - panic!("Invalid binary operator"); +fn parse_expression(tokens: &[Token], index: &mut usize) -> Option { + let next = try_parse_literal(tokens, index); + return if let Some(next) = next { + let token = tokens.get(*index); + if let Some(Token::Operator(operator)) = token { + *index += 1; + let right = parse_expression(tokens, index); + if let Some(right) = right { + return Some(AstNode::BinaryExpression { + left: Box::new(next), + operator: *operator, + right: Box::new(right) + }); } } - Token::OpenBrace => todo!(), - Token::CloseBrace => panic!("Invalid closing brace"), - Token::OpenParen => { - *index += 1; - let expression = parse_expression(tokens, index); - *index += 1; - expression - }, - Token::CloseParen => panic!("Invalid closing parenthesis"), - Token::Quote => todo!() + Some(next) + } else { + None + } +} + +fn try_parse_literal(tokens: &[Token], index: &mut usize) -> Option { + let token = &tokens[*index]; + let result = match token { + Token::Number(n) => Some(AstNode::Number(*n)), + Token::Identifier(v) => Some(AstNode::Variable(v.clone())), + _ => None + }; + if result.is_some() { + *index += 1; } + result } \ No newline at end of file From ffa423bfc5c3c7dcdda3bc5c475889ba909c2ccc Mon Sep 17 00:00:00 2001 From: Seggan Date: Tue, 6 Dec 2022 17:59:57 -0500 Subject: [PATCH 4/5] Finish basic parser with operator precedence --- Cargo.toml | 1 + src/main.rs | 4 +- src/parsing/ast.rs | 17 ---- src/parsing/lexer.rs | 21 +++-- src/parsing/mod.rs | 7 ++ src/{parsing.rs => parsing/operator.rs} | 37 ++++++-- src/parsing/parser.rs | 115 ++++++++++++++++++------ 7 files changed, 145 insertions(+), 57 deletions(-) delete mode 100644 src/parsing/ast.rs create mode 100644 src/parsing/mod.rs rename src/{parsing.rs => parsing/operator.rs} (61%) diff --git a/Cargo.toml b/Cargo.toml index 52467fb..8d01d82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +num-bigint = "0.4.3" diff --git a/src/main.rs b/src/main.rs index 6b17a15..82351e6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,8 +5,8 @@ mod parsing; fn main() { let args: Vec = std::env::args().collect(); let file = std::fs::read_to_string(&args[1]).expect("File not found"); - let tokens = parsing::lexer::lex(&file); + let tokens = parsing::lex(&file); println!("{:?}", tokens); - let parsed = parsing::parser::parse(&tokens); + let parsed = parsing::parse(&tokens); println!("{:?}", parsed); } diff --git a/src/parsing/ast.rs b/src/parsing/ast.rs deleted file mode 100644 index 6277dfa..0000000 --- a/src/parsing/ast.rs +++ /dev/null @@ -1,17 +0,0 @@ -use std::fmt::Debug; -use super::Operator; - -#[derive(Debug)] -pub enum AstNode { - Number(i64), - Variable(String), - UnaryExpression { - operator: Operator, - operand: Box - }, - BinaryExpression { - left: Box, - operator: Operator, - right: Box - } -} \ No newline at end of file diff --git a/src/parsing/lexer.rs b/src/parsing/lexer.rs index 5b0d29e..d1fcd72 100644 --- a/src/parsing/lexer.rs +++ b/src/parsing/lexer.rs @@ -1,15 +1,16 @@ +use num_bigint::BigInt; use super::Operator; #[derive(Debug)] pub enum Token { Operator(Operator), Identifier(String), - Number(i64), + Number(BigInt), + String(String), OpenBrace, CloseBrace, OpenParen, - CloseParen, - Quote + CloseParen } pub fn lex(str: &String) -> Vec { @@ -23,7 +24,17 @@ pub fn lex(str: &String) -> Vec { ')' => tokens.push(Token::CloseParen), '{' => tokens.push(Token::OpenBrace), '}' => tokens.push(Token::CloseBrace), - '"' => tokens.push(Token::Quote), + '"' => { + let mut string = String::new(); + while let Some(next_char) = iter.get(i) { + i += 1; + if *next_char == '"' && iter[i - 2] != '\\' { + break; + } + string.push(*next_char); + } + tokens.push(Token::String(string)); + } ' ' => continue, _ => { if next_char.is_ascii_digit() { @@ -37,7 +48,7 @@ pub fn lex(str: &String) -> Vec { break; } } - tokens.push(Token::Number(number.parse().unwrap())); + tokens.push(Token::Number(BigInt::parse_bytes(number.as_bytes(), 10).unwrap())); } else if next_char.is_ascii_alphabetic() { let mut identifier = String::new(); identifier.push(*next_char); diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs new file mode 100644 index 0000000..1df2fe3 --- /dev/null +++ b/src/parsing/mod.rs @@ -0,0 +1,7 @@ +mod lexer; +mod operator; +mod parser; + +pub use lexer::lex; +pub use parser::parse; +pub use operator::Operator; \ No newline at end of file diff --git a/src/parsing.rs b/src/parsing/operator.rs similarity index 61% rename from src/parsing.rs rename to src/parsing/operator.rs index 2586244..d857b0e 100644 --- a/src/parsing.rs +++ b/src/parsing/operator.rs @@ -1,8 +1,6 @@ -pub mod lexer; -pub mod parser; -pub mod ast; +use std::fmt::Debug; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum Operator { Plus, Minus, @@ -10,6 +8,7 @@ pub enum Operator { Divide, Modulo, Exponent, + Negate, Equal, NotEqual, @@ -31,7 +30,8 @@ pub enum Operator { Union, Intersection, Difference, - SymmetricDifference + SymmetricDifference, + Size } impl Operator { @@ -43,6 +43,7 @@ impl Operator { '/' => Some(Operator::Divide), '%' => Some(Operator::Modulo), '^' => Some(Operator::Exponent), + '¯' => Some(Operator::Negate), '=' => Some(Operator::Equal), '≠' => Some(Operator::NotEqual), @@ -65,14 +66,36 @@ impl Operator { '∩' => Some(Operator::Intersection), '\\' => Some(Operator::Difference), '∆' => Some(Operator::SymmetricDifference), + '#' => Some(Operator::Size), _ => None } } - fn arity(&self) -> i32 { + pub fn arity(&self) -> i32 { match self { - Operator::Not => 1, + Operator::Not | Operator::Negate | Operator::Size => 1, _ => 2 } } + + pub fn precedence(&self) -> i32 { + match self { + Operator::Contains | Operator::NotContains | Operator::Subset | Operator::StrictSubset + | Operator::Superset | Operator::StrictSuperset | Operator::Union | Operator::Intersection + | Operator::Difference | Operator::SymmetricDifference => 1, + + Operator::Or | Operator::And | Operator::Implies => 2, + + Operator::Equal | Operator::NotEqual | Operator::LessThan | Operator::GreaterThan + | Operator::LessThanEqual | Operator::GreaterThanEqual => 3, + + Operator::Plus | Operator::Minus => 4, + + Operator::Multiply | Operator::Divide | Operator::Modulo => 5, + + Operator::Exponent => 6, + + Operator::Not | Operator::Negate | Operator::Size => 7 + } + } } \ No newline at end of file diff --git a/src/parsing/parser.rs b/src/parsing/parser.rs index 190d3d2..02e399e 100644 --- a/src/parsing/parser.rs +++ b/src/parsing/parser.rs @@ -1,40 +1,103 @@ +use num_bigint::BigInt; use super::lexer::Token; -use super::ast::AstNode; +use super::Operator; pub fn parse(tokens: &Vec) -> Option { - parse_expression(tokens, &mut 0) + parse_expression(tokens) } -fn parse_expression(tokens: &[Token], index: &mut usize) -> Option { - let next = try_parse_literal(tokens, index); - return if let Some(next) = next { - let token = tokens.get(*index); - if let Some(Token::Operator(operator)) = token { - *index += 1; - let right = parse_expression(tokens, index); - if let Some(right) = right { - return Some(AstNode::BinaryExpression { - left: Box::new(next), - operator: *operator, - right: Box::new(right) - }); +fn parse_expression(tokens: &[Token]) -> Option { + let lowest = lowest_op(tokens); + if let Some(lowest_index) = lowest { + let op = match &tokens[lowest_index] { + Token::Operator(op) => op, + _ => panic!("Expected operator token") + }; + let right = Box::new(parse_expression(&tokens[lowest_index + 1..]) + .expect("Failed to parse right side of expression")); + return if op.arity() == 1 { + Some(AstNode::UnaryExpression(*op, right)) + } else if let Some(left) = parse_expression(&tokens[..lowest_index]) { + Some(AstNode::BinaryExpression(Box::new(left), *op, right)) + } else { + None + }; + } else { + parse_atom(tokens) + } +} + +/// Returns the index of the rightmost operator with the lowest precedence +fn lowest_op(tokens: &[Token]) -> Option { + let mut lowest = i32::MAX; + let mut lowest_index = 0; + let mut depth = 0; + for (i, token) in tokens.iter().enumerate() { + match token { + Token::Operator(op) => { + // <= because we want the rightmost operator + if depth == 0 && op.precedence() <= lowest { + lowest = op.precedence(); + lowest_index = i; + } } + Token::OpenParen => depth += 1, + Token::CloseParen => depth -= 1, + _ => {} } - Some(next) + } + if lowest == i32::MAX { None } else { Some(lowest_index) } +} + +fn parse_atom(tokens: &[Token]) -> Option { + let token = tokens.first(); + if let Some(token) = token { + let result = match token { + Token::Number(n) => Some(AstNode::Number(n.clone())), + Token::Identifier(v) => Some(AstNode::Variable(v.clone())), + Token::OpenParen => { + let mut depth = 1; + let mut i = 1; + while depth > 0 { + match tokens[i] { + Token::OpenParen => depth += 1, + Token::CloseParen => depth -= 1, + _ => {} + } + i += 1; + } + parse_expression(&tokens[1..i - 1]) + } + _ => None + }; + result } else { None } } -fn try_parse_literal(tokens: &[Token], index: &mut usize) -> Option { - let token = &tokens[*index]; - let result = match token { - Token::Number(n) => Some(AstNode::Number(*n)), - Token::Identifier(v) => Some(AstNode::Variable(v.clone())), - _ => None - }; - if result.is_some() { - *index += 1; +#[derive(Debug, Clone)] +pub enum AstNode { + Number(BigInt), + Variable(String), + UnaryExpression(Operator, Box), + BinaryExpression(Box, Operator, Box), +} + +impl AstNode { + pub fn walk(&self, f: &dyn Fn(&AstNode)) -> AstNode { + f(self); + match self { + AstNode::Number(_) => self.clone(), + AstNode::Variable(_) => self.clone(), + AstNode::UnaryExpression(op, node) => + AstNode::UnaryExpression(*op, Box::new(node.walk(f))), + AstNode::BinaryExpression(left, op, right) => + AstNode::BinaryExpression( + Box::new(left.walk(f)), + *op, + Box::new(right.walk(f)), + ) + } } - result } \ No newline at end of file From e8ebe3925ab27ef41756ce6eb7b319d5e6409e1c Mon Sep 17 00:00:00 2001 From: Seggan Date: Tue, 6 Dec 2022 18:12:00 -0500 Subject: [PATCH 5/5] Remove exponentiation --- src/parsing/operator.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/parsing/operator.rs b/src/parsing/operator.rs index d857b0e..9919953 100644 --- a/src/parsing/operator.rs +++ b/src/parsing/operator.rs @@ -7,7 +7,6 @@ pub enum Operator { Multiply, Divide, Modulo, - Exponent, Negate, Equal, @@ -42,7 +41,6 @@ impl Operator { '*' => Some(Operator::Multiply), '/' => Some(Operator::Divide), '%' => Some(Operator::Modulo), - '^' => Some(Operator::Exponent), '¯' => Some(Operator::Negate), '=' => Some(Operator::Equal), @@ -93,8 +91,6 @@ impl Operator { Operator::Multiply | Operator::Divide | Operator::Modulo => 5, - Operator::Exponent => 6, - Operator::Not | Operator::Negate | Operator::Size => 7 } }