From 6b2d26727e606598c405765cb4e1e1875ce12921 Mon Sep 17 00:00:00 2001 From: Damon Harris Date: Sun, 5 Jul 2020 01:07:09 +0530 Subject: [PATCH] Implement tokenizer --- CMakeLists.txt | 4 +- eci.c | 11 +- parse.c | 281 +++++++++++++++++++++++++++++++++++++++++++++++++ parse.h | 59 +++++++++++ utils.c | 13 +++ utils.h | 5 + 6 files changed, 363 insertions(+), 10 deletions(-) create mode 100644 parse.c create mode 100644 parse.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a0bf6b..9796979 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0) # Define the project and executable project(EasyCodeIt C) -add_executable(eci eci.c utils.c) +add_executable(eci utils.c parse.c eci.c) # Enable warnings if(CMAKE_C_COMPILER_ID STREQUAL "GNU") - add_compile_options(-Wall -Wno-maybe-uninitialized -Wno-parentheses -Wpedantic) + add_compile_options(-Wall -Wpedantic -Wextra -Wshadow -Wno-maybe-uninitialized -Wno-parentheses) endif() diff --git a/eci.c b/eci.c index aec9a59..59d3447 100644 --- a/eci.c +++ b/eci.c @@ -21,12 +21,7 @@ #include #include #include "utils.h" - -noreturn void die(char *msg) { - fputs(msg, stderr); - if (*msg != '\0') fputs("\n", stderr); - exit(EXIT_FAILURE); -} +#include "parse.h" int main(int argc, char *argv[]) { if (argc < 2) die("No arguments!"); @@ -39,8 +34,8 @@ int main(int argc, char *argv[]) { char *code = readfile(source_file); if (!code) die("Failed to read from source file!"); - // Output the code - fputs(code, stdout); + // Parse the code + parse(code); // Free the resources free(code); diff --git a/parse.c b/parse.c new file mode 100644 index 0000000..6bd2743 --- /dev/null +++ b/parse.c @@ -0,0 +1,281 @@ +/* + * This file is part of EasyCodeIt. 
+ *
+ * Copyright (C) 2020 TheDcoder
+ *
+ * EasyCodeIt is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include "parse.h"
+#include "utils.h"
+
+const char CHR_COMMENT = ';';
+const char CHR_DIRECTIVE = '#';
+const char CHR_MACRO = '@';
+const char CHR_VARIABLE = '$';
+const char CHR_DOT = '.';
+const char CHR_COMMA = ',';
+
+// Carriage return is included so that CRLF line endings tokenize as whitespace
+char CHRSET_WHITESPACE[] = {' ', '\t', '\n', '\r'};
+char CHRSET_QUOTE[] = {'\'', '"'};
+char CHRSET_OPERATOR[] = {
+	'+', '-', '*', '/', '^',
+	'&',
+	'=', '<', '>',
+	'?', ':',
+};
+// Operators which may be followed by `=` to form a two-character operator
+char CHRSET_OPERATOR_EQUABLE[] = {'+', '-', '*', '/', '^', '&', '='};
+char CHRSET_BRACKET[] = {'[', ']', '(', ')'};
+
+// Directive names (short and long forms) which open and close a multi-line comment
+char STRING_CS[] = "cs";
+char STRING_CE[] = "ce";
+char STRING_COMMENT_START[] = "comments-start";
+char STRING_COMMENT_END[] = "comments-end";
+
+struct TokenCharMapElem {
+	enum TokenType type;
+	union {
+		const char chr;
+		const char *chr_arr;
+	};
+};
+
+// Print a token's type name and its raw data to stdout (debugging aid).
+static void print_token(struct Token *token) {
+	static const char *const type_names[] = {
+		[TOK_UNKNOWN] = "Unknown",
+		[TOK_WHITESPACE] = "Whitespace",
+		[TOK_COMMENT] = "Comment",
+		[TOK_DIRECTIVE] = "Directive",
+		[TOK_NUMBER] = "Number",
+		[TOK_STRING] = "String",
+		[TOK_WORD] = "Word",
+		[TOK_MACRO] = "Macro",
+		[TOK_VARIABLE] = "Variable",
+		[TOK_OPERATOR] = "Operator",
+		[TOK_BRACKET] = "Bracket",
+		[TOK_DOT] = "Dot",
+		[TOK_COMMA] = "Comma",
+	};
+	const size_t type_count = sizeof type_names / sizeof type_names[0];
+
+	// Any value outside the table (or a gap in it) is reported as "Unnamed"
+	const char *type_name = "Unnamed";
+	if ((size_t) token->type < type_count && type_names[token->type]) {
+		type_name = type_names[token->type];
+	}
+
+	puts("---### TOKEN ###---");
+	fputs("Type: ", stdout);
+	puts(type_name);
+	fputs("Data: ", stdout);
+	// The data is a slice of the source buffer and is not NUL-terminated
+	fwrite(token->data, 1, token->data_len, stdout);
+	putchar('\n');
+}
+
+// Tokenize `code` (a NUL-terminated string) and print every non-whitespace token.
+// Terminates the program through die() on the first unknown token.
+void parse(char *code) {
+	while (code) {
+		struct Token token = token_get(code, &code);
+		// token_get() reports the end of input by setting the next position to NULL
+		if (!code) break;
+		if (token.type != TOK_WHITESPACE) print_token(&token);
+		if (token.type == TOK_UNKNOWN) die("!!! Unknown token encountered !!!");
+	}
+}
+
+// Length of the directive name found at the start of `str` (compared without
+// regard to case), trying `short_name` before `long_name`.
+// Returns 0 when `str` starts with neither.
+static size_t directive_length(const char *str, const char *short_name, const char *long_name) {
+	size_t len = strlen(short_name);
+	if (strncasecmp(short_name, str, len) == 0) return len;
+	len = strlen(long_name);
+	if (strncasecmp(long_name, str, len) == 0) return len;
+	return 0;
+}
+
+// Scan the body of a multi-line comment.
+// `delim` points at the whitespace character which follows the opening directive.
+// Sets `token->data` and `token->data_len` to the text between that character
+// and the `#` of the closing directive (or the end of input if the comment is
+// never closed) and returns the position where tokenizing should resume.
+static char *scan_multiline_comment(struct Token *token, char *delim) {
+	token->data = delim + 1;
+
+	// Start on the delimiter itself: if it is a newline then the very next
+	// line is examined too, otherwise the rest of the opening line is skipped
+	char *line = delim;
+	while (true) {
+		line += scan_string(line, char_is_not_eol);
+		if (*line == '\0') {
+			// Unterminated comment, it simply runs until the end of input
+			token->data_len = (size_t) (line - token->data);
+			return line;
+		}
+		++line; // Step over the newline
+		if (*line != CHR_DIRECTIVE) continue;
+
+		char *name = line + 1;
+		size_t name_len = directive_length(name, STRING_CE, STRING_COMMENT_END);
+		if (name_len == 0) continue;
+
+		// The directive name must be a whole word, the end of input also counts
+		char *end = name + name_len;
+		if (*end == '\0' || char_is_whitespace(*end)) {
+			token->data_len = (size_t) (line - token->data);
+			return end;
+		}
+	}
+}
+
+// Read a single token from the start of `code` (a NUL-terminated string).
+// The returned token's data points into `code`, nothing is copied or allocated.
+// `*next` receives the position right after the token, or NULL if `code` is
+// already at the end of input (in which case an empty TOK_UNKNOWN is returned).
+struct Token token_get(char *code, char **next) {
+	struct Token token = {
+		.type = TOK_UNKNOWN,
+		.data = code,
+		.data_len = 0,
+	};
+
+	// Nothing left to tokenize
+	if (*code == '\0') {
+		*next = NULL;
+		return token;
+	}
+
+	size_t length;
+	// Only set when the next token does not start at `token.data + token.data_len`
+	char *next_code = NULL;
+
+	// Identify the token
+	if ((length = scan_string(code, char_is_whitespace)) != 0) {
+		// Whitespace
+		token.type = TOK_WHITESPACE;
+		token.data_len = length;
+	} else if (*code == CHR_COMMENT || *code == CHR_DIRECTIVE) {
+		// Comment or Directive, the data excludes the leading symbol
+		token.type = *code == CHR_COMMENT ? TOK_COMMENT : TOK_DIRECTIVE;
+		token.data = code + 1;
+		token.data_len = scan_string(token.data, char_is_not_eol);
+
+		// A directive which opens a multi-line comment turns into a comment
+		if (token.type == TOK_DIRECTIVE) {
+			size_t name_len = directive_length(token.data, STRING_CS, STRING_COMMENT_START);
+			// Make sure we have a whitespace after the directive
+			if (name_len != 0 && char_is_whitespace(token.data[name_len])) {
+				token.type = TOK_COMMENT;
+				next_code = scan_multiline_comment(&token, token.data + name_len);
+			}
+		}
+	} else if ((length = scan_string(code, char_is_num)) != 0) {
+		// Number
+		token.type = TOK_NUMBER;
+		token.data_len = length;
+	} else if (chrcmp(*code, CHRSET_QUOTE, sizeof CHRSET_QUOTE)) {
+		// String, the data excludes the surrounding quotes
+		const char quote = *code;
+		char *content = code + 1;
+		for (length = 0; content[length] != quote && content[length] != '\0'; ++length);
+		if (content[length] == quote) {
+			token.type = TOK_STRING;
+			token.data = content;
+			token.data_len = length;
+			next_code = content + length + 1;
+		} else {
+			// Unterminated string: report everything from the opening quote
+			// to the end of input as an unknown token
+			token.data_len = length + 1;
+		}
+	} else if ((length = scan_string(code, char_is_alphanum)) != 0) {
+		// Word
+		token.type = TOK_WORD;
+		token.data_len = length;
+	} else if (*code == CHR_MACRO || *code == CHR_VARIABLE) {
+		// Macro or Variable, the data excludes the leading symbol
+		token.type = *code == CHR_MACRO ? TOK_MACRO : TOK_VARIABLE;
+		token.data = code + 1;
+		token.data_len = scan_string(token.data, char_is_alphanum);
+	} else if (char_is_opsym(*code)) {
+		// Operator
+		token.type = TOK_OPERATOR;
+
+		// Include the trailing `=` if possible
+		bool equable = chrcmp(*code, CHRSET_OPERATOR_EQUABLE, sizeof CHRSET_OPERATOR_EQUABLE);
+		token.data_len = code[1] == '=' && equable ? 2 : 1;
+	} else if (char_is_bracket(*code)) {
+		// Bracket (Parenthesis)
+		token.type = TOK_BRACKET;
+		token.data_len = 1;
+	} else if (*code == CHR_DOT) {
+		// Dot (Full Stop)
+		token.type = TOK_DOT;
+		token.data_len = 1;
+	} else if (*code == CHR_COMMA) {
+		// Comma
+		token.type = TOK_COMMA;
+		token.data_len = 1;
+	} else {
+		// Unknown
+		token.data_len = 1;
+	}
+
+	// Set the next code, every branch above consumes at least one character
+	*next = next_code ? next_code : token.data + token.data_len;
+
+	// Return the token
+	return token;
+}
+
+// Count the leading characters of `str` for which `cmpfunc` returns true.
+// `cmpfunc` must return false for '\0', that is what stops the scan at the
+// end of the string.
+size_t scan_string(char *str, bool (cmpfunc)(char)) {
+	size_t len = 0;
+	while (cmpfunc(str[len])) ++len;
+	return len;
+}
+
+bool char_is_whitespace(char chr) {
+	return chrcmp(chr, CHRSET_WHITESPACE, sizeof CHRSET_WHITESPACE);
+}
+
+// The <ctype.h> functions are undefined for negative values (other than EOF),
+// so the character is converted to unsigned char first
+bool char_is_alpha(char chr) {
+	return isalpha((unsigned char) chr) != 0;
+}
+
+bool char_is_num(char chr) {
+	return isdigit((unsigned char) chr) != 0;
+}
+
+// Letters, digits and underscore
+bool char_is_alphanum(char chr) {
+	return char_is_alpha(chr) || char_is_num(chr) || chr == '_';
+}
+
+bool char_is_opsym(char chr) {
+	return chrcmp(chr, CHRSET_OPERATOR, sizeof CHRSET_OPERATOR);
+}
+
+bool char_is_bracket(char chr) {
+	return chrcmp(chr, CHRSET_BRACKET, sizeof CHRSET_BRACKET);
+}
+
+// True for everything except a newline and the terminating NUL
+bool char_is_not_eol(char chr) {
+	return chr != '\n' && chr != '\0';
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 0000000..090c925
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,59 @@
+/*
+ * This file is part of EasyCodeIt.
+ *
+ * Copyright (C) 2020 TheDcoder
+ *
+ * EasyCodeIt is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PARSE_H
+#define PARSE_H
+
+#include <stdbool.h>
+
+enum TokenType {
+	TOK_UNKNOWN,
+	TOK_WHITESPACE,
+	TOK_COMMENT,
+	TOK_DIRECTIVE,
+	TOK_NUMBER,
+	TOK_STRING,
+	TOK_WORD,
+	TOK_MACRO,
+	TOK_VARIABLE,
+	TOK_OPERATOR,
+	TOK_BRACKET,
+	TOK_DOT,
+	TOK_COMMA,
+};
+
+struct Token {
+	enum TokenType type;
+	char *data;
+	size_t data_len;
+};
+
+void parse(char *code);
+struct Token token_get(char *code, char **next);
+size_t scan_string(char *str, bool (cmpfunc)(char));
+
+bool char_is_whitespace(char chr);
+bool char_is_alpha(char chr);
+bool char_is_num(char chr);
+bool char_is_alphanum(char chr);
+bool char_is_opsym(char chr);
+bool char_is_bracket(char chr);
+bool char_is_not_eol(char chr);
+
+#endif
diff --git a/utils.c b/utils.c
index e510ab0..4d0564e 100644
--- a/utils.c
+++ b/utils.c
@@ -20,9 +20,27 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdnoreturn.h>
 #include <string.h>
 #include "utils.h"
 
+// Report whether `chr` occurs among the first `arr_len` elements of `arr`.
+// An empty set (`arr_len` of 0) never matches.
+bool chrcmp(char chr, char *arr, size_t arr_len) {
+	for (size_t i = 0; i < arr_len; ++i) {
+		if (arr[i] == chr) return true;
+	}
+	return false;
+}
+
+// Print `msg` to stderr (followed by a newline unless it is empty) and
+// terminate the program with a failure status. Never returns.
+noreturn void die(char *msg) {
+	fputs(msg, stderr);
+	if (*msg != '\0') fputs("\n", stderr);
+	exit(EXIT_FAILURE);
+}
+
 char *readfile(FILE *file) {
 	// Define the final buffer
 	char *final_buffer = NULL;
diff --git a/utils.h b/utils.h
index 409c944..db8f202 100644
--- a/utils.h
+++ b/utils.h @@ -20,6 +20,9 @@ #ifndef UTILS_H #define UTILS_H +#include +#include + #ifndef READ_FILE_BUFFER_SIZE #define READ_FILE_BUFFER_SIZE 1024 #endif @@ -30,6 +33,8 @@ struct ReadFileBufferNode { struct ReadFileBufferNode *next; }; +bool chrcmp(char chr, char *arr, size_t arr_len); +noreturn void die(char *msg); char *readfile(FILE *file); #endif