From 6b2d26727e606598c405765cb4e1e1875ce12921 Mon Sep 17 00:00:00 2001
From: Damon Harris <TheDcoder@protonmail.com>
Date: Sun, 5 Jul 2020 01:07:09 +0530
Subject: [PATCH] Implement tokenizer

---
 CMakeLists.txt |   4 +-
 eci.c          |  11 +-
 parse.c        | 281 +++++++++++++++++++++++++++++++++++++++++++++++++
 parse.h        |  59 +++++++++++
 utils.c        |  13 +++
 utils.h        |   5 +
 6 files changed, 363 insertions(+), 10 deletions(-)
 create mode 100644 parse.c
 create mode 100644 parse.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a0bf6b..9796979 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
 
 # Define the project and executable
 project(EasyCodeIt C)
-add_executable(eci eci.c utils.c)
+add_executable(eci utils.c parse.c eci.c)
 
 # Enable warnings
 if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
-	add_compile_options(-Wall -Wno-maybe-uninitialized -Wno-parentheses -Wpedantic)
+	add_compile_options(-Wall -Wpedantic -Wextra -Wshadow -Wno-maybe-uninitialized -Wno-parentheses)
 endif()
diff --git a/eci.c b/eci.c
index aec9a59..59d3447 100644
--- a/eci.c
+++ b/eci.c
@@ -21,12 +21,7 @@
 #include <stdlib.h>
 #include <stdnoreturn.h>
 #include "utils.h"
-
-noreturn void die(char *msg) {
-	fputs(msg, stderr);
-	if (*msg != '\0') fputs("\n", stderr);
-	exit(EXIT_FAILURE);
-}
+#include "parse.h"
 
 int main(int argc, char *argv[]) {
 	if (argc < 2) die("No arguments!");
@@ -39,8 +34,8 @@ int main(int argc, char *argv[]) {
 	char *code = readfile(source_file);
 	if (!code) die("Failed to read from source file!");
 	
-	// Output the code
-	fputs(code, stdout);
+	// Parse the code
+	parse(code);
 	
 	// Free the resources
 	free(code);
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..6bd2743
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,281 @@
+/* 
+ * This file is part of EasyCodeIt.
+ * 
+ * Copyright (C) 2020 TheDcoder <TheDcoder@protonmail.com>
+ * 
+ * EasyCodeIt is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <strings.h>
+#include "parse.h"
+#include "utils.h"
+
+const char CHR_COMMENT = ';'; // Starts a single-line comment
+const char CHR_DIRECTIVE = '#'; // Starts a directive, including the multi-line comment directives below
+const char CHR_MACRO = '@'; // Prefix of a macro token
+const char CHR_VARIABLE = '$'; // Prefix of a variable token
+const char CHR_DOT = '.';
+const char CHR_COMMA = ',';
+
+char CHRSET_WHITESPACE[] = {' ', '\t', '\n'}; // The sets are plain arrays without a NUL terminator; pass them together with `sizeof`. NOTE(review): '\r' is not listed, so CRLF input presumably ends up as TOK_UNKNOWN -- confirm
+char CHRSET_QUOTE[] = {'\'', '"'}; // Either character may delimit a string; the closing quote must equal the opening one
+char CHRSET_OPERATOR[] = {
+	'+', '-', '*', '/', '^',
+	'&',
+	'=', '<', '>',
+	'?', ':',
+};
+char CHRSET_OPERATOR_EQUABLE[] = {'+', '-', '*', '/', '^', '&', '='}; // Operators that absorb a directly following '=' into the same token
+char CHRSET_BRACKET[] = {'[', ']', '(', ')'};
+
+char STRING_CS[] = "cs"; // Directive names (without the leading '#') that open/close a multi-line comment; compared case-insensitively
+char STRING_CE[] = "ce";
+char STRING_COMMENT_START[] = "comments-start";
+char STRING_COMMENT_END[] = "comments-end";
+
+struct TokenCharMapElem { // Pairs a token type with a character or a pointer to characters. NOTE(review): not referenced anywhere in this patch
+	enum TokenType type;
+	union { // Anonymous union: exactly one of the two members is meaningful; nothing here records which
+		const char chr;
+		const char *chr_arr;
+	};
+};
+
+static void print_token(struct Token *token) { // Writes a banner, the token's type name and its raw data to stdout
+	puts("---### TOKEN ###---");
+	char *token_type;
+	switch (token->type) { // Translate the enum value into a human readable name
+		case TOK_UNKNOWN:
+			token_type = "Unknown";
+			break;
+		case TOK_WHITESPACE:
+			token_type = "Whitespace";
+			break;
+		case TOK_COMMENT:
+			token_type = "Comment";
+			break;
+		case TOK_DIRECTIVE:
+			token_type = "Directive";
+			break;
+		case TOK_NUMBER:
+			token_type = "Number";
+			break;
+		case TOK_STRING:
+			token_type = "String";
+			break;
+		case TOK_WORD:
+			token_type = "Word";
+			break;
+		case TOK_MACRO:
+			token_type = "Macro";
+			break;
+		case TOK_VARIABLE:
+			token_type = "Variable";
+			break;
+		case TOK_OPERATOR:
+			token_type = "Operator";
+			break;
+		case TOK_BRACKET:
+			token_type = "Bracket";
+			break;
+		case TOK_DOT:
+			token_type = "Dot";
+			break;
+		case TOK_COMMA:
+			token_type = "Comma";
+			break;
+		default: // A value outside the enumerators handled above
+			token_type = "Unnamed";
+			break;
+	}
+	fputs("Type: ", stdout);
+	puts(token_type);
+	fputs("Data: ", stdout);
+	for (size_t c = 0; c < token->data_len; c++) putchar(token->data[c]); // Print exactly data_len characters; the loop does not rely on a NUL terminator
+	putchar('\n');
+}
+
+void parse(char *code) { // Tokenizes `code` from start to end, printing every non-whitespace token
+	while (true) {
+		struct Token token = token_get(code, &code); // `code` is advanced to the resume position, or set to NULL at the end of input
+		if (!code) break; // NOTE(review): the token returned together with a NULL `code` is discarded without being printed or checked
+		if (token.type != TOK_WHITESPACE) print_token(&token);
+		if (token.type == TOK_UNKNOWN) die("!!! Unknown token encountered !!!"); // The offending token has already been printed above
+	}
+	return;
+}
+
+struct Token token_get(char *code, char **next) { // Reads the single token starting at `code`; stores the resume position in `*next`, or NULL once the end of input is reached
+	struct Token token = {
+		.type = TOK_UNKNOWN,
+		.data = NULL,
+		.data_len = 0,
+	};
+	size_t length;
+	char *next_code = NULL; // Explicit resume position for tokens that do not simply end at `code + data_len`
+	
+	// Identify the token; `token.data` points into `code` and is NOT NUL-terminated
+	if (length = scan_string(code, char_is_whitespace)) {
+		// Whitespace
+		token.type = TOK_WHITESPACE;
+		token.data = code;
+		token.data_len = length;
+	} else if (*code == CHR_COMMENT || *code == CHR_DIRECTIVE) {
+		// Comment or Directive
+		token.type = *code == CHR_COMMENT ? TOK_COMMENT : TOK_DIRECTIVE;
+		token.data = ++code;
+		token.data_len = scan_string(code, char_is_not_eol);
+		
+		// Check if this is a multi-line comment
+		bool multiline_comment = false;
+		if (token.type == TOK_DIRECTIVE) {
+			bool match_long = false, match_short; // match_long is read below even when only the short name was tested
+			match_short = strncasecmp(STRING_CS, code, (sizeof STRING_CS) - 1) == 0;
+			if (!match_short) match_long = strncasecmp(STRING_COMMENT_START, code, (sizeof STRING_COMMENT_START) - 1) == 0;
+			// Make sure we have a whitespace after the directive
+			char *comment_start = NULL;
+			if (match_long || match_short) {
+				comment_start = code + (match_long ? sizeof STRING_COMMENT_START : sizeof STRING_CS); // sizeof counts the NUL, so this also skips the whitespace
+				multiline_comment = char_is_whitespace(comment_start[-1]);
+			}
+			if (multiline_comment) {
+				token.type = TOK_COMMENT;
+				token.data = code = comment_start;
+			}
+		}
+		
+		if (multiline_comment) {
+			// Scan for the ending directive token
+			char *comment_end = NULL;
+			while (true) {
+				code += scan_string(code, char_is_not_eol); // Stop ON the line terminator first ...
+				if (*code == '\0' || *++code == '\0') break; // ... so that the NUL terminator is never stepped over
+				if (*code != CHR_DIRECTIVE) continue;
+				
+				bool match_long = false, match_short, match = false;
+				++code;
+				match_short = strncasecmp(STRING_CE, code, (sizeof STRING_CE) - 1) == 0;
+				if (!match_short) match_long = strncasecmp(STRING_COMMENT_END, code, (sizeof STRING_COMMENT_END) - 1) == 0;
+				// Make sure we have a whitespace after the directive
+				if (match_long || match_short) {
+					comment_end = code + ((match_long ? sizeof STRING_COMMENT_END : sizeof STRING_CE) - 1);
+					match = char_is_whitespace(*comment_end);
+				}
+				if (match) break;
+			}
+			token.data_len = (code - token.data) - (*code == '\0' ? 0 : 1); // A closed comment excludes the '#' of its end directive; an unterminated one runs to the end of input
+			next_code = *code == '\0' ? NULL : comment_end; // Unterminated: ignore any stale candidate and use the end-of-input handling below
+		} else {
+			token.data_len = scan_string(code, char_is_not_eol);
+		}
+	} else if (length = scan_string(code, char_is_num)){
+		// Number
+		token.type = TOK_NUMBER;
+		token.data = code;
+		token.data_len = length;
+	} else if (chrcmp(*code, CHRSET_QUOTE, sizeof CHRSET_QUOTE)) {
+		// String
+		token.type = TOK_STRING;
+		const char quote = *code;
+		token.data = code + 1;
+		for (token.data_len = 0; token.data[token.data_len] != quote && token.data[token.data_len] != '\0'; ++token.data_len); // Also stop at the end of input (unterminated string)
+		next_code = token.data + token.data_len + (token.data[token.data_len] == quote); // Skip the closing quote only if there is one
+	} else if (length = scan_string(code, char_is_alphanum)){
+		// Word
+		token.type = TOK_WORD;
+		token.data = code;
+		token.data_len = length;
+	} else if (*code == CHR_MACRO || *code == CHR_VARIABLE){
+		// Macro or Variable
+		token.type = *code == CHR_MACRO ? TOK_MACRO : TOK_VARIABLE;
+		token.data = ++code;
+		token.data_len = scan_string(code, char_is_alphanum);
+	} else if (char_is_opsym(*code)) {
+		// Operator
+		token.type = TOK_OPERATOR;
+		token.data = code;
+		
+		// Include the trailing `=` if possible
+		token.data_len = code[1] == '=' && chrcmp(*code, CHRSET_OPERATOR_EQUABLE, sizeof CHRSET_OPERATOR_EQUABLE) ? 2 : 1;
+	} else if (char_is_bracket(*code)) {
+		// Bracket (Parenthesis)
+		token.type = TOK_BRACKET;
+		token.data = code;
+		token.data_len = 1;
+	} else if (*code == CHR_DOT) {
+		// Dot (Full Stop)
+		token.type = TOK_DOT;
+		token.data = code;
+		token.data_len = 1;
+	} else if (*code == CHR_COMMA) {
+		// Comma
+		token.type = TOK_COMMA;
+		token.data = code;
+		token.data_len = 1;
+	} else {
+		// Unknown
+		token.data = code;
+		token.data_len = 1;
+	}
+	
+	// Set the next code
+	if (next_code) {
+		*next = *next_code == '\0' ? NULL : next_code;
+	} else {
+		*next = *code == '\0' ? NULL : code + token.data_len;
+	}
+	
+	// Return the token
+	return token;
+}
+
+size_t scan_string(char *str, bool (cmpfunc)(char)) { // Returns the length of the leading run of `str` whose characters all satisfy `cmpfunc`
+	size_t len = 0;
+	while (true) {
+		if (!cmpfunc(*str)) break; // The only exit: `cmpfunc` must reject '\0', otherwise the scan runs past the terminator
+		++len; ++str;
+	}
+	return len;
+}
+
+bool char_is_whitespace(char chr) { // True if `chr` is listed in CHRSET_WHITESPACE
+	return chrcmp(chr, CHRSET_WHITESPACE, sizeof CHRSET_WHITESPACE);
+}
+
+bool char_is_alpha(char chr) { // True if `chr` is a letter according to isalpha() in the current locale
+	return isalpha((unsigned char) chr); // <ctype.h> requires an unsigned char value (or EOF); a negative plain char is undefined behaviour
+}
+
+bool char_is_num(char chr) { // True if `chr` is a decimal digit
+	return isdigit((unsigned char) chr); // <ctype.h> requires an unsigned char value (or EOF); a negative plain char is undefined behaviour
+}
+
+bool char_is_alphanum(char chr) { // True for the characters allowed in a name: letters, digits and '_'
+	return char_is_alpha(chr) || char_is_num(chr) || chr == '_';
+}
+
+bool char_is_opsym(char chr) { // True if `chr` is listed in CHRSET_OPERATOR
+	return chrcmp(chr, CHRSET_OPERATOR, sizeof CHRSET_OPERATOR);
+}
+
+bool char_is_bracket(char chr) { // True if `chr` is listed in CHRSET_BRACKET
+	return chrcmp(chr, CHRSET_BRACKET, sizeof CHRSET_BRACKET);
+}
+
+bool char_is_not_eol(char chr) { // True while `chr` is part of the current line; both '\n' and the NUL terminator end a line
+	return chr != '\n' && chr != '\0';
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 0000000..090c925
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,59 @@
+/* 
+ * This file is part of EasyCodeIt.
+ * 
+ * Copyright (C) 2020 TheDcoder <TheDcoder@protonmail.com>
+ * 
+ * EasyCodeIt is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PARSE_H
+#define PARSE_H
+
+#include <stddef.h>
+
+enum TokenType { // Token categories produced by token_get()
+	TOK_UNKNOWN, // Fallback: a single character that matched no other category
+	TOK_WHITESPACE, // Run of whitespace characters
+	TOK_COMMENT, // Text after ';' up to the end of the line, or the body of a multi-line comment
+	TOK_DIRECTIVE, // Text after '#' up to the end of the line
+	TOK_NUMBER, // Run of decimal digits
+	TOK_STRING, // Text between a pair of identical quote characters (quotes excluded)
+	TOK_WORD, // Run of letters, digits and '_' that does not start with a digit
+	TOK_MACRO, // Name following '@'
+	TOK_VARIABLE, // Name following '$'
+	TOK_OPERATOR, // Operator symbol, optionally combined with a trailing '='
+	TOK_BRACKET, // One of '[', ']', '(' or ')'
+	TOK_DOT,
+	TOK_COMMA,
+};
+
+struct Token { // One token; it refers to the tokenized buffer instead of owning a copy
+	enum TokenType type;
+	char *data; // Points into the source text passed to token_get(); NOT NUL-terminated
+	size_t data_len; // Number of characters of `data` that belong to the token
+};
+
+void parse(char *code); // Tokenizes `code` and prints the tokens to stdout; exits via die() on an unknown token
+struct Token token_get(char *code, char **next); // Reads one token at `code`; `*next` receives the resume position or NULL at the end of input
+size_t scan_string(char *str, bool (cmpfunc)(char)); // Length of the leading run of `str` accepted by `cmpfunc`. NOTE(review): `bool` is used here but only <stddef.h> is included above -- includers must pull in <stdbool.h> first
+
+bool char_is_whitespace(char chr); // Character class predicates, usable directly or as `cmpfunc` for scan_string()
+bool char_is_alpha(char chr);
+bool char_is_num(char chr);
+bool char_is_alphanum(char chr); // Letters, digits and '_'
+bool char_is_opsym(char chr);
+bool char_is_bracket(char chr);
+bool char_is_not_eol(char chr); // False for '\n' and for the NUL terminator
+
+#endif
diff --git a/utils.c b/utils.c
index e510ab0..4d0564e 100644
--- a/utils.c
+++ b/utils.c
@@ -20,9 +20,22 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdnoreturn.h>
 #include <string.h>
 #include "utils.h"
 
+bool chrcmp(char chr, char *arr, size_t arr_len) { // Returns true if `chr` equals one of the first `arr_len` elements of `arr`
+	bool present = false; // Defined result for an empty set (arr_len == 0) instead of returning an uninitialized value
+	for (size_t i = 0; i < arr_len; ++i) if (present = chr == arr[i]) break;
+	return present;
+}
+
+noreturn void die(char *msg) { // Prints `msg` to stderr and terminates the process with EXIT_FAILURE
+	fputs(msg, stderr);
+	if (*msg != '\0') fputs("\n", stderr); // No newline is written for an empty message
+	exit(EXIT_FAILURE);
+}
+
 char *readfile(FILE *file) {
 	// Define the final buffer
 	char *final_buffer = NULL;
diff --git a/utils.h b/utils.h
index 409c944..db8f202 100644
--- a/utils.h
+++ b/utils.h
@@ -20,6 +20,9 @@
 #ifndef UTILS_H
 #define UTILS_H
 
+#include <stdbool.h>
+#include <stdnoreturn.h>
+
 #ifndef READ_FILE_BUFFER_SIZE
 #define READ_FILE_BUFFER_SIZE 1024
 #endif
@@ -30,6 +33,8 @@ struct ReadFileBufferNode {
 	struct ReadFileBufferNode *next;
 };
 
+bool chrcmp(char chr, char *arr, size_t arr_len); // True if `chr` equals one of the first `arr_len` elements of `arr`
+noreturn void die(char *msg); // Writes `msg` to stderr (plus a newline unless it is empty) and exits with EXIT_FAILURE
 char *readfile(FILE *file);
 
 #endif