Skip to content

Commit

Permalink
Implement tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
TheDcoder committed Jul 4, 2020
1 parent df204ff commit 6b2d267
Show file tree
Hide file tree
Showing 6 changed files with 363 additions and 10 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Expand Up @@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)

# Define the project and executable
project(EasyCodeIt C)
add_executable(eci eci.c utils.c)
add_executable(eci utils.c parse.c eci.c)

# Enable warnings
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
add_compile_options(-Wall -Wno-maybe-uninitialized -Wno-parentheses -Wpedantic)
add_compile_options(-Wall -Wpedantic -Wextra -Wshadow -Wno-maybe-uninitialized -Wno-parentheses)
endif()
11 changes: 3 additions & 8 deletions eci.c
Expand Up @@ -21,12 +21,7 @@
#include <stdlib.h>
#include <stdnoreturn.h>
#include "utils.h"

/*
 * Report a fatal error and terminate the program.
 *
 * Writes `msg` to stderr, followed by a newline unless `msg` is empty,
 * then exits with EXIT_FAILURE. This function never returns.
 */
noreturn void die(char *msg) {
	// An empty message produces no output at all
	if (msg[0] != '\0') {
		fputs(msg, stderr);
		fputc('\n', stderr);
	}
	exit(EXIT_FAILURE);
}
#include "parse.h"

int main(int argc, char *argv[]) {
if (argc < 2) die("No arguments!");
Expand All @@ -39,8 +34,8 @@ int main(int argc, char *argv[]) {
char *code = readfile(source_file);
if (!code) die("Failed to read from source file!");

// Output the code
fputs(code, stdout);
// Parse the code
parse(code);

// Free the resources
free(code);
Expand Down
281 changes: 281 additions & 0 deletions parse.c
@@ -0,0 +1,281 @@
/*
* This file is part of EasyCodeIt.
*
* Copyright (C) 2020 TheDcoder <TheDcoder@protonmail.com>
*
* EasyCodeIt is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <strings.h>
#include "parse.h"
#include "utils.h"

// Single characters which introduce or form a token on their own.
// A comment or directive extends to the end of the line.
const char CHR_COMMENT = ';';
const char CHR_DIRECTIVE = '#';
// Prefixes of macro and variable names.
const char CHR_MACRO = '@';
const char CHR_VARIABLE = '$';
const char CHR_DOT = '.';
const char CHR_COMMA = ',';

// Character sets, tested for membership with `chrcmp`.
// These must remain arrays (not pointers) because `sizeof` is used to get their element count.
char CHRSET_WHITESPACE[] = {' ', '\t', '\n'};
char CHRSET_QUOTE[] = {'\'', '"'};
char CHRSET_OPERATOR[] = {
'+', '-', '*', '/', '^',
'&',
'=', '<', '>',
'?', ':',
};
// Operators which absorb a directly following `=` into the same token (e.g. `+=`).
char CHRSET_OPERATOR_EQUABLE[] = {'+', '-', '*', '/', '^', '&', '='};
char CHRSET_BRACKET[] = {'[', ']', '(', ')'};

// Directive names (short and long spellings, matched case-insensitively)
// which start and end a multi-line comment.
// `(sizeof NAME) - 1` is used as the name's length, so these must remain arrays too.
char STRING_CS[] = "cs";
char STRING_CE[] = "ce";
char STRING_COMMENT_START[] = "comments-start";
char STRING_COMMENT_END[] = "comments-end";

/*
 * Associates a token type with either a single character (`chr`) or a
 * pointer to several characters (`chr_arr`); the two share storage.
 *
 * NOTE(review): nothing in the visible code uses this struct, presumably it
 * is meant for a character-to-token lookup table -- confirm before relying on it.
 */
struct TokenCharMapElem {
enum TokenType type;
union {
const char chr;
const char *chr_arr;
};
};

static void print_token(struct Token *token) {
puts("---### TOKEN ###---");
char *token_type;
switch (token->type) {
case TOK_UNKNOWN:
token_type = "Unknown";
break;
case TOK_WHITESPACE:
token_type = "Whitespace";
break;
case TOK_COMMENT:
token_type = "Comment";
break;
case TOK_DIRECTIVE:
token_type = "Directive";
break;
case TOK_NUMBER:
token_type = "Number";
break;
case TOK_STRING:
token_type = "String";
break;
case TOK_WORD:
token_type = "Word";
break;
case TOK_MACRO:
token_type = "Macro";
break;
case TOK_VARIABLE:
token_type = "Variable";
break;
case TOK_OPERATOR:
token_type = "Operator";
break;
case TOK_BRACKET:
token_type = "Bracket";
break;
case TOK_DOT:
token_type = "Dot";
break;
case TOK_COMMA:
token_type = "Comma";
break;
default:
token_type = "Unnamed";
break;
}
fputs("Type: ", stdout);
puts(token_type);
fputs("Data: ", stdout);
for (size_t c = 0; c < token->data_len; c++) putchar(token->data[c]);
putchar('\n');
}

/*
 * Tokenize `code` and print every token which is not whitespace.
 *
 * Tokens are fetched one by one with `token_get` until it reports the end
 * of the input by setting the resume position to NULL. The program is
 * terminated via `die` as soon as an unknown token has been printed.
 */
void parse(char *code) {
	char *cursor = code;
	for (;;) {
		struct Token token = token_get(cursor, &cursor);

		// A NULL resume position marks the end of the input
		if (cursor == NULL) return;

		if (token.type != TOK_WHITESPACE) print_token(&token);
		if (token.type == TOK_UNKNOWN) die("!!! Unknown token encountered !!!");
	}
}

/*
 * Returns the length of the directive name which `text` begins with.
 * The comparison is case-insensitive and `short_name` is tried before
 * `long_name`. Returns 0 if `text` begins with neither of them.
 */
static size_t match_directive_name(const char *text, const char *short_name, size_t short_len, const char *long_name, size_t long_len) {
	if (strncasecmp(short_name, text, short_len) == 0) return short_len;
	if (strncasecmp(long_name, text, long_len) == 0) return long_len;
	return 0;
}

/*
 * Read a single token from the start of `code`.
 *
 * code: NUL-terminated source text, must not be NULL.
 * next: receives the position at which the following call should resume.
 *       It is set to NULL only when `code` is already at the end of the
 *       input, in which case the returned token is an empty TOK_UNKNOWN
 *       placeholder which should be discarded.
 *
 * Returns the token. `data` points into `code` (it is NOT NUL-terminated)
 * and `data_len` is the number of characters belonging to the token.
 * The leading `;`, `#`, `@` and `$` characters as well as the quotes around
 * a string are consumed but not included in the data.
 *
 * A string or multi-line comment which is never closed extends to the end
 * of the input instead of reading past the terminator.
 */
struct Token token_get(char *code, char **next) {
	struct Token token = {
		.type = TOK_UNKNOWN,
		.data = code,
		.data_len = 0,
	};
	size_t length;
	// Only set by tokens which consume more characters than their data
	char *next_code = NULL;

	// Nothing left to tokenize
	if (*code == '\0') {
		*next = NULL;
		return token;
	}

	// Identify the token
	if ((length = scan_string(code, char_is_whitespace)) > 0) {
		// Whitespace
		token.type = TOK_WHITESPACE;
		token.data_len = length;
	} else if (*code == CHR_COMMENT || *code == CHR_DIRECTIVE) {
		// Comment or Directive, the data starts after the leading character
		char *body = code + 1;
		token.type = *code == CHR_COMMENT ? TOK_COMMENT : TOK_DIRECTIVE;
		token.data = body;
		token.data_len = scan_string(body, char_is_not_eol);

		// A comment start directive followed by whitespace begins a multi-line comment
		size_t start_len = 0;
		if (token.type == TOK_DIRECTIVE) {
			start_len = match_directive_name(
				body,
				STRING_CS, (sizeof STRING_CS) - 1,
				STRING_COMMENT_START, (sizeof STRING_COMMENT_START) - 1
			);
		}
		if (start_len > 0 && char_is_whitespace(body[start_len])) {
			// Start on the whitespace itself: if it is a newline then the
			// very next line is also checked for the ending directive
			char *cursor = body + start_len;
			token.type = TOK_COMMENT;
			token.data = cursor + 1;

			// Scan line by line for the ending directive
			while (true) {
				cursor += scan_string(cursor, char_is_not_eol);
				if (*cursor == '\0') {
					// Unterminated, the comment takes the rest of the input
					next_code = cursor;
					break;
				}
				++cursor; // Step over the newline
				if (*cursor != CHR_DIRECTIVE) continue;

				char *name = cursor + 1;
				size_t end_len = match_directive_name(
					name,
					STRING_CE, (sizeof STRING_CE) - 1,
					STRING_COMMENT_END, (sizeof STRING_COMMENT_END) - 1
				);
				// The name must be a whole word, the end of input also counts
				if (end_len > 0 && (char_is_whitespace(name[end_len]) || name[end_len] == '\0')) {
					next_code = name + end_len;
					break;
				}
			}

			// `cursor` is at the ending directive (or at the end of input),
			// which is never before the start of the data
			token.data_len = (size_t) (cursor - token.data);
		}
	} else if ((length = scan_string(code, char_is_num)) > 0) {
		// Number
		token.type = TOK_NUMBER;
		token.data_len = length;
	} else if (chrcmp(*code, CHRSET_QUOTE, sizeof CHRSET_QUOTE)) {
		// String, ends at the same kind of quote which started it
		const char quote = *code;
		token.type = TOK_STRING;
		token.data = code + 1;
		length = 0;
		while (token.data[length] != quote && token.data[length] != '\0') ++length;
		token.data_len = length;

		// Step over the closing quote, but never over the terminator
		next_code = token.data + length;
		if (*next_code == quote) ++next_code;
	} else if ((length = scan_string(code, char_is_alphanum)) > 0) {
		// Word
		token.type = TOK_WORD;
		token.data_len = length;
	} else if (*code == CHR_MACRO || *code == CHR_VARIABLE) {
		// Macro or Variable, the data starts after the leading character
		token.type = *code == CHR_MACRO ? TOK_MACRO : TOK_VARIABLE;
		token.data = code + 1;
		token.data_len = scan_string(token.data, char_is_alphanum);
	} else if (char_is_opsym(*code)) {
		// Operator
		token.type = TOK_OPERATOR;

		// Include the trailing `=` if possible
		token.data_len = code[1] == '=' && chrcmp(*code, CHRSET_OPERATOR_EQUABLE, sizeof CHRSET_OPERATOR_EQUABLE) ? 2 : 1;
	} else if (char_is_bracket(*code)) {
		// Bracket (Parenthesis)
		token.type = TOK_BRACKET;
		token.data_len = 1;
	} else if (*code == CHR_DOT) {
		// Dot (Full Stop)
		token.type = TOK_DOT;
		token.data_len = 1;
	} else if (*code == CHR_COMMA) {
		// Comma
		token.type = TOK_COMMA;
		token.data_len = 1;
	} else {
		// Unknown
		token.data_len = 1;
	}

	// Resume right after the data unless the token consumed more than that.
	// The result may point at the terminator, the following call reports the end.
	*next = next_code ? next_code : token.data + token.data_len;

	// Return the token
	return token;
}

/*
 * Count the leading characters of `str` which are accepted by `cmpfunc`.
 *
 * Scanning stops at the first character for which `cmpfunc` returns false,
 * so `cmpfunc` must reject the NUL terminator to keep the scan inside `str`.
 * Returns the number of accepted characters (0 if the first one is rejected).
 */
size_t scan_string(char *str, bool (cmpfunc)(char)) {
	size_t accepted = 0;
	while (cmpfunc(str[accepted])) ++accepted;
	return accepted;
}

// Checks if `chr` is one of the characters in CHRSET_WHITESPACE (space, tab or newline).
bool char_is_whitespace(char chr) {
return chrcmp(chr, CHRSET_WHITESPACE, sizeof CHRSET_WHITESPACE);
}

/*
 * Checks if `chr` is an alphabetic character according to `isalpha`.
 *
 * The value is converted to `unsigned char` first: `char` may be signed and
 * passing a negative value (other than EOF) to `isalpha` is undefined behaviour.
 */
bool char_is_alpha(char chr) {
	return isalpha((unsigned char) chr) != 0;
}

/*
 * Checks if `chr` is a decimal digit according to `isdigit`.
 *
 * The value is converted to `unsigned char` first: `char` may be signed and
 * passing a negative value (other than EOF) to `isdigit` is undefined behaviour.
 */
bool char_is_num(char chr) {
	return isdigit((unsigned char) chr) != 0;
}

// Checks if `chr` may be part of a name: an underscore, a letter or a digit.
bool char_is_alphanum(char chr) {
	if (chr == '_') return true;
	return char_is_alpha(chr) || char_is_num(chr);
}

// Checks if `chr` is one of the operator symbols in CHRSET_OPERATOR.
bool char_is_opsym(char chr) {
return chrcmp(chr, CHRSET_OPERATOR, sizeof CHRSET_OPERATOR);
}

// Checks if `chr` is one of the brackets in CHRSET_BRACKET (square or round, opening or closing).
bool char_is_bracket(char chr) {
return chrcmp(chr, CHRSET_BRACKET, sizeof CHRSET_BRACKET);
}

// Checks that `chr` does not end a line, i.e. it is neither a newline nor the NUL terminator.
bool char_is_not_eol(char chr) {
	const bool ends_line = chr == '\n' || chr == '\0';
	return !ends_line;
}
59 changes: 59 additions & 0 deletions parse.h
@@ -0,0 +1,59 @@
/*
* This file is part of EasyCodeIt.
*
* Copyright (C) 2020 TheDcoder <TheDcoder@protonmail.com>
*
* EasyCodeIt is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef PARSE_H
#define PARSE_H

// <stdbool.h> is required for the `bool` used in the prototypes below,
// without it this header only compiles if the includer happens to provide it
#include <stdbool.h>
#include <stddef.h>

// The kinds of token produced by `token_get`
enum TokenType {
	TOK_UNKNOWN,    // Anything which is not recognized
	TOK_WHITESPACE, // A run of whitespace characters
	TOK_COMMENT,    // A `;` comment or a multi-line comment
	TOK_DIRECTIVE,  // A line starting with `#`
	TOK_NUMBER,     // A run of digits
	TOK_STRING,     // Text enclosed in single or double quotes
	TOK_WORD,       // A run of letters, digits and underscores
	TOK_MACRO,      // A name prefixed with `@`
	TOK_VARIABLE,   // A name prefixed with `$`
	TOK_OPERATOR,   // An operator symbol, optionally followed by `=`
	TOK_BRACKET,    // A single square or round bracket
	TOK_DOT,        // `.`
	TOK_COMMA,      // `,`
};

// A token, its text is a slice of the source code it was read from
struct Token {
	enum TokenType type;
	char *data;      // Start of the text inside the source, NOT NUL-terminated
	size_t data_len; // Number of characters which belong to the token
};

// Tokenizes `code` and prints every token which is not whitespace,
// the program is terminated if an unknown token is encountered.
void parse(char *code);

// Reads the token at the start of `code`. `*next` receives the position to
// continue from, or NULL once the end of the input has been reached.
struct Token token_get(char *code, char **next);

// Returns the number of leading characters of `str` accepted by `cmpfunc`.
size_t scan_string(char *str, bool (cmpfunc)(char));

// Character classification, usable as `cmpfunc` for `scan_string`
bool char_is_whitespace(char chr);
bool char_is_alpha(char chr);
bool char_is_num(char chr);
bool char_is_alphanum(char chr); // Letter, digit or underscore
bool char_is_opsym(char chr);
bool char_is_bracket(char chr);
bool char_is_not_eol(char chr);  // Neither a newline nor the NUL terminator

#endif

0 comments on commit 6b2d267

Please sign in to comment.