In [1]:
import re

# Ordered token specification: the first alternative that matches at a given
# position wins, so order is significant.
#   * KEYWORD precedes IDENTIFIER, and carries a negative lookahead so that a
#     keyword prefix of a longer name ("integer", "iffy") is not split off.
#   * Two-character operators precede their one-character prefixes so that
#     "<=", ">=", "==" and "!=" are each produced as a single token.
#   * INVALID is a one-character catch-all placed last; without it
#     re.finditer would silently skip unrecognised characters.
_TOKEN_SPEC = (
    ('KEYWORD', r'(?:if|else|while|for|int|float|string|return)(?![a-zA-Z0-9_])'),
    ('IDENTIFIER', r'[a-zA-Z_][a-zA-Z0-9_]*'),
    ('NUMBER', r'\d+(?:\.\d+)?'),
    ('STRING', r'"[^"]*"'),  # Double-quoted, no escape sequences
    ('OPERATOR', r'<=|>=|==|!=|\+|-|\*|/|=|<|>'),
    ('SEPARATOR', r'[()\[\]{},;]'),
    ('WHITESPACE', r'\s+'),
    ('INVALID', r'\S'),
)

# One combined pattern with a named group per token type, so the type of each
# match can be read from ``match.lastgroup`` instead of re-matching the text.
_TOKEN_RE = re.compile(
    '|'.join(f'(?P<{name}>{regex})' for name, regex in _TOKEN_SPEC)
)


def tokenize(code):
    """
    Tokenizes a given string of code.

    The input is scanned left to right. Whitespace separates tokens and is
    not included in the result. Any character that does not start a
    recognised token is reported as a one-character 'INVALID' token rather
    than being dropped.

    Args:
        code: The input code string.

    Returns:
        A list of tokens. Each token is a tuple containing the token type and
        its value. The token type is one of 'KEYWORD', 'IDENTIFIER',
        'NUMBER', 'STRING', 'OPERATOR', 'SEPARATOR' or 'INVALID'.
    """
    tokens = []

    for match in _TOKEN_RE.finditer(code):
        token_type = match.lastgroup

        if token_type == 'WHITESPACE':
            continue  # Skip whitespace

        tokens.append((token_type, match.group(0)))

    return tokens

# Sample C-like program used to exercise the tokenizer.
code = """
int main() {
  int x = 10;
  if (x > 5) {
    x = x + 1;
  }
  return 0;
}
"""

# Tokenize the sample once and keep the result for inspection.
tokens = tokenize(code)

# Print one line per token: the matched text first, then its category.
for token_type, token_value in tokens:
    print(f"Token: {token_value}, Type: {token_type}")


Token: int, Type: KEYWORD
Token: main, Type: IDENTIFIER
Token: (, Type: SEPARATOR
Token: ), Type: SEPARATOR
Token: {, Type: SEPARATOR
Token: int, Type: KEYWORD
Token: x, Type: IDENTIFIER
Token: =, Type: OPERATOR
Token: 10, Type: NUMBER
Token: ;, Type: SEPARATOR
Token: if, Type: KEYWORD
Token: (, Type: SEPARATOR
Token: x, Type: IDENTIFIER
Token: >, Type: OPERATOR
Token: 5, Type: NUMBER
Token: ), Type: SEPARATOR
Token: {, Type: SEPARATOR
Token: x, Type: IDENTIFIER
Token: =, Type: OPERATOR
Token: x, Type: IDENTIFIER
Token: +, Type: OPERATOR
Token: 1, Type: NUMBER
Token: ;, Type: SEPARATOR
Token: }, Type: SEPARATOR
Token: return, Type: KEYWORD
Token: 0, Type: NUMBER
Token: ;, Type: SEPARATOR
Token: }, Type: SEPARATOR
