Skip to content

Commit

Permalink
url/url: Add URL parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Zer0-One committed Feb 9, 2023
1 parent 159409e commit 60152a2
Show file tree
Hide file tree
Showing 5 changed files with 335 additions and 1 deletion.
2 changes: 2 additions & 0 deletions url/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ cc_library(
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = [
"//util:base_parser",
"//util:string",
"//util:uuid",
"@spdlog",
],
)

Expand Down
250 changes: 250 additions & 0 deletions url/url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
#include "util/string.h"
#include "util/uuid.h"

#include <spdlog/spdlog.h>

#include <array>
#include <cstdint>
#include <optional>
#include <string>
#include <variant>

Expand Down Expand Up @@ -51,4 +54,251 @@ std::string blob_url_create(Origin const &origin) {
return result;
}

// To-Do(dzero): Maybe log the parser state, the current buffer, and the current input position?
//
// Reports a validation error at debug log level. An empty message is ignored.
void UrlParser::validation_error(std::string err) {
    if (err.empty()) {
        return;
    }

    spdlog::debug("Validation Error: {}", err);
}

// https://url.spec.whatwg.org/#concept-url-parser
//
// Public entry point: runs the basic URL parser on input with no pre-existing
// URL and no state override, and hands back whatever it produced.
std::optional<Url> UrlParser::parse(std::string input, std::optional<Url> base, std::optional<std::string> encoding) {
    auto parsed = parse_basic(input, base, encoding, std::nullopt, std::nullopt);

    if (parsed && parsed->scheme == "blob") {
        // To-Do: Resolve blob URL
    }

    return parsed;
}

// https://url.spec.whatwg.org/#concept-basic-url-parser
//
// Sanitises input, then feeds it to the state handlers one code point at a time.
//
// input:          the string to parse; taken by value because it is modified.
// base:           optional base URL, stored for the state handlers.
// encoding:       currently unused (see the To-Do below).
// url:            optional URL to start from; a fresh Url is used when absent.
// state_override: optional state to start in instead of the scheme start state.
//
// Returns std::nullopt on failure or when the input runs out before a handled
// state produces a result; otherwise returns the URL built so far.
std::optional<Url> UrlParser::parse_basic(std::string input, std::optional<Url> base, std::optional<std::string> encoding, std::optional<Url> url, std::optional<ParserState> state_override) {
    base_ = base;
    state_override_ = state_override;

    // If url is not given
    if (!url.has_value()) {
        // Set url to a new URL
        url_ = Url();

        // Remove any leading or trailing C0 control or space from input.
        // Both scans are bounds-checked: calling front()/back() on an empty
        // string is undefined behaviour, which unchecked loops would hit for
        // empty or all-whitespace input.
        std::string::size_type leading = 0;
        while (leading < input.size() && util::is_c0_or_space(input[leading])) {
            ++leading;
        }

        // Erase the whole prefix at once instead of one character per iteration.
        input.erase(0, leading);

        bool leading_trailing_c0 = leading != 0;

        while (!input.empty() && util::is_c0_or_space(input.back())) {
            input.pop_back();

            leading_trailing_c0 = true;
        }

        // If input contains any leading or trailing C0 control or space, validation error
        if (leading_trailing_c0) {
            validation_error("Input contains leading or trailing C0 control or space characters");
        }
    } else {
        url_ = url.value();
    }

    bool tab_newline = false;

    // Remove all ASCII tab or newline from input
    for (auto i = input.begin(); i != input.end();) {
        if (util::is_tab_or_newline(*i)) {
            i = input.erase(i);

            tab_newline = true;
        } else {
            ++i;
        }
    }

    // If input contains any ASCII tab or newline, validation error
    if (tab_newline) {
        validation_error("Input contains ASCII tab or newline");
    }

    // Let state be state override if given, or scheme start state otherwise
    state = state_override_.value_or(ParserState::SchemeStart);

    // To-Do(zero-one): Set encoding to the result of getting an output encoding from encoding
    static_cast<void>(encoding); // Not consumed yet; referenced so the parameter is not flagged as unused.

    buffer = "";

    atSignSeen = false;
    insideBrackets = false;
    passwordTokenSeen = false;

    // Initialize BaseParser with our modified input
    reset(input);

    while (!is_eof()) {
        switch (state) {
            case ParserState::SchemeStart:
                state_scheme_start();
                break;
            case ParserState::Scheme:
                state_scheme();
                break;
            case ParserState::NoScheme:
                state_no_scheme();
                break;
            case ParserState::Failure:
                return std::nullopt;
            case ParserState::Terminate:
                // I use this state where the spec returns "nothing" (i.e, the parser is modifying a given optional URL)
                // Instead of modifying it in-place, I modify a copy and return that instead of nothing.
                return url_;
            default:
                // States without a handler yet: hand back what has been parsed so far.
                return url_;
        }

        // Every handler is followed by a one-step advance; handlers that want
        // the same code point reprocessed compensate by stepping back first.
        advance(1);
    }

    // A handler may request termination while consuming the final code point.
    // The loop then exits before the switch can see that state, so honour it here.
    if (state == ParserState::Terminate) {
        return url_;
    }

    // Input ran out (or failed on the last code point) without producing a result.
    return std::nullopt;
}

// https://url.spec.whatwg.org/#scheme-start-state
//
// Looks at the current code point c:
//  - an alpha starts a scheme: its lowercased form goes into buffer and the
//    parser moves to the scheme state;
//  - otherwise, with no state override, the parser falls back to the no-scheme
//    state and reprocesses the same code point;
//  - otherwise (a state override was given) parsing fails.
void UrlParser::state_scheme_start() {
    auto const c = peek();

    if (util::is_alpha(c)) {
        buffer += util::lowercased(c);

        state = ParserState::Scheme;

        return;
    }

    if (!state_override_.has_value()) {
        state = ParserState::NoScheme;

        // This can underflow pos_; that's ok, because it's incremented again before it's ever used.
        back(1);

        return;
    }

    // This branch is only reached when a state override WAS given.
    validation_error("c is not an alpha and a state override was given");

    state = ParserState::Failure;
}

// https://url.spec.whatwg.org/#scheme-state
void UrlParser::state_scheme(){
if(util::is_alphanumeric(peek()) || peek() == '+' || peek() == '-' || peek() == '.'){
buffer += util::lowercased(peek());
}
else if(peek() == ':'){
if(state_override_.has_value()){
if(special_schemes.contains(url_.scheme) && !special_schemes.contains(buffer)){
state = ParserState::Terminate;

return;
}
if(!special_schemes.contains(url_.scheme) && special_schemes.contains(buffer)){
state = ParserState::Terminate;

return;
}
if((!url_.user.empty() || !url_.passwd.empty() || url_.port.has_value()) && buffer == "file"){
state = ParserState::Terminate;

return;
}
if(url_.scheme == "file" && url_.host.has_value() && url_.host.value().type == HostType::Empty){
state = ParserState::Terminate;

return;
}
}

url_.scheme = buffer;

if(state_override_.has_value()){
if(special_schemes.contains(url_.scheme) && url_.port.has_value() && url_.port.value() == special_schemes.at(url_.scheme)){
url_.port.reset();
}

state = ParserState::Terminate;

return;
}

buffer = "";

if(url_.scheme == "file"){
if(!remaining().starts_with("//")){
validation_error("Remaining does not start with '//'");
}

state = ParserState::File;
}
else if(special_schemes.contains(url_.scheme) && base_.has_value() && base_.value().scheme == url_.scheme){
state = ParserState::SpecialRelativeOrAuthority;
}
else if(special_schemes.contains(url_.scheme)){
state = ParserState::SpecialAuthoritySlashes;
}
else if(remaining().starts_with('/')){
state = ParserState::PathOrAuthority;

advance(1);
}
else{
url_.path = "";

state = ParserState::OpaquePath;
}
}
else if(!state_override_.has_value()){
buffer = "";

state = ParserState::NoScheme;

reset();
}
else{
validation_error("c is not an alphanumeric, '+', '-', '.', or ':', and no state override was given");

state = ParserState::Failure;
}
}

// https://url.spec.whatwg.org/#no-scheme-state
void UrlParser::state_no_scheme(){
// Note: A URL path is defined as either an ASCII string, or an array of ASCII
// strings. So, I take "opaque path" to mean that the path is not empty/null.
if(!base_.has_value() || (!base_.value().path.empty() && peek() != '#')){
validation_error("Base is null, or base has an opaque path and c is not '#'");

state = ParserState::Failure;
}
else if(!base_.value().path.empty() && peek() == '#'){
url_.scheme = base_.value().scheme;
url_.path = base_.value().path;
url_.query = base_.value().query;
url_.fragment = "";

state = ParserState::Fragment;
}
else if(base_.value().scheme != "file"){
state = ParserState::Relative;

back(1);
}
else{
state = ParserState::File;

back(1);
}
}

} // namespace url
73 changes: 72 additions & 1 deletion url/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
#ifndef URL_URL_H_
#define URL_URL_H_

#include "util/base_parser.h"

#include <array>
#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <string_view>
#include <variant>

namespace url {
Expand All @@ -31,10 +35,77 @@ struct Origin {
};

/**
* Generates a new Blob URL for the given origin.
*/
std::string blob_url_create(Origin const &origin);

/**
 * The components of a URL, as filled in by UrlParser.
 */
struct Url {
// Scheme without the trailing ':'. UrlParser lowercases it while collecting it.
std::string scheme;
// Credentials; UrlParser treats the URL as having credentials when either is non-empty.
std::string user;
std::string passwd;
// Host, if any.
std::optional<Host> host;
// Port, if any. UrlParser clears it when it equals the scheme's default port
// during a scheme change with a state override.
std::optional<std::uint16_t> port;
// Path, held as a single string. UrlParser currently treats a non-empty path
// as an "opaque path" (see the note in UrlParser::state_no_scheme).
std::string path;
// Query, if any.
std::optional<std::string> query;
// Fragment, if any.
std::optional<std::string> fragment;
};

/**
 * URL parser modelled on the state machine described at
 * https://url.spec.whatwg.org/#concept-url-parser.
 *
 * Only the scheme start, scheme and no-scheme states have handlers so far.
 * A parser object keeps per-parse state in its members.
 */
class UrlParser final : util::BaseParser {
public:
    UrlParser() : BaseParser{""} {}

    /**
     * Parses input, optionally against a base URL.
     *
     * Returns std::nullopt when parsing fails.
     */
    std::optional<Url> parse(std::string input, std::optional<Url> base = std::nullopt, std::optional<std::string> encoding = "UTF8");

private:
    // One enumerator per state of the spec's state machine, plus two internal
    // ones: Failure (the spec's "return failure") and Terminate (the spec's
    // bare "return" when modifying a given URL).
    enum class ParserState {
        SchemeStart,
        Scheme,
        NoScheme,
        SpecialRelativeOrAuthority,
        PathOrAuthority,
        Relative,
        RelativeSlash,
        SpecialAuthoritySlashes,
        SpecialAuthorityIgnoreSlashes,
        Authority,
        Host,
        Hostname,
        Port,
        File,
        FileSlash,
        FileHost,
        PathStart,
        Path,
        OpaquePath,
        Query,
        Fragment,
        Failure,
        Terminate
    };

    // Special schemes mapped to their default port.
    const std::map<std::string, std::uint16_t> special_schemes = {
            {"ftp", 21}, {"file", 0}, {"http", 80}, {"https", 443}, {"ws", 80}, {"wss", 443}};

    std::optional<Url> parse_basic(std::string input, std::optional<Url> base, std::optional<std::string> encoding, std::optional<Url> url, std::optional<ParserState> state_override);
    void state_scheme_start();
    void state_scheme();
    void state_no_scheme();

    void validation_error(std::string err);

    // The URL being built, the base URL and the state override of the current parse.
    Url url_;
    std::optional<Url> base_;
    std::optional<ParserState> state_override_;

    // Current state of the state machine. Initialised so that the member
    // never holds an indeterminate value before the first parse.
    ParserState state{ParserState::SchemeStart};

    // Scratch buffer the state handlers accumulate into.
    std::string buffer;

    // Parser flags; cleared at the start of every parse.
    bool atSignSeen{false};
    bool insideBrackets{false};
    bool passwordTokenSeen{false};
};

} // namespace url

#endif
9 changes: 9 additions & 0 deletions url/url_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <array>
#include <cstdint>
#include <iostream>
#include <optional>
#include <regex>

using etest::expect;
Expand Down Expand Up @@ -45,5 +46,13 @@ int main() {
blob, std::regex("blob:https://\\[2001:db8:85a3::8a2e:370:7334\\]:8080/" + REGEX_UUID)));
});

etest::test("URL parsing", [] {
url::UrlParser p;
std::optional<url::Url> url = p.parse("https://example.com:8080/index.html");

etest::expect(url.has_value());
etest::expect(url.value().scheme == "https");
});

return etest::run_all_tests();
}
2 changes: 2 additions & 0 deletions util/base_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class BaseParser {

// Returns a view of up to `chars` characters starting at the current position, without consuming them.
constexpr std::string_view peek(std::size_t chars) const {
    auto const ahead = input_.substr(pos_, chars);
    return ahead;
}

// Returns the input that follows the current character (i.e. from pos_ + 1 to the end).
//
// Yields an empty view when there is nothing after the current position;
// an unchecked substr(pos_ + 1) would throw std::out_of_range once the
// position reaches the end of the input.
constexpr std::string_view remaining() const {
    auto const next = pos_ + 1;
    if (next > input_.size()) {
        return std::string_view{};
    }
    return input_.substr(next);
}

// True when the input at the current position begins with `prefix`.
constexpr bool starts_with(std::string_view prefix) const {
    return prefix == peek(prefix.size());
}

// True once the current position has reached or passed the end of the input.
constexpr bool is_eof() const {
    return !(pos_ < input_.size());
}
Expand Down

0 comments on commit 60152a2

Please sign in to comment.