forked from alokmenghrajani/opalang
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jsonLex.ml
161 lines (139 loc) · 5.59 KB
/
jsonLex.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
(*
Copyright © 2011 MLstate
This file is part of OPA.
OPA is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License, version 3, as published by
the Free Software Foundation.
OPA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
more details.
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
module JP = JsonParse
module JT = JsonTypes
(** JSON parser based on syntax described on http://www.json.org/
This lexer uses Ulex, although there is no need for utf-8 support here,
because ocamllex created problems with OPA. *)
(* Types *)
(* TODOk1 - Review this... *)
let regexp t_blank = [' ' '\t' '\n' '\r']
let regexp t_digit = ['0'-'9']
let regexp t_digits = t_digit+
let regexp t_int = '0'| '-'? ['1'-'9'] t_digit*
let regexp t_frac = "." t_digits
let regexp t_e = ("e"|"E") ("+"|"-")?
let regexp t_exp = t_e t_digits
let regexp t_number = (t_int|'0') t_frac? t_exp?
let regexp t_hexa_digit = ['0'-'9''A'-'F''a'-'f']
let regexp t_hexa = t_hexa_digit t_hexa_digit t_hexa_digit t_hexa_digit
let regexp t_ident = ['a'-'z''A'-'Z']['a'-'z''A'-'Z''0'-'9''_']*
let b = Buffer.create 100000
(** Lexing rules *)
let rec get_token = lexer
(* End of stream / file *)
| eof -> JP.EOF
| '{' -> JP.LCURLY
| '}' -> JP.RCURLY
| '[' -> JP.LBRACKET
| ']' -> JP.RBRACKET
| ':' -> JP.COLON
| ',' -> JP.COMMA
| t_int -> JP.INT (int_of_string (Ulexing.utf8_lexeme lexbuf))
| t_number -> JP.FLOAT (float_of_string (Ulexing.utf8_lexeme lexbuf))
| "true" -> JP.TRUE
| "false" -> JP.FALSE
| "null" -> JP.NIL
| "undefined" -> JP.NIL
| "u" -> JP.NIL
(* An ident without quotes *)
(* This is not from JSON spec, it's added only for compatibility in OPA *)
| t_ident -> JP.IDENT (Ulexing.utf8_lexeme lexbuf)
(* Spaces *)
| t_blank -> get_token lexbuf
(* Strings *)
| '\"' -> Buffer.reset b;
get_string b lexbuf
| _ -> failwith ("unknown token: " ^ (Ulexing.utf8_lexeme lexbuf))
(** Parse a string, handle escaping *)
and get_string s = lexer
| eof -> assert false
| '\"' -> JP.STRING (Buffer.contents s)
| "\\\"" -> Buffer.add_char s '"'; get_string s lexbuf
| "\\\\" -> Buffer.add_char s '\\'; get_string s lexbuf
| "\\/" -> Buffer.add_char s '/'; get_string s lexbuf
| "\\b" -> Buffer.add_char s '\b'; get_string s lexbuf
| "\\f" -> Buffer.add_char s '\012'; get_string s lexbuf
| "\\n" -> Buffer.add_char s '\n'; get_string s lexbuf
| "\\r" -> Buffer.add_char s '\r'; get_string s lexbuf
| "\\t" -> Buffer.add_char s '\t'; get_string s lexbuf
| "\\u" t_hexa ->
let lx = Ulexing.utf8_lexeme lexbuf in
let i = int_of_string ("0x"^(String.sub lx 2 4)) in
let res = Cactutf.cons i in
Buffer.add_string s res;
get_string s lexbuf
| [^'\\''\"']+ -> Buffer.add_string s (Ulexing.utf8_lexeme lexbuf); get_string s lexbuf
| _ -> failwith "unterminated string"
(** Print token contained on given string. Used for debug. *)
let print_tokens str =
let pr = function
| JP.EOF -> "eof"
| JP.LCURLY -> "{"
| JP.RCURLY -> "}"
| JP.COLON -> ":"
| JP.LBRACKET -> "["
| JP.RBRACKET -> "]"
| JP.COMMA -> ","
| JP.TRUE -> "true"
| JP.FALSE -> "false"
| JP.NIL -> "null"
| JP.STRING s -> "\"" ^ s ^ "\""
| JP.IDENT s -> "$" ^ s ^ "$"
| JP.INT i -> string_of_int i
| JP.FLOAT f -> string_of_float f
in
let buffer = Ulexing.from_utf8_string str in
Printf.printf "ml json: %S\n\n%!" str;
let tok = ref JP.NIL in
while !tok <> JP.EOF do
tok := get_token buffer;
Printf.printf "token = %s\n" (pr !tok)
done
(** Transform a string to type that you want with given constructor.
[transform emptyM addM emptyL addL cint cfloat cstring cbool cvoid str]
@param emptyM Constructor for an empty record
@param addM Constructor for add a field to a record
@param emptyL Constructor for an empty list
@param emptyL Constructor for add an element to a list
@param cint Constructor for an int
@param cfloat Constructor for a float
@param cstring Constructor for a string
@param cbool Constructor for a bool
@param cvoid Constructor for a void
@return Constructed value
*)
let transform utf8 (*emptyM addM emptyL addL cons_int cons_float cons_string cons_bool cons_void*) str =
(*
(* convert the JSON AST to an OPA type *)
let rec conv = function
| Int i -> cons_int i
| Float f -> cons_float f
| String s -> cons_string s
| Bool b -> cons_bool b
| Void -> cons_void ()
| Array l -> List.fold_right (fun e acc -> addL (conv e) acc) l (emptyL())
| Record l -> List.fold_right (fun (s,v) acc -> addM s (conv v) acc) l (emptyM())
in
*)
(*DEBUG - print_tokens str;*)
let buffer =
if utf8 then Ulexing.from_utf8_string str
else Ulexing.from_latin1_string str
in
let res() =
(* Trick from Alain Frisch to use Ulex with OCamlyacc *)
(* http://caml.inria.fr/pub/ml-archives/caml-list/2005/01/52cbc2cd2be4fc7ea0f00c39a760bf59.en.html *)
JP.json (fun _ -> get_token buffer) (Lexing.from_string "dummy") in
try Some((*conv*) (res())) with _ -> None