-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_document.rb
306 lines (288 loc) · 8.73 KB
/
clean_document.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# frozen_string_literal: true
module SyntaxSuggest
# Parses and sanitizes source into a lexically aware document
#
# Internally the document is represented by an array with each
# index containing a CodeLine correlating to a line from the source code.
#
# There are three main phases in the algorithm:
#
# 1. Sanitize/format input source
# 2. Search for invalid blocks
# 3. Format invalid blocks into something meaningful
#
# This class handles the first part.
#
# The reason this class exists is to format input source
# for better/easier/cleaner exploration.
#
# The CodeSearch class operates at the line level so
# we must be careful to not introduce lines that look
# valid by themselves, but when removed will trigger syntax errors
# or strange behavior.
#
# ## Join Trailing slashes
#
# Code with a trailing slash (a backslash line continuation) is logically treated as a single line:
#
# 1 it "code can be split" \
# 2 "across multiple lines" do
#
# In this case removing line 2 would add a syntax error. We get around
# this by internally joining the two lines into a single "line" object
#
# ## Logically Consecutive lines
#
# Code such as a chain of method calls can be broken
# across multiple lines:
#
# 1 User.
# 2 where(name: "schneems").
# 3 first
#
# Removing line 2 can introduce a syntax error. To fix this, all lines
# are joined into one.
#
# ## Heredocs
#
# A heredoc is a way of defining a multi-line string. They can cause many
# problems. If left as a single line, the parser would try to parse the contents
# as ruby code rather than as a string. Even without this problem, we still
# hit an issue with indentation:
#
# 1 foo = <<~HEREDOC
# 2 "Be yourself; everyone else is already taken."
# 3 ― Oscar Wilde
# 4 puts "I look like ruby code" # but i'm still a heredoc
# 5 HEREDOC
#
# If we didn't join these lines then our algorithm would think that line 4
# is separate from the rest, has a higher indentation, then look at it first
# and remove it.
#
# If the code evaluates line 5 by itself it will think line 5 is a constant,
# remove it, and introduce a syntax error.
#
# All of these problems are fixed by joining the whole heredoc into a single
# line.
#
# ## Comments and whitespace
#
# Comments can throw off the way the lexer tells us that the line
# logically belongs with the next line. This is valid ruby but
# results in a different lex output than before:
#
# 1 User.
# 2 where(name: "schneems").
# 3 # Comment here
# 4 first
#
# To handle this we can replace comment lines with empty lines
# and then re-lex the source. This removal and re-lexing preserves
# line index and document size, but generates an easier to work with
# document.
#
class CleanDocument
def initialize(source:)
  # Blank out comment-only lines first; the cleaned lines are then handed to
  # CodeLine.from_source both joined (as the full source text) and as the
  # per-line array.
  sanitized_lines = clean_sweep(source: source)
  @document = CodeLine.from_source(sanitized_lines.join, lines: sanitized_lines)
end
# Call all of the document "cleaners"
# and return self
def call
  # Each cleaner returns self, so the three passes can be chained; the final
  # link hands self back to the caller. Order: trailing slashes, then
  # consecutive lines, then heredocs.
  join_trailing_slash!
    .join_consecutive!
    .join_heredoc!
end
# Return an array of CodeLines in the
# document
def lines
# Returns the internal @document array itself (not a copy). The join_*
# cleaners replace entries of this same array in place.
@document
end
# Renders the document back to a string
def to_s
# Concatenates every entry of @document via Array#join with no explicit
# separator. Lines hidden by join_groups are built with line: "" and so
# presumably contribute nothing to the output.
@document.join
end
# Remove comments
#
# replace with empty newlines
#
# source = <<~'EOM'
# # Comment 1
# puts "hello"
# # Comment 2
# puts "world"
# EOM
#
# lines = CleanDocument.new(source: source).lines
# expect(lines[0].to_s).to eq("\n")
# expect(lines[1].to_s).to eq("puts \"hello\"\n")
# expect(lines[2].to_s).to eq("\n")
# expect(lines[3].to_s).to eq("puts \"world\"\n")
#
# Important: This must be done before lexing.
#
# After this change is made, we lex the document because
# removing comments can change how the doc is parsed.
#
# For example:
#
# values = LexAll.new(source: <<~EOM)
# User.
# # comment
# where(name: 'schneems')
# EOM
# expect(
# values.count {|v| v.type == :on_ignored_nl}
# ).to eq(1)
#
# After the comment is removed:
#
# values = LexAll.new(source: <<~EOM)
# User.
#
# where(name: 'schneems')
# EOM
# expect(
# values.count {|v| v.type == :on_ignored_nl}
# ).to eq(2)
#
def clean_sweep(source:)
  # Returns an Array<String> with one entry per line of `source`. Every
  # comment-only line is replaced by a bare newline, so line count and line
  # indexes are preserved.
  #
  # The newline is spelled out explicitly (for both splitting and
  # replacement) rather than read from the process-wide `$/` global, whose
  # value can be reassigned elsewhere in the process.
  newline = "\n"

  # Match comments, but not HEREDOC strings with #{variable} interpolation
  # https://rubular.com/r/HPwtW9OYxKUHXQ
  comment_only = /^\s*#([^{].*|)$/

  source.lines(newline).map do |line|
    comment_only.match?(line) ? newline : line
  end
end
# Smushes all heredoc lines into one line
#
# source = <<~'EOM'
# foo = <<~HEREDOC
# lol
# hehehe
# HEREDOC
# EOM
#
# lines = CleanDocument.new(source: source).join_heredoc!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
def join_heredoc!
  # Line indexes of heredoc openers that have not been closed yet (LIFO).
  pending_starts = []
  # Collected [opening_line_index, closing_line_index] pairs.
  spans = []

  lines.each do |code_line|
    code_line.lex.each do |token|
      case token.type
      when :on_heredoc_beg
        pending_starts.push(code_line.index)
      when :on_heredoc_end
        opened_at = pending_starts.pop
        closed_at = code_line.index
        spans.push([opened_at, closed_at])
      end
    end
  end

  # Slice every span out of the document before any joining happens, so each
  # group holds the original, un-joined line objects.
  groups = spans.map { |opened_at, closed_at| @document[opened_at..closed_at] }
  join_groups(groups)
  self
end
# Smushes logically "consecutive" lines
#
# source = <<~'EOM'
# User.
# where(name: 'schneems').
# first
# EOM
#
# lines = CleanDocument.new(source: source).join_consecutive!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
#
# The one known case this doesn't handle is:
#
# Ripper.lex <<~EOM
# a &&
# b ||
# c
# EOM
#
# For some reason this introduces `on_ignored_nl` but with BEG type
#
def join_consecutive!
  # Every line flagged by ignore_newline_not_beg? starts a candidate group.
  starters = @document.select { |code_line| code_line.ignore_newline_not_beg? }

  # A group runs from its starter through the first following line that is
  # no longer flagged (that terminating line is included).
  groups = starters.map do |starter|
    from_starter = (starter.index..)
    take_while_including(from_starter, &:ignore_newline_not_beg?)
  end

  join_groups(groups)
  self
end
# Join lines with a trailing slash
#
# source = <<~'EOM'
# it "code can be split" \
# "across multiple lines" do
# EOM
#
# lines = CleanDocument.new(source: source).join_trailing_slash!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
def join_trailing_slash!
  # Every line reporting trailing_slash? starts a candidate group.
  continued = @document.select { |code_line| code_line.trailing_slash? }

  # A group extends through the first following line that does not report
  # trailing_slash? (that line is included in the group).
  groups = continued.map do |first_line|
    from_first_line = (first_line.index..)
    take_while_including(from_first_line, &:trailing_slash?)
  end

  join_groups(groups)
  self
end
# Helper method for joining "groups" of lines
#
# Input is expected to be type Array<Array<CodeLine>>
#
# The outer array holds the various "groups" while the
# inner array holds code lines.
#
# All code lines are "joined" into the first line in
# their group.
#
# To preserve document size, empty lines are placed
# in the place of the lines that were "joined"
def join_groups(groups)
  groups.each do |group|
    head, *tail = group

    # Groups can overlap (e.g. several continued lines in a row). If the
    # head was already blanked out by an earlier group, skip this one.
    next if @document[head.index].empty?

    # Fold the whole group into the slot of its first line.
    @document[head.index] = CodeLine.new(
      lex: group.map(&:lex).flatten,
      line: group.join,
      index: head.index
    )

    # Keep the document the same size: every other member is replaced by a
    # placeholder. The joined line already carries the members' newlines, so
    # the placeholder text is "" rather than a newline, avoiding doubled
    # newlines when the document is rendered.
    tail.each do |hidden|
      @document[hidden.index] = CodeLine.new(line: "", index: hidden.index, lex: [])
    end
  end

  self
end
# Helper method for grabbing elements from document
#
# Like `take_while` except when it stops
# iterating, it also returns the line
# that caused it to stop
def take_while_including(range = 0..)
  # Slice the requested window out of the document, then locate the first
  # line for which the block returns a falsy value.
  candidates = @document[range]
  stop_at = candidates.index { |line| !yield(line) }

  # The line that ended the run is part of the result; when no line ended
  # it, the whole window is returned.
  stop_at ? candidates[0..stop_at] : candidates
end
end
end