# regexmagic.py -- forked from jiffyclub/regexmagic
"""regexmagic provides several line and cell magics for the IPython
Notebook, which run regular expressions against lines of text without
the clutter of re.search(...) calls. The output is colorized to show
the span of each match.

Available cell magics:

    matchlines
    imatchlines

Available line magics:

    matchfile
    imatchfile

Example usage:

    %%matchlines a+b
    this text has no matches
    this line has one match: aaab
    about to match some more: aab

Note: IPython presently interprets {x} to mean 'expand variable x', so
regular expressions like '\d{4}' must be written as '\d{{4}}'.
We're working on it...
"""
# This file is copyright 2013 by Matt Davis and Greg Wilson and
# covered by the license at
# https://github.com/gvwilson/regexmagic/blob/master/LICENSE
import re
from IPython.core.magic import Magics, magics_class, line_magic, cell_magic
from IPython.display import display, Javascript, HTML, clear_output
import IPython.html.widgets as widgets
# HTML formatting templates; each is filled in with str.format().

# Header line showing the pattern that was applied ({0} = pattern).
PATTERN_TEMPL = (
    '<span style="font-weight: bold">Pattern:</span> '
    '<span style="color:DarkGreen; font-weight:bold; font-family: monospace; '
    'background-color: lightgray; white-space: pre;">{0}</span>'
)

# Header line shown instead when the pattern does not compile ({0} = pattern).
ERROR_TEMPL = (
    '<span style="font-weight: bold">Invalid regex:</span> '
    '<span style="color:Red; font-weight:bold; font-family: monospace; '
    'background-color: lightgray; white-space: pre;">{0}</span>'
)

# Line listing the active options ({0} = comma-separated option names).
OPTS_TEMPL = (
    '<span style="font-weight: bold">Options:</span> '
    '<span style="font-style:italic;">{0}</span>'
)

# A highlighted match ({0} = background color, {1} = matched text).
MATCH_TEMPL = (
    '<span style="background:{0}; font-weight:bold; '
    'white-space: pre;">{1}</span>'
)

# Text that is not part of any match ({0} = text).
NOMATCH_TEMPL = '<span style="color:gray; white-space: pre;">{0}</span>'
@magics_class
class RegexMagic(Magics):
    '''Provide the calling points for the magic, and keep track of
    alternating colors while matching.
    '''

    # background colors applied alternately to successive matches
    Colors = ['Pink', 'Yellow']

    @staticmethod
    def _escape_html(raw):
        '''Return `raw` with the characters that are special in HTML element
        content ('&', '<' and '>') replaced by entities, so that the text
        is shown literally instead of being interpreted as markup.
        '''
        # '&' must be replaced first, otherwise the entities produced for
        # '<' and '>' would themselves be escaped a second time
        return (raw.replace('&', '&amp;')
                   .replace('<', '&lt;')
                   .replace('>', '&gt;'))

    @line_magic
    def imatchfile(self, line, cell=None):
        '''Perform regular expression matching on the given file, using an
        interactively provided pattern.

        Usage: %imatchfile <filename>

        See the `%%imatchlines` cell magic for further documentation
        and a description of available options.
        '''
        filename = line.strip()
        with open(filename, 'r') as reader:
            text = reader.read()
        return self.imatchlines('', text)

    @line_magic
    def matchfile(self, line, cell=None):
        '''Perform regular expression matching on the given file using the
        given pattern.

        Usage: %matchfile <filename> <pattern>

        The filename ends at the first space; everything after that space
        is taken verbatim as the pattern.  Raises ValueError when no
        pattern is supplied.

        See the `%%matchlines` cell magic for further documentation.
        '''
        parts = line.split(' ', 1)
        if len(parts) != 2:
            # same exception type as a failed tuple unpacking, but with a
            # message that tells the user what was expected
            raise ValueError('Usage: %matchfile <filename> <pattern>')
        filename, pattern = parts
        filename = filename.strip()
        with open(filename, 'r') as reader:
            text = reader.read()
        return self.matchlines(pattern, text)

    @cell_magic
    def imatchlines(self, line, cell):
        '''Perform regular expression matching on the contents of the cell,
        using an interactively provided regular expression.

        Usage: %%imatchlines

        There are several options you can interactively specify:

        ignore_case: Perform case-insensitive matching;
            expressions like [A-Z] will match lowercase letters, too.

        multiline: When specified, the pattern character '^' matches
            at the beginning of the string and at the beginning of each
            line (immediately following each newline); and the pattern
            character '$' matches at the end of the string and at the end
            of each line (immediately preceding each newline). By default,
            '^' matches only at the beginning of the string, and '$' only
            at the end of the string and immediately before the newline
            (if any) at the end of the string.

        dotall: Make the '.' special character match any character at
            all, including a newline; without this flag, '.' will match
            anything except a newline.
        '''
        # create a widget for the pattern text box
        pattern = widgets.TextWidget(
            description="Pattern:", value='')
        display(pattern)

        # create widgets for the options
        ignore_case = widgets.CheckboxWidget(
            description="Ignore case?", value=False)
        multiline = widgets.CheckboxWidget(
            description="Match lines separately?", value=False)
        dot_all = widgets.CheckboxWidget(
            description="Dot matches all?", value=False)
        options = [ignore_case, multiline, dot_all]

        # create a container widget for the options
        container = widgets.ContainerWidget()
        display(container)
        container.remove_class('vbox')
        container.add_class('hbox')

        # we need to wrap the options in individual container widgets
        # so that we can set the padding between them
        children = []
        for opt in options:
            wrapper = widgets.ContainerWidget(children=[opt])
            wrapper.set_css({'padding-right': '2em'})
            children.append(wrapper)
        container.children = children

        # these are the widgets whose values we want to watch; the keys
        # are the keyword argument names accepted by display_regex
        watched = {
            'pattern': pattern,
            'ignore_case': ignore_case,
            'multiline': multiline,
            'dot_all': dot_all
        }

        # update the output as the values are changed
        def update(*args):
            # items() rather than iteritems() so this runs on both
            # Python 2 and Python 3
            kwargs = dict((name, widget.value)
                          for name, widget in watched.items())
            kwargs['text'] = cell
            clear_output(wait=True)
            self.display_regex(**kwargs)

        for widget in watched.values():
            widget.on_trait_change(update, 'value')

        # render once with the initial (empty) pattern
        update()

    @cell_magic
    def matchlines(self, line, cell):
        '''Perform regular expression matching on the cell contents using the
        given pattern.

        Usage: %%matchlines <pattern>
        '''
        self.display_regex(pattern=line, text=cell)

    def display_regex(self, pattern='', text='',
                      ignore_case=False, multiline=False, dot_all=False):
        '''Compile the regular expression with the specified options, search
        for matches, and format them appropriately.

        pattern: the regular expression source.
        text: the text to search.
        ignore_case, multiline, dot_all: when true, add re.IGNORECASE,
            re.MULTILINE and re.DOTALL respectively to the compile flags.

        The formatted result is displayed and also returned as an HTML
        display object.  A pattern that fails to compile is reported in
        the output instead of raising.
        '''
        # compile the regular expression, with flags
        flags = 0
        flag_strs = []
        if ignore_case:
            flags |= re.IGNORECASE
            flag_strs.append("ignore case")
        if multiline:
            flags |= re.MULTILINE
            flag_strs.append("match lines separately")
        if dot_all:
            flags |= re.DOTALL
            flag_strs.append("dot matches all")

        # the pattern is echoed into HTML, so '<', '>' and '&' in it must
        # be escaped or they would be rendered as markup
        shown_pattern = self._escape_html(pattern)

        try:
            compiled_pattern = re.compile(pattern, flags)

        # handle the case where the regular expression is invalid;
        # re.compile can raise more than re.error (e.g. ValueError for
        # incompatible inline flags, OverflowError for huge repeat
        # counts), but a bare except would also trap KeyboardInterrupt
        # and SystemExit
        except Exception:
            result_str = NOMATCH_TEMPL.format(
                self._escape_html(text)).split('\n')
            pattern_str = ERROR_TEMPL.format(shown_pattern)

        # handle the case where the regular expression is ok
        else:
            # this keeps track of the colors for alternating matches
            self.this_color, self.next_color = RegexMagic.Colors
            pattern_str = PATTERN_TEMPL.format(shown_pattern)
            result_str = self.match(compiled_pattern, text).split('\n')

        # display what options were picked
        lines = [pattern_str]
        if flag_strs:
            lines.append(OPTS_TEMPL.format(", ".join(flag_strs)))
        lines.append('')
        lines.extend(result_str)

        html_disp = HTML('<br/>'.join(lines))
        display(html_disp)
        return html_disp

    def match(self, compiled_pattern, text):
        '''Search for regular expression matches using the given compiled
        pattern.

        Returns an HTML string in which every non-empty match is wrapped
        in MATCH_TEMPL, with the background alternating between
        self.this_color and self.next_color (both must be set before
        calling), and all remaining text is wrapped in NOMATCH_TEMPL.
        The text itself is HTML-escaped.
        '''
        result = []
        # index in `text` just past the last piece already emitted
        cursor = 0

        # finditer scans the original string, so anchors, word boundaries
        # and lookbehinds see their real context; re-searching a sliced
        # copy of the text would make '^' match after every previous match
        for m in compiled_pattern.finditer(text):
            start, end = m.span()

            # a zero-length match has nothing to highlight; skip it and
            # keep looking instead of abandoning the search
            if start == end:
                continue

            # format all text between the previous match and this one
            if start > cursor:
                result.append(NOMATCH_TEMPL.format(
                    self._escape_html(text[cursor:start])))

            # format the current match
            result.append(MATCH_TEMPL.format(
                self.this_color, self._escape_html(text[start:end])))

            # alternate the color for the next match
            self.this_color, self.next_color = \
                self.next_color, self.this_color
            cursor = end

        # format whatever follows the last match
        if cursor < len(text):
            result.append(NOMATCH_TEMPL.format(
                self._escape_html(text[cursor:])))

        return ''.join(result)
def load_ipython_extension(ipython):
    '''Extension entry point: register the RegexMagic magics on the given
    IPython shell, then emit a Javascript snippet that sets the notebook's
    cell-magic highlighting rule for the match magics.
    '''
    ipython.register_magics(RegexMagic)

    # display cells as text, not python
    highlight_js = "IPython.config.cell_magic_highlight.magic_text = {'reg': [/^%{1,3}i?match/]}"
    display(Javascript(highlight_js))