This repository was archived by the owner on Jun 8, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
Copy pathwikipedia.coffee
93 lines (72 loc) · 2.52 KB
/
wikipedia.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Description:
# None
#
# Dependencies:
# "htmlparser": "1.7.6"
# "soupselect": "0.2.0"
# "underscore": "1.3.3"
# "underscore.string": "2.3.0"
#
# Configuration:
# None
#
# Commands:
# hubot wiki me <query> - Searches for <query> on Wikipedia.
#
# Author:
# h3h
_ = require("underscore")
_s = require("underscore.string")
Select = require("soupselect").select
HTMLParser = require "htmlparser"
module.exports = (robot) ->
robot.respond /(wiki)( me)? (.*)/i, (msg) ->
wikiMe robot, msg.match[3], (text, url) ->
msg.send text
msg.send url if url
wikiMe = (robot, query, cb) ->
articleURL = makeArticleURL(makeTitleFromQuery(query))
robot.http(articleURL)
.header('User-Agent', 'Hubot Wikipedia Script')
.get() (err, res, body) ->
return cb "Sorry, the tubes are broken." if err
if res.statusCode is 301
return cb res.headers.location
if /does not have an article/.test body
return cb "Wikipedia has no idea what you're talking about."
paragraphs = parseHTML(body, "p")
bodyText = findBestParagraph(paragraphs) or "Have a look for yourself:"
cb bodyText, articleURL
# Utility Methods
childrenOfType = (root, nodeType) ->
return [root] if root?.type is nodeType
if root?.children?.length > 0
return (childrenOfType(child, nodeType) for child in root.children)
[]
findBestParagraph = (paragraphs) ->
return null if paragraphs.length is 0
childs = _.flatten childrenOfType(paragraphs[0], 'text')
text = (textNode.data for textNode in childs).join ''
# remove parentheticals (even nested ones)
text = text.replace(/\s*\([^()]*?\)/g, '').replace(/\s*\([^()]*?\)/g, '')
text = text.replace(/\s{2,}/g, ' ') # squash whitespace
text = text.replace(/\[[\d\s]+\]/g, '') # remove citations
text = _s.unescapeHTML(text) # get rid of nasties
# if non-letters are the majority in the paragraph, skip it
if text.replace(/[^a-zA-Z]/g, '').length < 35
findBestParagraph(paragraphs.slice(1))
else
text
makeArticleURL = (title) ->
"https://en.wikipedia.org/wiki/#{encodeURIComponent(title)}"
makeTitleFromQuery = (query) ->
strCapitalize(_s.trim(query).replace(/[ ]/g, '_'))
parseHTML = (html, selector) ->
handler = new HTMLParser.DefaultHandler((() ->),
ignoreWhitespace: true
)
parser = new HTMLParser.Parser handler
parser.parseComplete html
Select handler.dom, selector
strCapitalize = (str) ->
return str.charAt(0).toUpperCase() + str.substring(1);