Skip to content

Commit

Permalink
Improve invalid link list to include some more common unwanted links
Browse files Browse the repository at this point in the history
  • Loading branch information
Caged committed Aug 25, 2009
1 parent c4e377a commit ebc3bf4
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 4 deletions.
2 changes: 1 addition & 1 deletion dragnet.gemspec
Expand Up @@ -14,7 +14,7 @@ Gem::Specification.new do |s|
s.email = %q{justin@labratrevenge.com}
s.extra_rdoc_files = [
"LICENSE",
"README.rdoc"
"README.markdown"
]
s.files = [
".document",
Expand Down
2 changes: 2 additions & 0 deletions lib/dragnet.rb
Expand Up @@ -7,6 +7,8 @@
require 'uri'
require 'mofo'

$:.unshift(File.dirname(__FILE__))

require 'dragnet/dragger'

#Dragnet::Dragger::DEBUG = true
Expand Down
54 changes: 51 additions & 3 deletions lib/dragnet/dragger.rb
Expand Up @@ -90,6 +90,51 @@ module
param
)

# Hosts whose links are treated as unwanted and dropped during link
# extraction. Entries must be lowercase: the extractor downcases the URL
# host before testing membership with +include?+. The match is exact, so a
# host such as "www.digg.com" is not covered by "digg.com".
# Frozen so this shared constant cannot be mutated at runtime.
INVALID_LINK_HOSTS = [
  'del.icio.us',
  'digg.com',
  'technorati.com',
  'stumbleupon.com'
].freeze

# Link texts that are treated as unwanted and dropped during link
# extraction. The extractor strips and downcases the link text (and
# replaces runs of newlines with a single space) before testing membership
# with +include?+, so every entry here MUST be lowercase or it can never
# match. Frozen so this shared constant cannot be mutated at runtime.
INVALID_LINK_TEXT = [
  'email',
  'e-mail',
  'email article',
  'reddit',
  'retweet',
  'digg',
  'digg it',
  'del.icio.us',
  'technorati',
  'stumble',
  'stumbleupon', # was 'stumbleUpon', which could never equal downcased text
  'myspace',
  'report abuse',
  'print',
  'print article',
  'printable version',
  'permalink',
  'trackbacks',
  'trackback',
  'read more',
  'facebook',
  'yahoo buzz!',
  'yahoo! buzz',
  'mixx',
  'terms of service',
  'your ad here',
  'sphere it!',
  'share this',
  'share',
  '« previous',
  'next comments »',
  'links to this article',
  'my yahoo!',
  'google reader',
  'rss'
].freeze

CONTROL_SCORE = 20

DEBUG = false
Expand Down Expand Up @@ -144,7 +189,7 @@ def parse!
# let's try another method
empty = paragraphs.collect {|c| c.content.strip}.join('').empty?
if paragraphs.size == 0 || empty
paragraphs = @doc.csss('div').to_a
paragraphs = @doc.css('div').to_a
end

paragraphs + @doc.css('blockquote').to_a
Expand Down Expand Up @@ -276,8 +321,11 @@ def extract_links_from_content(content)
if (href && !href.nil?) || (href && !href.empty?)
begin
url = URI.parse(href)
unless url.host.nil?
links << {:text => link.content, :href => href}
text = link.content.strip.downcase.gsub(/\n+/, ' ')
unless url.host.nil? ||
INVALID_LINK_HOSTS.include?(url.host.downcase.to_s) ||
INVALID_LINK_TEXT.include?(text) || text.empty?
links << {:text => link.content.strip, :href => href}
end
rescue

Expand Down

0 comments on commit ebc3bf4

Please sign in to comment.