Skip to content

Commit

Permalink
fix: check for empty or malformed urls
Browse files Browse the repository at this point in the history
  • Loading branch information
matt-manes committed Jul 9, 2024
1 parent 0710b3b commit 2da57d9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
35 changes: 26 additions & 9 deletions lib/crawlers/company_list_crawler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

module Crawlers
class CompanyListCrawler
OUTPUT_PATH = File.join(Rails.root, "storage/csv/crawl_list_output_#{Process.clock_gettime(Process::CLOCK_MONOTONIC).to_i}.csv")
OUTPUT_PATH = File.join(Rails.root, "storage/csv/crawl_companies_output_#{Time.now.strftime('%d_%m_%Y_%k_%M')}.csv")

private

Expand All @@ -22,6 +22,16 @@ def dump_result(result_row)

public

# Check if the url has valid syntax
#
# @param url [String]
#
# @return [TrueClass, FalseClass]
# Check if the url has valid http(s) syntax.
#
# The pattern is anchored with \A and \z so the WHOLE string must be a
# url; previously an unanchored match let any string that merely
# contained a url-like substring (e.g. "junk https://a.com junk") pass.
# Empty strings and nil return false.
#
# @param url [String, nil]
#
# @return [TrueClass, FalseClass]
def valid_url?(url)
  url_regex = %r{\Ahttps?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{2,256}\.[a-z]{2,6}[-a-zA-Z0-9@:%_+.~#?&/=]*\z}
  url_regex.match?(url)
end

# Crawl company websites from the list for ats boards.
#
# `starting_offset` is 0-indexed, so if looking at the csv file,
Expand All @@ -31,20 +41,27 @@ def dump_result(result_row)
#
# @param number_of_crawls [Integer]
def crawl_list(starting_offset = 0, number_of_crawls = nil)
# Adjustable crawl parameters ==========================================
max_crawl = 50
max_time = 10
max_hits = 1
# ======================================================================
company_list_path = File.join(Rails.root, "storage/csv/companies.csv")
FileUtils.touch(OUTPUT_PATH)

endex = number_of_crawls.nil? ? nil : starting_offset + number_of_crawls - 1
data = CSV.parse(File.read(company_list_path), headers: true)[starting_offset..endex]
max_crawl = 50
max_time = 10
max_hits = 1
data.each do |row|
crawler = Crawlers::CompanyCrawler.new(row["Website"])
crawler.set_limits(max_crawl, max_time, max_hits)
results = crawler.crawl
hits = results.join("|")
puts "Found #{results.length} hits."
if valid_url?(row["Website"])
crawler = Crawlers::CompanyCrawler.new(row["Website"])
crawler.set_limits(max_crawl, max_time, max_hits)
results = crawler.crawl
hits = results.empty? ? nil : results.join("|")
puts "Found #{results.length} hits."
else
puts "`#{row['Website']}` is not a valid url."
hits = nil
end
row<<{ "Hits"=>hits }
dump_result(row)
end
Expand Down
2 changes: 1 addition & 1 deletion lib/tasks/crawl_companies.rake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace :crawl_companies do
desc "Crawl companies list for ats boards."
task crawl: :environment do
Crawlers::CompanyListCrawler.new.crawl_list
Crawlers::CompanyListCrawler.new.crawl_list(0, nil)
end
end

0 comments on commit 2da57d9

Please sign in to comment.