From 71355e3a3225ec55164e8df38a0edfb4562a1455 Mon Sep 17 00:00:00 2001 From: Kamran Azmat Date: Tue, 25 Jun 2024 18:19:33 +0500 Subject: [PATCH 1/2] Setup Workable XML Importer & Amazon S3 Integration --- app/services/importer/xml/workable_kamran.rb | 112 +++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 app/services/importer/xml/workable_kamran.rb diff --git a/app/services/importer/xml/workable_kamran.rb b/app/services/importer/xml/workable_kamran.rb new file mode 100644 index 000000000..25a051559 --- /dev/null +++ b/app/services/importer/xml/workable_kamran.rb @@ -0,0 +1,112 @@ +module Importer + module Xml + # Docs: https://help.workable.com/hc/en-us/articles/4420464031767-Utilizing-the-XML-Job-Feed + # To use this: rake xml:workable + # NB. the Workable XML feed is sensitive to 403 Forbidden errors (too many requests) + # How this works: The feed includes urls of the format: https://apply.workable.com/j/9A5B371BA0 (i.e. without the ats_identifier) + # In order to handle this, we create a variable with all the job_posting_urls from the XML feed + # We then resolve the redirect to get the final url and build the company / job posting from the API via CreateJobFromUrl, this bypasses rate limiting problems and uses standard functionality + require 'faraday' + require 'nokogiri' + require 'aws-sdk-s3' + require 'json' + require 'logger' + class Workable < ApplicationService + # TODO: Fix this as only capturing a portion of the redirect urls at the moment. There is a wait screen page that is sometimes coming up and it may require a proxy to process. Check for the interim response code. Setup a proxy and use a different IP address to check the page. + # TODO: Add count for number of urls in XML feed and number of urls processed so that we can evaluate success rate + + # TODO: This can be made more efficient in future as lots of the job_posting_urls will already exist in the database after the first run + # TODO: What we can do is extract the ats_identifiers from the job_posting_urls and then exclude any for which the company already exists in the DB (as this will have a separate process) + + WORKABLE_XML_URL = 'https://apply.workable.com/feed/xml'.freeze + S3_BUCKET = 'your-s3-bucket'.freeze + S3_KEY = 'workable_feed.xml'.freeze + S3_REGION = 'region'.freeze + + def initialize + @ats = ApplicantTrackingSystem.find_by(name: 'Workable') + @connection = Faraday.new do |conn| + conn.request :retry, max: 5, interval: 0.05, backoff_factor: 2 + conn.adapter Faraday.default_adapter + end + @s3_client = Aws::S3::Client.new(region: S3_REGION) + end + + def import + response = @connection.get(WORKABLE_XML_URL) + if response.status == 200 + xml_file_path = store_xml_locally(response.body) + upload_to_s3(xml_file_path) + urls = parse_xml(response.body) + redirected_urls = resolve_redirects(urls) + save_redirected_urls_locally(redirected_urls) + submit_urls(redirected_urls) + else + handle_error(response) + end + end + + private + + def store_xml_locally(xml_content) + file_path = 'tmp/workable_feed.xml' + File.open(file_path, 'wb') { |file| file.write(xml_content) } + file_path + end + + def upload_to_s3(file_path) + @s3_client.put_object(bucket: S3_BUCKET, key: S3_KEY, body: File.open(file_path)) + end + + def parse_xml(xml_content) + urls = [] + Nokogiri::XML::Reader(xml_content).each do |node| + if node.name == 'url' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT + urls << node.inner_xml + end + end + urls + end + + def resolve_redirects(urls) + urls.map do |url| + response = follow_redirects(url) + response.headers['location'] || url + end + end + + def follow_redirects(url) + connection = Faraday.new(url: url) do |conn| + conn.adapter Faraday.default_adapter + end + connection.get + end + + def save_redirected_urls_locally(urls) + File.open('tmp/redirected_urls.json', 'w') do |file| + file.write(urls.to_json) + end + end + + def submit_urls(urls) + urls.each do |url| + URL::CreateJobFromUrl.call(url) + end + end + + def handle_error(response) + case response.status + when 403 + @logger.error('Too many requests. Please try again later.') + else + @logger.error("Error: #{response.status}") + end + end + + end + end + end + + # The below is for testing the redirect functionality without hitting the XML feed (comment out the above and use this) + # job_urls = ["https://apply.workable.com/j/9A5B371BA0", "https://apply.workable.com/j/2595A6EDD0", "https://apply.workable.com/j/2DA044E84E", "https://apply.workable.com/j/5ECBE14794", "https://apply.workable.com/j/F44ED9E40A", "https://apply.workable.com/j/0908074E5D", "https://apply.workable.com/j/0AA38013AC", "https://apply.workable.com/j/8D801CE251", "https://apply.workable.com/j/E91B0F3F46", "https://apply.workable.com/j/802966A156"] + \ No newline at end of file From 76b91b517f2af1759f454eb90346c5a06e4c4d6e Mon Sep 17 00:00:00 2001 From: Kamran Azmat Date: Tue, 25 Jun 2024 18:55:17 +0500 Subject: [PATCH 2/2] fix indentation --- app/services/importer/xml/workable_kamran.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/services/importer/xml/workable_kamran.rb b/app/services/importer/xml/workable_kamran.rb index 25a051559..8b68528ac 100644 --- a/app/services/importer/xml/workable_kamran.rb +++ b/app/services/importer/xml/workable_kamran.rb @@ -105,8 +105,8 @@ def handle_error(response) end end - end +end - # The below is for testing the redirect functionality without hitting the XML feed (comment out the above and use this) - # job_urls = ["https://apply.workable.com/j/9A5B371BA0", "https://apply.workable.com/j/2595A6EDD0", "https://apply.workable.com/j/2DA044E84E", "https://apply.workable.com/j/5ECBE14794", "https://apply.workable.com/j/F44ED9E40A", "https://apply.workable.com/j/0908074E5D", "https://apply.workable.com/j/0AA38013AC", "https://apply.workable.com/j/8D801CE251", "https://apply.workable.com/j/E91B0F3F46", "https://apply.workable.com/j/802966A156"] +# The below is for testing the redirect functionality without hitting the XML feed (comment out the above and use this) +# job_urls = ["https://apply.workable.com/j/9A5B371BA0", "https://apply.workable.com/j/2595A6EDD0", "https://apply.workable.com/j/2DA044E84E", "https://apply.workable.com/j/5ECBE14794", "https://apply.workable.com/j/F44ED9E40A", "https://apply.workable.com/j/0908074E5D", "https://apply.workable.com/j/0AA38013AC", "https://apply.workable.com/j/8D801CE251", "https://apply.workable.com/j/E91B0F3F46", "https://apply.workable.com/j/802966A156"] \ No newline at end of file