In [21]:
import requests
import requests_cache
import numpy as np
import pandas as pd
import lxml.html as lxl
import nltk
import re
from nltk.corpus import stopwords
from datetime import datetime

In [22]:
requests_cache.install_cache('demo_cache')

In [23]:
def urls_scraping(base_url, search_term, pages, location=''):
    urls = []
    
    for page in range(pages):
        try:
            response = requests.get(base_url, params={'searchterms': search_term,
                                                      'searchlocation': location,
                                                      'page': page+1})
            response.raise_for_status()
        except:
            break
    
        root = lxl.fromstring(response.content)
        url = ['https://www.cybercoders.com' + link for link in root.xpath('//div[@class="job-title"]/a/@href')]
        urls += url
    
    return urls
    
    
def page_scraping(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except:
        return
    
    root = lxl.fromstring(response.content)
    # extract title
    title = root.xpath('//div[@class="job-title"]/h1/text()')[0].strip()     
    
    # extract city and location of that posting.
    location = [i.strip() for i in root.xpath('//div[@class="location"]/span/text()')[0].split(',')]
    city = location[0]
    state = location[1]
    
    # extract job type and salary(to be done)
    try:
        salary = [int(i.replace('k', '000')) for i in re.findall(r'\d+k',root.xpath('//div[@class="wage"]/span/text()')[0] ) ] 
        if salary == []:
            salary_lower = 'NA'
            salary_upper = 'NA'
        else:
            salary_lower = salary[0]
            salary_upper = salary[1]
    except:
        salary_lower = 'Glitch'
        salary_upper = 'Glitch'
    # extract preferred skills(string type)
    skill_ls = root.xpath('//div[@class="skills"]/ul[@class="skill-list"]                         \
           /li[@class="skill-item"]/a/span[@class="skill-name"]/text()')
    preferred_skill = ', '.join([skill.strip() for skill in skill_ls])
    
    # extract 'what you need for this position'
    need_for_position = root.xpath('//div[@class="section-data section-data-title" and @data-section="7"]/text()')
    need_for_position = ' '.join(need_for_position)
    #extract job id
    job_id = root.xpath('//div[@class="job-id"]/text()')[0].strip()
    job_id = re.sub('.*: ','', job_id)  
    
    #extract post date
    post_date = root.xpath('//div[@class="mobile-hide posted-today posted-text" \
    or @class= "mobile-hide posted posted-text"]/span/text()')[0].strip()
 
    
    if post_date == 'Posted Today':
        post_date = datetime.today().strftime('%m/%d/%Y')
    else:
        post_date = re.sub(r'^Posted ([0-9]{2}/[0-9]{2}/[0-9]{4})', r'\1', post_date)
    
    return pd.Series({'city':city, 'state': state, 'salary_lower': salary_lower, 'salary_upper': salary_upper, 'preferred_skill':preferred_skill, 'need_for_position': need_for_position,
           'job_id': job_id, 'post_date': post_date})

def scraping(base_url, search_term, pages, location=''):
    urls = urls_scraping(base_url, search_term, pages, location='')
    scraping_data = [page_scraping(link) for link in urls]
    #scraping_data = []
    #for link in urls:
    #    print link
    #    scraping_data.append(page_scraping(link))
    return pd.concat(scraping_data, axis=1).T

In [24]:
Software_Engineer = scraping('https://www.cybercoders.com/search/', 'Software Engineer', 6)

In [25]:
Software_Engineer.head()
Software_Engineer.to_csv('Software_EngineerTest.csv',encoding="utf-8")

Just for debugging

In [12]:
response = requests.get('https://www.cybercoders.com/data-scientist-job-267769')
response.raise_for_status()
root = lxl.fromstring(response.content)

In [13]:
root.xpath('//div[@class="mobile-hide posted-today posted-text"]/span/text()')[0]

IndexError: list index out of range

In [None]:
a = root.xpath('//div[@class="mobile-hide posted-today posted-text" or @class= "mobile-hide posted posted-text"]/span/text()')[0].strip()

In [None]:
a

In [26]:
#pd.concat([pd.Series(a), pd.Series(b), pd.Series(a)], axis=1).T
#urls_scraping(base_url, search_term, pages, location='')
Software_Engineer.head(40)

Unnamed: 0,city,state,salary_lower,salary_upper,preferred_skill,need_for_position,job_id,post_date
0,Ann Arbor,MI • Houston,110000,180000,".NET, C#, REACT, React.JS, Angular.js, Senior ...",- .NET - C# \n- REACT \n- React.JS \n- Angular...,AG17-1674760,02/14/2022
1,San Francisco,CA • Palo Alto,140000,190000,"React.JS, Node.js, Senior Software Engineer, S...",- React.JS - Node.js \n- Senior Software Engin...,AG17-1674756,02/14/2022
2,Seattle,WA • New York,180000,225000,"Ruby On Rails, ROR, REACT, Senior Software Dev...",At least 10 years experience and B.S. in C.S. ...,AG17-1671934,01/27/2022
3,Boston,MA,100000,140000,"Software Engineer, Ruby On Rails, REACT",- 3+ years developing software in commercial e...,LP10-1660041,12/07/2021
4,Santa Barbara,CA,100000,130000,"Software Engineer, Ruby On Rails, REACT",- 3+ years developing software in commercial e...,LP10-1655928,11/03/2021
5,Lone Tree,CO,120000,175000,"Software Engineer, C++, C#",- 3+ Years Experience as a Software Engineer -...,AM20-1667110,12/21/2021
6,Orange,CA,160000,200000,"C++, Java, TCP/IP, Cisco, securex, ForeScout, ...",Required: -7+ Years of development experience ...,DF7-1672354,01/31/2022
7,San Jose,CA,160000,200000,"C++, Java, TCP/IP, Cisco, securex, ForeScout, ...",Required: -7+ Years of development experience ...,DF7-1672362,02/04/2022
8,San Francisco,CA,160000,200000,"C++, Java, TCP/IP, Cisco, securex, ForeScout, ...",Required: -7+ Years of development experience ...,DF7-1672364,01/31/2022
9,Bozeman,MT,110000,130000,"Devops, Engineer, Software, Technologies",5-10+ years of experience Strong interpersona...,VP5-1671111,01/29/2022
