# Web Scraping Wuzzuf

> In this notebook you will scrape Wuzzuf for a job title of your choice and collect the latest
> information about its postings.
> 
> To get the full experience, check out the __[GitHub repo](https://github.com/AbdulrahmanYaseen/web-scrape-wuzzuf-for-a-job-title-and-get-a-dashboard-out-of-the-data)__, download the Power BI file, and build a dashboard from your scraped data!
>
> In the last cell, make sure that all of your lists have the same length.

### Enter required data here:

In [1]:
# Job title to search for; it is handed to url_gen together with `filters`.
job = "DaTa analyst"

# Search filters handed to url_gen. An empty list leaves that filter unset here.
filters = dict(
    career_level=['entry level'],
    job_types=[],
    post_date='24 hours',
)

# Path of the ChromeDriver executable given to selenium.
driver_path = "comp/chromedriver.exe"
# Path of the CSV file the scraped data is written to.
file_path = "comp/Data.csv"

### Run all the following cells:

In [2]:
# import needed libraries
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from url_builder import url_gen
import lxml

In [3]:
# Build the search URL from the job title and filters (url_gen comes from the
# local url_builder module). The bare `url` on the last line displays it.
url = url_gen(job, filters)
url

'https://wuzzuf.net/search/jobs/?a=navbl&filters%5Bcareer_level%5D%5B0%5D=Entry%20Level&filters%5Bjob_types%5D%5B0%5D=full_time&filters%5Bjob_types%5D%5B1%5D=work_from_home&filters%5Bpost_date%5D%5B0%5D=within_24_hours&q=data%20analyst&start=0'

In [4]:
# Create the result lists (re-created on every run of this cell).
# These hold the data available on the search-results pages themselves,
# without opening each job's link.
job_title = []
company_name = []
location = []
links = []
date = []

# These are filled later from each individual job page.
Salary = []
Experience_Needed = []
Career_Level = []
Education_Level = []
Job_Categories = []
No_of_applicants = []
Job_Requirements = []

# Use requests & BeautifulSoup to scrape the search-results pages.
JOBS_PER_PAGE = 15      # page size assumed by the stop condition below
REQUEST_TIMEOUT = 30    # seconds; without a timeout requests.get can hang forever

# The generated URL ends with the page index ("...&start=0"). Strip the trailing
# digits once so that any page number (including multi-digit ones) can be
# appended without corrupting the URL, and without modifying `url` itself.
base_url = url.rstrip("0123456789")
page_num = 0

while True:
    try:
        # Fetch and parse the current results page.
        page_url = base_url + str(page_num)
        result = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(result.content, "lxml")

        # The first <strong> on the page is read as the total number of jobs.
        page_limit = int(soup.find("strong").text)

        # Stop once we have gone past the last page of results.
        if page_num > page_limit // JOBS_PER_PAGE:
            print("Pages ended")
            break

        # Collect the elements holding the needed info.
        job_titles = soup.find_all("h2", {"class": "css-m604qf"})
        company_names = soup.find_all("a", {"class": "css-17s97q8"})
        location_names = soup.find_all("span", {"class": "css-5wys0k"})

        # The posting-date div uses one of two classes. Matching both classes in
        # a single search returns the divs in document order, so the i-th date
        # belongs to the i-th job (presumably every job card has exactly one of
        # the two divs -- verify against the page markup).
        posted = soup.find_all("div", {"class": ["css-4c4ojb", "css-do6t5g"]})

        # Parse the whole page first and only then append to the result lists:
        # if any element is missing the exception fires before anything is
        # stored, so the lists can never end up with different lengths.
        page_rows = []
        for i in range(len(job_titles)):
            page_rows.append((
                job_titles[i].text,
                job_titles[i].find("a").attrs['href'],
                company_names[i].text,
                location_names[i].text,
                posted[i].text.replace("_", "").strip(),
            ))

        for title_text, href, company_text, location_text, date_text in page_rows:
            job_title.append(title_text)
            links.append(href)
            company_name.append(company_text)
            location.append(location_text)
            date.append(date_text)

        page_num += 1
        print("Page Switched")

    except Exception as e:
        # Any network or parsing problem ends the scrape; what was collected
        # from earlier pages is kept.
        print(e)
        break

https://wuzzuf.net/search/jobs/?a=navbl&filters%5Bcareer_level%5D%5B0%5D=Entry%20Level&filters%5Bjob_types%5D%5B0%5D=full_time&filters%5Bjob_types%5D%5B1%5D=work_from_home&filters%5Bpost_date%5D%5B0%5D=within_24_hours&q=data%20analyst&start=0
Page Switched
https://wuzzuf.net/search/jobs/?a=navbl&filters%5Bcareer_level%5D%5B0%5D=Entry%20Level&filters%5Bjob_types%5D%5B0%5D=full_time&filters%5Bjob_types%5D%5B1%5D=work_from_home&filters%5Bpost_date%5D%5B0%5D=within_24_hours&q=data%20analyst&start=1
Pages ended


In [None]:
# Use selenium to open each job link and read the per-job details.
# NOTE(review): recent Selenium releases expect the driver path to be supplied
# through a Service object -- confirm against the installed Selenium version.
driver = webdriver.Chrome(driver_path)

NOT_FOUND = "not found"

# (result list, XPath) for fields that consist of a single element on the page.
single_value_xpaths = [
    (No_of_applicants, '//strong[@class = "css-u1gwks"]'),
    (Experience_Needed, '//div[@class="css-rcl8e5"][1]/span[2]/span'),
    (Career_Level, '//div[@class="css-rcl8e5"][2]/span[2]/span'),
    (Education_Level, '//div[@class="css-rcl8e5"][3]/span[2]/span'),
    (Salary, '//div[@class="css-rcl8e5"][4]/span[2]/span'),
]

# (result list, XPath) for fields made of several elements, joined with " | ".
multi_value_xpaths = [
    (Job_Categories, '//div[@class="css-13sf2ik"]/ul/li/a/span'),
    (Job_Requirements, '//div[@class = "css-1t5f0fr"]/ul/li'),
]


def read_single(xpath):
    """Return the text of the first element matching `xpath`, or NOT_FOUND on any error."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        return NOT_FOUND


def read_joined(xpath):
    """Return the texts of all elements matching `xpath` joined by " | ", or NOT_FOUND on any error."""
    try:
        return " | ".join(element.text for element in driver.find_elements(By.XPATH, xpath))
    except Exception:
        return NOT_FOUND


# Empty the per-job lists in place so that re-running this cell does not append
# the same jobs a second time.
for target, _ in single_value_xpaths + multi_value_xpaths:
    target.clear()

try:
    for link in links:
        try:
            driver.get(link)
            page_loaded = True
        except Exception as e:
            print(e)
            page_loaded = False

        # Gather every field of this job first, then append them together, so
        # each list receives exactly one value per link and all stay aligned.
        row = []
        for target, xpath in single_value_xpaths:
            row.append((target, read_single(xpath) if page_loaded else NOT_FOUND))
        for target, xpath in multi_value_xpaths:
            row.append((target, read_joined(xpath) if page_loaded else NOT_FOUND))

        for target, value in row:
            target.append(value)

        if any(value == NOT_FOUND for _, value in row):
            print("one job data not found")
finally:
    # Always close the browser, even when the loop is interrupted.
    driver.quit()

In [None]:
# Create the CSV file: one row per job, one column per scraped field.
# Header names and their data lists are kept together so they cannot drift apart.
columns = {
    'job_title': job_title,
    'company_name': company_name,
    'location': location,
    'date': date,
    'Salary': Salary,
    'Experience_Needed': Experience_Needed,
    'Career_Level': Career_Level,
    'Education_Level': Education_Level,
    'Job_Categories': Job_Categories,
    'links': links,
    'No_of_applicants': No_of_applicants,
    'Job_Requirements': Job_Requirements,
}
file_list = list(columns.values())

# zip_longest pads shorter lists with None, which csv.writer writes as an empty cell.
exported = zip_longest(*file_list)

# newline='' hands line-ending control to the csv module; without it every row
# is followed by a blank line on Windows.
with open(file_path, "w", encoding='utf-8', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(list(columns))
    wr.writerows(exported)

In [None]:
# Sanity check: print the length of every column list, one per line.
# All twelve numbers should be identical.
file_list = [
    job_title, company_name, location, date,
    Salary, Experience_Needed, Career_Level, Education_Level,
    Job_Categories, links, No_of_applicants, Job_Requirements,
]
print("\n".join(str(len(column)) for column in file_list))