# In [None]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys 
import pandas as pd
from tqdm import tqdm
import time
import requests
from bs4 import BeautifulSoup

# Landing page of the job board; the keyword search is started from here.
url= "https://www.hellowork.com/fr-fr/"

# Start a Chrome session; ChromeDriverManager().install() supplies the
# chromedriver path handed to the Selenium service.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Navigate to the page
driver.get(url)

# Select the search bar (the element whose name attribute is "k")
search_input = driver.find_element(by=By.NAME,value="k")

# Type the search keyword
search_input.send_keys("data")

# Fixed pause between typing and submitting
time.sleep(2)

# Press Enter to submit the search
search_input.send_keys(Keys.RETURN)

# Fixed pause after submitting (presumably to let the results page load)
time.sleep(5)

# XPath of the cookie banner's "Continue without accepting" button
continue_button_xpath = '//button[@id="hw-cc-notice-continue-without-accepting-btn"]'

# Wait up to 10 s for the button to be clickable; until() raises if it never is
continue_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, continue_button_xpath))
    )

# Click on the "Continue without accepting" button
continue_button.click()

# XPath of the pagination "next" item (an <li> whose class is exactly "next")
xpath_expression = '//li[@class="next"]'

# Wait up to 10 s for the "next" item to be clickable
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, xpath_expression))
)

# Click on the "next" item.
# NOTE(review): this click happens before get_all_link collects any link, so
# the first results page appears to be skipped -- confirm this is intended.
element.click()
time.sleep(5)

def page_link():
    """Return the href of every offer link on the page currently shown.

    Uses the module-level ``driver``. Waits at most 1 second for the offer
    anchors to be present in the DOM; ``WebDriverWait.until`` raises if none
    appear within that time.

    Returns:
        list: the ``href`` attribute of each matched anchor, in DOM order.
    """
    # Anchor inside the title (<h3>) of each offer card.
    locator = (By.CSS_SELECTOR, '.offer--content .offer--maininfo h3 a')

    anchors = WebDriverWait(driver, 1).until(
        EC.presence_of_all_elements_located(locator)
    )

    return [anchor.get_attribute('href') for anchor in anchors]

def change_page():
    """Click the pagination "next" item of the page currently shown.

    Uses the module-level ``driver``. Waits up to 10 seconds for the item to
    become clickable (``WebDriverWait.until`` raises otherwise), clicks it and
    then sleeps for 2 seconds.
    """
    next_locator = (By.XPATH, '//li[@class="next"]')

    waiter = WebDriverWait(driver, 10)
    next_item = waiter.until(EC.element_to_be_clickable(next_locator))
    next_item.click()

    # Fixed pause after the click (presumably to let the next page load).
    time.sleep(2)

def get_all_link(nb_page):
    """Collect offer links while paging through the search results.

    Starting from the page currently shown by the module-level ``driver``,
    the links of each page are gathered with ``page_link`` and the browser is
    then moved forward with ``change_page``.

    Args:
        nb_page: Upper bound of the page counter. The loop runs over
            ``range(1, nb_page)``, so at most ``nb_page - 1`` pages are
            visited.

    Returns:
        list: the links of every visited page, in visit order. If the "next"
        item cannot be clicked (for example because there are fewer result
        pages than requested), the links collected so far are returned
        instead of being lost to the exception.
    """
    # Local import: keeps this function independent of the file's import header.
    from selenium.common.exceptions import TimeoutException

    links = []

    for _ in tqdm(range(1, nb_page), desc='Getting link on page'):
        links.extend(page_link())

        # Move to the next page; stop cleanly when there is none to click.
        try:
            change_page()
        except TimeoutException:
            break

    return links

# Crawl the result pages. The browser is closed in ``finally`` so that no
# Chrome session is left running when the crawl raises.
try:
    link_data = get_all_link(50)
finally:
    driver.quit()

# Browser-like User-Agent sent by get_job with each offer-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is None."""
    return tag.text.strip() if tag is not None else None


def _token_or_none(tokens, index):
    """Return ``tokens[index]``, or None when the list is too short."""
    return tokens[index] if len(tokens) > index else None


def get_job(url):
    """Download one offer page and extract its main fields.

    The page is fetched with ``requests`` using the module-level ``headers``
    and parsed with BeautifulSoup. Every field whose element is not found in
    the page is returned as ``None`` instead of raising.

    Args:
        url: Address of the offer page.

    Returns:
        tuple: ``(job_title, company_name, salary, location, description,
        date, job_type, ref)``.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Job title
    job_title = _text_or_none(
        soup.find('span', {'class': 'tw-block tw-typo-xl sm:tw-typo-3xl tw-mb-2', 'data-cy': 'jobTitle'})
    )

    # Company name
    company_name = _text_or_none(
        soup.find('span', {'class': 'tw-contents tw-typo-m tw-text-grey'})
    )

    # Location (first matching span) and job type (second matching span);
    # both stay None unless at least two spans are found.
    spans = soup.find_all('span', {'class': 'tw-inline-flex tw-typo-m tw-text-grey'})
    if len(spans) >= 2:
        location = spans[0].text.strip()
        job_type = spans[1].text.strip()
    else:
        location = None
        job_type = None

    # Salary, with narrow no-break spaces (U+202F) removed
    salary = _text_or_none(soup.find('li', {'class': 'tw-tag-attractive-s tw-readonly'}))
    if salary is not None:
        salary = salary.replace('\u202f', '')

    # Publication date and advertisement reference share one span: the 3rd
    # space-separated token is taken as the date and the part of the 7th
    # token before the first '/' as the reference.
    # NOTE(review): relies on the site's current wording of that line -- confirm.
    date = None
    ref = None
    meta_span = soup.find('span', {'class': 'tw-block tw-typo-xs tw-text-grey tw-mt-3 tw-break-words'})
    if meta_span is not None:
        tokens = meta_span.text.strip().split(' ')
        date = _token_or_none(tokens, 2)
        ref_token = _token_or_none(tokens, 6)
        if ref_token is not None:
            ref = ref_token.split('/')[0]

    # Job description: checked for None *before* reading its text, so an
    # offer without that paragraph no longer raises AttributeError.
    paragraph = soup.find('p', class_='tw-typo-long-m')
    description = paragraph.get_text(strip=True) if paragraph is not None else None

    return job_title, company_name, salary, location, description, date, job_type, ref
        

def get_infos(data_link):
    """Scrape every offer URL and gather the results in a DataFrame.

    Each URL is passed to ``get_job``; the returned fields are stored
    column-wise together with the URL they came from.

    Args:
        data_link: Iterable of offer page URLs.

    Returns:
        pandas.DataFrame: one row per URL, with the columns ``Job ID``,
        ``Job Title``, ``Company Name``, ``Job Type``, ``Date``, ``Salary``,
        ``Location``, ``Job Description`` and ``lien`` (the offer URL).
    """
    columns = {
        'Job ID': [],
        'Job Title': [],
        'Company Name': [],
        'Job Type': [],
        'Date': [],
        'Salary': [],
        'Location': [],
        'Job Description': [],
        'lien': [],
    }

    for url in tqdm(data_link, desc='processing link'):
        # Order of the tuple returned by get_job.
        (job_title, company_name, salary, location,
         description, date, job_type, ref) = get_job(url)

        columns['Job ID'].append(ref)
        columns['Job Title'].append(job_title)
        columns['Company Name'].append(company_name)
        columns['Job Type'].append(job_type)
        columns['Date'].append(date)
        columns['Salary'].append(salary)
        columns['Location'].append(location)
        columns['Job Description'].append(description)
        # Record the URL actually processed (not a module-level list), so the
        # column always lines up with the rows whatever input was passed.
        columns['lien'].append(url)

    return pd.DataFrame(columns)

# Scrape every collected offer link into a DataFrame.
df = get_infos(link_data)

# Write the table to disk as UTF-8 CSV, without the index column.
df.to_csv(
    r'C:\Users\komla\Desktop\Projet_Scraping\hellowork_data.csv',
    encoding='utf-8',
    index=False,
)
