In [3]:
import math
from bs4 import BeautifulSoup
import requests
import csv
import time
import random

# Function to fetch job description, type, and misc details from the respective job link
def fetch_job_details(job_url, headers, timeout=30):
    """Fetch the description, job type, and misc text from a job detail page.

    Args:
        job_url: URL of the job's detail page.
        headers: HTTP headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        A ``(description, job_type, misc)`` tuple of strings. The description
        is the text of the ``con_11``, ``con_4``, ``con_12``, ``con_5`` and
        ``con_15`` divs joined by newlines; job type comes from ``con_8`` and
        misc from ``con_7``. Placeholder strings are returned for anything
        that is missing, for a non-200 response, and for any error raised
        while fetching or parsing.
    """
    try:
        response = requests.get(job_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to load job details from {job_url}, Status code: {response.status_code}")
            return "Failed to load job details", "Failed to load job type", "Failed to load misc"

        job_soup = BeautifulSoup(response.content, 'html.parser')

        # Collect the non-empty description sections in their fixed order and
        # join once instead of growing a string piece by piece.
        section_texts = (
            _div_text(job_soup, section)
            for section in ('con_11', 'con_4', 'con_12', 'con_5', 'con_15')
        )
        description = "\n".join(text for text in section_texts if text).strip()
        if not description:
            # Covers both "no section present" and "sections present but empty".
            description = "No detailed description available"

        job_type = _div_text(job_soup, "con_8")
        if job_type is None:
            job_type = "No job type available"

        misc = _div_text(job_soup, "con_7")
        if misc is None:
            misc = "No additional information available"

        return description, job_type, misc
    except Exception as e:
        # Deliberate best-effort boundary: one broken detail page must not
        # abort the scrape of the remaining jobs, so report and carry on.
        print(f"Error fetching job details from {job_url}: {e}")
        return f"Error fetching description: {e}", "Error fetching job type", "Error fetching misc"


def _div_text(soup, div_id):
    """Return the stripped text of the div with id *div_id*, or None if absent."""
    node = soup.find("div", id=div_id)
    return node.get_text(strip=True) if node else None

# Function to fetch all jobs from a single page
def fetch_jobs_from_page(page_url, headers, timeout=30):
    """Scrape every job listed on one result page, including its detail page.

    Args:
        page_url: URL of the listing page to scrape.
        headers: HTTP headers passed to ``requests.get`` and on to
            ``fetch_job_details``.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``description``,
        ``type`` and ``misc``, one per usable listing. An empty list is
        returned when the page cannot be fetched or does not answer with
        status 200.
    """
    try:
        response = requests.get(page_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Treat a network failure like a failed page so the caller keeps the
        # jobs it has already collected instead of crashing mid-run.
        print(f"Error fetching page {page_url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to load page: {page_url}, Status code: {response.status_code}")
        return []

    page_soup = BeautifulSoup(response.content, 'html.parser')

    # Listing rows alternate between the classes line_0 and line_1.
    job_listings = page_soup.find_all("div", class_=["line_0", "line_1"])

    jobs = []
    for job in job_listings:
        title_tag = job.find("span", class_="headerlink stellenlink")

        # The detail URL is the first single-quoted argument of the row's
        # onclick handler; splitting on "'" puts it at index 1.
        onclick_parts = job.get('onclick', '').split("'")

        if title_tag is None or len(onclick_parts) < 2:
            # A row without a title or a quoted link cannot be scraped; skip
            # it rather than letting one malformed row abort the whole page.
            print(f"Skipping malformed job listing on {page_url}")
            continue

        job_title = title_tag.get_text(strip=True)
        job_link = onclick_parts[1]

        # Fetch job details (description, type, misc)
        job_description, job_type, job_misc = fetch_job_details(job_link, headers)

        jobs.append({
            'title': job_title,
            'link': job_link,
            'description': job_description,
            'type': job_type,
            'misc': job_misc
        })

    # Report what was actually collected, not just what was found.
    print(f"Scraped {len(jobs)} jobs from {page_url}")
    return jobs

# Base URL of the paginated job listing; the page number is appended to it.
base_url = "https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page="

# Pagination: the page count is derived from the total number of postings.
# NOTE(review): total_jobs is hard-coded; presumably it was read off the site
# when this was written - confirm and update before re-running.
total_jobs = 599
jobs_per_page = 25
total_pages = math.ceil(total_jobs / jobs_per_page)  # 24 pages

# Set custom headers to simulate browser behavior
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Accumulates the job dicts returned for every page.
all_jobs = []

# Loop through all the pages and fetch jobs from each page
for page in range(1, total_pages + 1):  # Pages 1 to 24
    page_url = f"{base_url}{page}"
    print(f"Scraping page {page} of {total_pages}: {page_url}")

    # Fetch jobs from the current page
    jobs_on_page = fetch_jobs_from_page(page_url, headers)

    # Add the jobs from the current page to the all_jobs list
    all_jobs.extend(jobs_on_page)

    # Pause a random 5-10 seconds between pages. There is nothing left to
    # throttle after the final page, so skip the wait (and the message) there.
    if page < total_pages:
        delay = random.uniform(5, 10)
        print(f"Waiting for {delay:.2f} seconds before scraping the next page...")
        time.sleep(delay)

# Print the total number of jobs scraped
print(f"Total jobs scraped: {len(all_jobs)}")

# Define the CSV file path
csv_file_path = 'job_listings.csv'

# 


Scraping page 1 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=1
Scraped 25 jobs from https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=1
Waiting for 7.89 seconds before scraping the next page...
Scraping page 2 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=2
Scraped 25 jobs from https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=2
Waiting for 6.91 seconds before scraping the next page...
Scraping page 3 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=3
Scraped 25 jobs from https://dlr.concludis.de/pr

Scraping page 22 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=22
Scraped 25 jobs from https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=22
Waiting for 7.64 seconds before scraping the next page...
Scraping page 23 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=23
Scraped 25 jobs from https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=23
Waiting for 8.91 seconds before scraping the next page...
Scraping page 24 of 24: https://dlr.concludis.de/prj/lst/a181a603769c1f98ad927e7367c7aa51/GesamtlisteOffenePositionen.htm?b=0&boerse=&stellg1=&stellg2=&stellort=&page=24
Scraped 21 jobs from https://dlr.conclud