In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote

# Configuration
JOBS = [
    "commerce & teleconseille",
    "maintenance informatique",
    "community management",
    "frontend developement",
    "Creation du jeu",
    "marketing digital",
    "Creation du contenu"
]

LOCATIONS = [
    "Morocco",
    "Europe",
    "Middle East",
    "USA",
    "Canada"
]

MAX_RESULTS_PER_QUERY = 1000  # Approximate max LinkedIn allows
RESULTS_PER_PAGE = 25
MAX_THREADS = 5
DELAY = random.uniform(1, 3)  # Delay between requests to avoid blocking

# User agents to rotate
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
]

def get_random_headers():
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

def scrape_job_listings(job_title, location):
    job_listings = []
    seen_job_ids = set()
    
    encoded_job = quote(job_title)
    encoded_location = quote(location)
    
    for start in range(0, MAX_RESULTS_PER_QUERY, RESULTS_PER_PAGE):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/{encoded_job}-jobs?keywords={encoded_job}&location={encoded_location}&start={start}"
        
        try:
            response = requests.get(list_url, headers=get_random_headers())
            time.sleep(DELAY)
            
            if response.status_code != 200:
                print(f"Failed to fetch listings for {job_title} in {location} at start={start}")
                break
                
            soup = BeautifulSoup(response.text, "html.parser")
            jobs = soup.find_all("li")
            
            if not jobs:
                break
                
            for job in jobs:
                base_card_div = job.find("div", {"class": "base-card"})
                if not base_card_div:
                    continue
                    
                job_id = base_card_div.get("data-entity-urn", "").split(":")[-1]
                if not job_id or job_id in seen_job_ids:
                    continue
                    
                seen_job_ids.add(job_id)
                job_listings.append({
                    "job_id": job_id,
                    "job_title": job_title,
                    "search_location": location
                })
                
        except Exception as e:
            print(f"Error scraping listings for {job_title} in {location}: {str(e)}")
            continue
            
    return job_listings

def scrape_job_details(job):
    job_url = f"https://www.linkedin.com/jobs/view/{job['job_id']}"
    job_post = {
        "job_id": job["job_id"],
        "search_job_title": job["job_title"],
        "search_location": job["search_location"],
        "job_title": None,
        "company_name": None,
        "company_url": None,
        "location": None,
        "time_posted": None,
        "num_applicants": None,
        "employment_type": None,
        "job_level": None,
        "job_description": None,
        "job_url": job_url
    }
    
    try:
        response = requests.get(job_url, headers=get_random_headers())
        time.sleep(DELAY)
        
        if response.status_code != 200:
            return job_post
            
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract job title
        title_element = soup.find("h1", {"class": "top-card-layout__title"})
        if title_element:
            job_post["job_title"] = title_element.text.strip()
        
        # Extract company info
        company_element = soup.find("a", {"class": "topcard__org-name-link"})
        if company_element:
            job_post["company_name"] = company_element.text.strip()
            job_post["company_url"] = company_element.get("href", "")
        
        # Extract location
        location_element = soup.find("span", {"class": "topcard__flavor--bullet"})
        if location_element:
            job_post["location"] = location_element.text.strip()
        
        # Extract posting time
        time_element = soup.find("span", {"class": "posted-time-ago__text"})
        if time_element:
            job_post["time_posted"] = time_element.text.strip()
        
        # Extract applicants
        applicants_element = soup.find("span", {"class": "num-applicants__caption"})
        if applicants_element:
            job_post["num_applicants"] = applicants_element.text.strip()
        
        # Extract employment type and level
        criteria_elements = soup.find_all("span", {"class": "description__job-criteria-text"})
        if criteria_elements and len(criteria_elements) >= 2:
            job_post["employment_type"] = criteria_elements[0].text.strip()
            job_post["job_level"] = criteria_elements[1].text.strip()
        
        # Extract job description
        description_element = soup.find("div", {"class": "show-more-less-html__markup"})
        if description_element:
            job_post["job_description"] = description_element.text.strip()
            
    except Exception as e:
        print(f"Error scraping job {job['job_id']}: {str(e)}")
    
    return job_post

def main():
    all_jobs = []
    
    # First phase: Collect all job listings
    for job_title in JOBS:
        for location in LOCATIONS:
            print(f"Scraping listings for: {job_title} in {location}")
            listings = scrape_job_listings(job_title, location)
            all_jobs.extend(listings)
            print(f"Found {len(listings)} listings for {job_title} in {location}")
    
    print(f"Total jobs to scrape: {len(all_jobs)}")
    
    # Second phase: Scrape details for each job (with threading)
    job_details = []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = [executor.submit(scrape_job_details, job) for job in all_jobs]
        for i, future in enumerate(futures):
            try:
                job_details.append(future.result())
                if i % 10 == 0:
                    print(f"Processed {i+1}/{len(all_jobs)} jobs")
            except Exception as e:
                print(f"Error processing job: {str(e)}")
    
    # Save to DataFrame and CSV
    df = pd.DataFrame(job_details)
    df.to_csv("linkedin_jobs.csv", index=False)
    print("Scraping complete. Data saved to linkedin_jobs.csv")
    
    return df

if __name__ == "__main__":
    df = main()

Scraping listings for: commerce & teleconseille in Morocco
Found 0 listings for commerce & teleconseille in Morocco
Scraping listings for: commerce & teleconseille in Europe
Found 0 listings for commerce & teleconseille in Europe
Scraping listings for: commerce & teleconseille in Middle East
Found 0 listings for commerce & teleconseille in Middle East
Scraping listings for: commerce & teleconseille in USA
Found 0 listings for commerce & teleconseille in USA
Scraping listings for: commerce & teleconseille in Canada
Found 0 listings for commerce & teleconseille in Canada
Scraping listings for: maintenance informatique in Morocco
Found 78 listings for maintenance informatique in Morocco
Scraping listings for: maintenance informatique in Europe
Found 15 listings for maintenance informatique in Europe
Scraping listings for: maintenance informatique in Middle East
Failed to fetch listings for maintenance informatique in Middle East at start=125
Found 50 listings for maintenance informatique 

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt


In [8]:
import seaborn as sns

In [4]:
df = pd.read_csv('linkedin_jobs.csv')

In [5]:
df

Unnamed: 0,job_id,search_job_title,search_location,job_title,company_name,company_url,location,time_posted,num_applicants,employment_type,job_level,job_description,job_url
0,4152448187,maintenance informatique,Morocco,Consultant(e) Office 365,Devoteam,https://fr.linkedin.com/company/devoteam?trk=p...,"cercle de El Borouj, Casablanca-Settat, Morocco",2 weeks ago,34 applicants,Mid-Senior level,Full-time,"Description de l'entrepriseChez Devoteam, nous...",https://www.linkedin.com/jobs/view/4152448187
1,4203665899,maintenance informatique,Morocco,Gestionnaire Parc Informatique (H/F),LabelVie,https://ma.linkedin.com/company/label'vie?trk=...,Casablanca Metropolitan Area,4 days ago,,Entry level,Full-time,Créé en 1986 et côté à la Bourse de Casablanca...,https://www.linkedin.com/jobs/view/4203665899
2,4171304027,maintenance informatique,Morocco,Local IT,Expleo Group,https://fr.linkedin.com/company/expleo-group?t...,"Tangier, Tanger-Tetouan-Al Hoceima, Morocco",10 hours ago,64 applicants,Entry level,Full-time,"Groupe EXPLEO, plus qu’une histoire, une avent...",https://www.linkedin.com/jobs/view/4171304027
3,4183850804,maintenance informatique,Morocco,Manager des systèmes métier,RATP Dev Casablanca,https://ma.linkedin.com/company/casatram-ratp-...,"Casablanca, Casablanca-Settat, Morocco",1 week ago,,Mid-Senior level,Full-time,Gestion et supervision des systèmes d’informat...,https://www.linkedin.com/jobs/view/4183850804
4,4195948121,maintenance informatique,Morocco,Consultant Junior Audit IT,PwC au Maroc,https://ma.linkedin.com/company/pwc-au-maroc?t...,"Casablanca, Casablanca-Settat, Morocco",2 weeks ago,184 applicants,Associate,Full-time,Ce que vous pouvez attendre de nous :Réaliser ...,https://www.linkedin.com/jobs/view/4195948121
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,4191107010,Creation du contenu,Canada,"Développeur(se) Senior, Infrastructure | Senio...",Unity,https://www.linkedin.com/company/unity?trk=pub...,"Montreal, Quebec, Canada",2 weeks ago,,Mid-Senior level,Full-time,"L'opportunitéChez Unity, notre mission va au-d...",https://www.linkedin.com/jobs/view/4191107010
2605,4159681944,Creation du contenu,Canada,Conseille(ère) Sécurité Principal(e) - Gestion...,Intact,https://ca.linkedin.com/company/intact?trk=pub...,"Montreal, Quebec, Canada",7 hours ago,,Mid-Senior level,Full-time,Nos équipes sont au coeur de tout ce que nous ...,https://www.linkedin.com/jobs/view/4159681944
2606,4184489641,Creation du contenu,Canada,"Manager, Data Science_fr",Intact,https://ca.linkedin.com/company/intact?trk=pub...,"Vancouver, British Columbia, Canada",5 days ago,,Mid-Senior level,Full-time,Nos équipes sont au coeur de tout ce que nous ...,https://www.linkedin.com/jobs/view/4184489641
2607,4203877778,Creation du contenu,Canada,"Consultant Senior / Directeur - Énergie, resso...",Deloitte,https://www.linkedin.com/company/deloitte?trk=...,"Montreal, Quebec, Canada",5 days ago,,Director,Full-time,Type de poste : PermanentModèle de travail : H...,https://www.linkedin.com/jobs/view/4203877778
