# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Headers sent with every HTTP request in this script
headers = {'User-Agent': 'Mozilla/5.0'}
# One dict per scraped job card, with keys Title, Company, Location, Link
jobs_list = []


def _card_text(card, tag, css_class):
    """Return the stripped text of the first matching tag in card, or '' if absent."""
    node = card.find(tag, class_=css_class)
    return node.text.strip() if node is not None else ''


def _card_link(card):
    """Return the href of the card's full-link anchor, or '' if it is missing."""
    anchor = card.find('a', class_='base-card__full-link')
    if anchor is None:
        return ''
    return anchor.get('href', '')


# Request the search results at start offsets 0, 25, ..., 225 (10 requests);
# the step assumes 25 results per page.
for start in range(0, 250, 25):
    url = f'https://www.linkedin.com/jobs/search/?keywords=Junior%20Data%20Analyst&location=Germany&start={start}'
    print(f'🔍 Scraping: {url}')
    try:
        # The timeout keeps a stalled connection from hanging the whole run;
        # raise_for_status() stops 4xx/5xx pages from being parsed as results.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failed page and carry on with the next one
        print("❌ Error:", e)
        job_cards = []
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        job_cards = soup.find_all('div', class_='base-card')
        print(f'   ➤ Found: {len(job_cards)} job listings')

    for job in job_cards:
        # Missing elements become '' instead of raising, so one incomplete
        # card never drops the rest of the page.
        jobs_list.append({
            'Title': _card_text(job, 'h3', 'base-search-card__title'),
            'Company': _card_text(job, 'h4', 'base-search-card__subtitle'),
            'Location': _card_text(job, 'span', 'job-search-card__location'),
            'Link': _card_link(job)
        })

    time.sleep(2)  # Pause between pages to avoid getting blocked

# Collect job descriptions: one entry per item in jobs_list, in the same order.
# Each entry is the page's description text, 'Not found' when the page has no
# matching block, or 'Error' when the page could not be fetched.
print("📝 Fetching job descriptions...")
descriptions = []

for i, job in enumerate(jobs_list):
    url = job['Link']
    print(f"{i+1}/{len(jobs_list)} | {url}")
    if not url:
        # No link was scraped for this card; a request for '' can only fail,
        # so record the failure without the network round trip and delay.
        print("❌ Error:", "missing job link")
        descriptions.append('Error')
        continue
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx responses as errors instead of parsing the error page
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # First <div> having a CSS class that contains the substring 'description'
        desc_block = soup.find('div', {'class': lambda x: x and 'description' in x})
        if desc_block:
            description = desc_block.get_text(separator=' ', strip=True)
        else:
            description = 'Not found'
    except Exception as e:
        # Deliberately broad: this is the per-job boundary, and one bad page
        # must not discard the descriptions already collected.
        print("❌ Error:", e)
        description = 'Error'
    descriptions.append(description)
    time.sleep(1.5)  # Be respectful with delays

# Turn the scraped cards into a table and add the fetched descriptions as a column
df = pd.DataFrame(jobs_list).assign(Description=descriptions)

# Lower-cased copy of the descriptions (missing values become '') used for keyword checks
description_text = df['Description'].fillna('')
df['description_lower'] = description_text.str.lower()

# True when the description mentions any remote/hybrid working keyword
remote_keywords = ('remote', 'home office', 'hybrid')
df['Remote'] = df['description_lower'].apply(
    lambda text: any(keyword in text for keyword in remote_keywords)
)

# Detect presence of specific skills in job descriptions.
# Each skill adds a boolean column named Has_<SKILL> (spaces become underscores).
skills_flags = ['python', 'sql', 'excel', 'power bi', 'tableau', 'r', 'git', 'vba', 'sas']
for skill in skills_flags:
    col_name = f'Has_{skill.replace(" ", "_").upper()}'
    # Match the skill only as a whole token. A plain substring test flags 'r'
    # for any text containing the letter r, 'git' for "digital" and 'excel'
    # for "excellent".
    skill_pattern = re.compile(rf'(?<!\w){re.escape(skill)}(?!\w)')
    # Bind the pattern as a default argument so the lambda keeps this
    # iteration's pattern.
    df[col_name] = df['description_lower'].apply(
        lambda x, pattern=skill_pattern: pattern.search(x) is not None
    )

# Basic language detection based on keywords
def detect_language(text):
    """Guess the language of text from a small set of marker words.

    Counts how many English and how many German marker words occur as
    substrings of the lower-cased text.

    Returns:
        'en' if at least two English markers occur and they outnumber the
        German ones, 'de' in the mirrored case, otherwise 'mixed' (this
        includes ties and texts with fewer than two markers of either kind).
    """
    english_markers = ('responsibilities', 'requirements', 'apply', 'english')
    german_markers = ('anforderungen', 'deutsch', 'bewerben', 'verantwortung')

    lowered = text.lower()
    english_hits = len([marker for marker in english_markers if marker in lowered])
    german_hits = len([marker for marker in german_markers if marker in lowered])

    if english_hits >= 2 and english_hits > german_hits:
        return 'en'
    if german_hits >= 2 and german_hits > english_hits:
        return 'de'
    return 'mixed'

# Label each posting with the language guessed from its lower-cased description
df['Lang'] = df['description_lower'].map(detect_language)

# Remove the intermediate lower-cased column before saving
del df['description_lower']

# Write the enriched table to CSV without the index column and report the row count
output_path = "linkedin_jobs_enriched.csv"
df.to_csv(output_path, index=False)
print("✅ Done! Total jobs saved:", len(df))