In [1]:
import random
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request headers sent with every HTTP call made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0',
}

# Collects one dict per job card found on the search result pages.
jobs_list = []

# Locations to search: the whole country first, then individual cities.
germany_cities = [
    "Germany",
    "Berlin",
    "München",
    "Hamburg",
    "Frankfurt",
    "Köln",
    "Stuttgart",
    "Düsseldorf",
    "Leipzig",
    "Dresden",
    "Hannover",
    "Nürnberg",
    "Bremen",
    "Essen",
    "Dortmund",
]

# Search phrases; each one is combined with every location above.
keywords = [
    "Junior Data Analyst",
    "Data Analyst",
    "Business Intelligence Analyst",
    "Werkstudent Data",
    "Praktikum Data",
]

def _card_text(card, tag, css_class):
    """Return the stripped text of the first ``tag.css_class`` child of *card*, or ''."""
    node = card.find(tag, class_=css_class)
    return node.text.strip() if node else ''


# Walk every city/keyword combination and request four result pages each
# (start offsets 0, 25, 50 and 75), appending one dict per job card to jobs_list.
for city in germany_cities:
    # Percent-encode the query values instead of hand-replacing characters:
    # this keeps umlauts intact ("München" -> "M%C3%BCnchen") and also escapes
    # characters such as '&' or '+' that would otherwise corrupt the query string.
    city_encoded = quote(city, safe='')
    for kw in keywords:
        kw_encoded = quote(kw, safe='')

        for start in range(0, 100, 25):
            url = f'https://www.linkedin.com/jobs/search/?keywords={kw_encoded}&location={city_encoded}&start={start}'
            print(f'🔍 Scraping: {url}')
            try:
                response = requests.get(url, headers=headers, timeout=15)
                # Treat HTTP error statuses as failures so that an error page is
                # not parsed as if it were a (possibly empty) result page.
                response.raise_for_status()
            except requests.RequestException as e:
                print("❌ Error:", e)
                # Back off longer after a failed request before trying the next page.
                time.sleep(10)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.find_all('div', class_='base-card')
            print(f'   ➤ Found: {len(job_cards)} job listings')

            for job in job_cards:
                anchor = job.find('a', class_='base-card__full-link')
                jobs_list.append({
                    'Title': _card_text(job, 'h3', 'base-search-card__title'),
                    'Company': _card_text(job, 'h4', 'base-search-card__subtitle'),
                    'Location': _card_text(job, 'span', 'job-search-card__location'),
                    # .get() avoids a KeyError on an anchor without an href attribute.
                    'Link': anchor.get('href', '') if anchor else '',
                    'Keyword': kw,
                    'City': city,
                })
            # Randomised pause between successful page requests.
            time.sleep(random.uniform(2, 5))


# Download the detail page of every collected job and extract its description.
# `descriptions` stays index-aligned with `jobs_list`: exactly one entry per job.
print("📝 Fetching job descriptions...")
descriptions = []
# Successfully fetched descriptions keyed by URL, so a link that occurs more
# than once in jobs_list is only downloaded once.
fetched_descriptions = {}
total_jobs = len(jobs_list)

for i, job in enumerate(jobs_list):
    url = job['Link']
    print(f"{i+1}/{total_jobs} | {url}")

    if not url:
        # No link was found on the card; a request could only fail, so skip it
        # (and the delay) and record the same marker a failed request yields.
        print("❌ Error:", "missing job link")
        descriptions.append('Error')
        continue

    if url in fetched_descriptions:
        descriptions.append(fetched_descriptions[url])
        continue

    try:
        response = requests.get(url, headers=headers, timeout=12)
        # Do not mine an HTTP error page for a "description".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # First <div> with a CSS class containing the substring 'description'.
        desc_block = soup.find('div', {'class': lambda x: x and 'description' in x})
        if desc_block:
            description = desc_block.get_text(separator=' ', strip=True)
        else:
            description = 'Not found'
        # Only successful downloads are cached; failed URLs are retried if they recur.
        fetched_descriptions[url] = description
    except Exception as e:
        # Best effort: one failing page must not abort the whole run.
        print("❌ Error:", e)
        description = 'Error'
    descriptions.append(description)
    # Randomised pause after every real request.
    time.sleep(random.uniform(1.5, 4))


# Assemble the result table: one row per scraped job plus its description.
df = pd.DataFrame(jobs_list)
df['Description'] = descriptions

# Lower-cased helper column used by all keyword checks below.
df['description_lower'] = df['Description'].fillna('').str.lower()

# True when the description mentions any remote/hybrid work keyword (substring match).
remote_terms = ['remote', 'home office', 'hybrid', 'remotearbeit', 'homeoffice']
df['Remote'] = df['description_lower'].apply(
    lambda x: any(word in x for word in remote_terms)
)

skills_flags = [
    'python', 'sql', 'excel', 'power bi', 'tableau', 'r', 'git', 'vba', 'sas',
    'datenbank', 'datenanalyse', 'modellierung', 'ms office', 'englisch', 'deutsch', 'cloud', 'bi', 'sap', 'statistik',
    'big data', 'machine learning', 'etl', 'matplotlib', 'pandas', 'numpy', 'dashboard', 'reporting'
]
# Short tokens that also occur inside ordinary words ('r' in nearly every text,
# 'bi' in "bis"/"mobil", 'git' in "digital", 'sas' in "disaster", ...). These
# are matched as whole words only; a plain substring test would flag almost
# every posting. Longer terms keep substring matching so that inflected and
# compound forms ("datenbanken", "dashboards") still count.
whole_word_skills = {'r', 'bi', 'git', 'sas', 'sap', 'etl', 'vba'}

for skill in skills_flags:
    col_name = f'Has_{skill.replace(" ", "_").upper()}'
    escaped = re.escape(skill)
    if skill in whole_word_skills:
        # Lookarounds instead of \b so the token must not touch any word character.
        matcher = re.compile(rf'(?<!\w){escaped}(?!\w)')
    else:
        matcher = re.compile(escaped)
    # Bind the compiled pattern as a default argument so each lambda keeps its own.
    df[col_name] = df['description_lower'].apply(
        lambda text, m=matcher: bool(m.search(text))
    )


def seniority_detect(text):
    """Classify the seniority level suggested by keywords in *text*.

    The text is lower-cased and each level's keywords are looked up by plain
    substring search, in this fixed order: Werkstudent, Praktikum, Junior,
    Senior, Manager, Entry. The first level with at least one hit is
    returned; if no keyword occurs at all the result is 'Other'.
    """
    level_keywords = (
        ('Werkstudent', ('werkstudent', 'working student')),
        ('Praktikum', ('praktikum', 'internship', 'praktikant')),
        ('Junior', ('junior',)),
        ('Senior', ('senior', 'lead', 'principal')),
        ('Manager', ('manager', 'leitung', 'leiter', 'head')),
        ('Entry', ('entry', 'anfänger', 'einsteiger')),
    )
    lowered = text.lower()
    for level, needles in level_keywords:
        for needle in needles:
            if needle in lowered:
                return level
    return 'Other'

# Derive a seniority label for every row from its lower-cased description text.
df['Seniority'] = df['description_lower'].map(seniority_detect)

def detect_language(text):
    """Guess whether *text* is English ('en'), German ('de') or undecided ('mixed').

    Counts how many of a few English and German marker words occur in the
    lower-cased text (substring search, each marker counted at most once).
    A language is reported only if it has at least two hits and strictly more
    hits than the other one; every other case yields 'mixed'.
    """
    english_markers = ('responsibilities', 'requirements', 'apply', 'english')
    german_markers = ('anforderungen', 'deutsch', 'bewerben', 'verantwortung', 'aufgaben')

    lowered = text.lower()
    en_hits = len([marker for marker in english_markers if marker in lowered])
    de_hits = len([marker for marker in german_markers if marker in lowered])

    if en_hits >= 2 and en_hits > de_hits:
        return 'en'
    if de_hits >= 2 and de_hits > en_hits:
        return 'de'
    return 'mixed'

# Label every row with the language guessed from its lower-cased description.
df['Lang'] = df['description_lower'].map(detect_language)
# The lower-cased helper column is not part of the exported data.
del df['description_lower']


# Write the enriched table to CSV (without the index) and report the row count.
output_file = "linkedin_jobs_germany_enriched.csv"
df.to_csv(output_file, index=False)
saved_rows = len(df)
print("✅ Fertig! Gespeichert:", saved_rows, "Jobs")



🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Junior%20Data%20Analyst&location=Germany&start=0
   ➤ Found: 7 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Junior%20Data%20Analyst&location=Germany&start=25
   ➤ Found: 7 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Junior%20Data%20Analyst&location=Germany&start=50
   ➤ Found: 34 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Junior%20Data%20Analyst&location=Germany&start=75
   ➤ Found: 7 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&location=Germany&start=0
   ➤ Found: 50 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&location=Germany&start=25
   ➤ Found: 7 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&location=Germany&start=50
   ➤ Found: 7 job listings
🔍 Scraping: https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&l