In [None]:
# NOTE: Python 3.12 is required for this notebook to work.

!pip uninstall numpy pandas jobspy -y
!pip install numpy==1.26.4
!pip install python-jobspy

In [None]:
"""
German IT Job Scraper - Bilingual (English + German)
Scrapes jobs in both English and German
"""

import csv
import json
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import pandas as pd
from jobspy import scrape_jobs

# ============================================================================
# COLORED LOGGING
# ============================================================================

class ColoredFormatter(logging.Formatter):
    """
    Formatter that prefixes the level name with an emoji and wraps the whole
    formatted line in an ANSI color escape sequence chosen by log level.
    """

    # ANSI color escape per level name; 'RESET' restores the terminal default
    # and doubles as the fallback color for unknown level names.
    COLORS = {
        'DEBUG': '\033[36m',
        'INFO': '\033[32m',
        'WARNING': '\033[33m',
        'ERROR': '\033[31m',
        'CRITICAL': '\033[35m',
        'RESET': '\033[0m',
    }

    # Emoji shown in front of the level name.
    EMOJI = {
        'DEBUG': '🔍',
        'INFO': '✅',
        'WARNING': '⚠️',
        'ERROR': '❌',
        'CRITICAL': '🔥',
    }

    def format(self, record):
        """
        Return the formatted record, colored and emoji-prefixed.

        The record's ``levelname`` is decorated only for the duration of the
        call and restored afterwards, so the same record can be formatted
        again (by this or any other handler) without the decoration piling up
        or the color lookup failing.
        """
        levelname = record.levelname
        color = self.COLORS.get(levelname, self.COLORS['RESET'])
        emoji = self.EMOJI.get(levelname, '')

        # Unknown levels have no emoji; avoid emitting a stray leading space.
        record.levelname = f"{emoji} {levelname}" if emoji else levelname
        try:
            message = super().format(record)
        finally:
            # The record object is shared between handlers: undo the mutation.
            record.levelname = levelname

        return f"{color}{message}{self.COLORS['RESET']}"


# ============================================================================
# CONFIGURATION
# ============================================================================

@dataclass
class ScraperConfig:
    """Configuration for the job scraper"""

    # Job boards to scrape
    job_boards: List[str] = field(default_factory=lambda: ["indeed", "linkedin"])
    # job_boards: List[str] = field(default_factory=lambda: ["linkedin"])
    # job_boards: List[str] = field(default_factory=lambda: ["indeed"])

    # Cities to scrape
    cities: List[str] = field(default_factory=lambda: [
        "Stuttgart", "Munich", "Berlin", "Potsdam", "Bremen", "Hamburg",
        "Frankfurt", "Hanover", "Rostock", "Cologne", "Mainz", "Saarbr√ºcken",
        "Dresden", "Magdeburg", "Kiel", "Erfurt",
        "D√ºsseldorf", "Dortmund", "Essen", "Leipzig", "N√ºrnberg",
        "Karlsruhe", "Mannheim", "Augsburg", "Wiesbaden", "M√ºnster",
        "Bonn", "Freiburg", "Aachen", "Heidelberg", "Ulm", "Darmstadt",
        "Regensburg", "Bielefeld"
    ])

    # Bilingual job search keywords (English + German)
    keywords: List[str] = field(default_factory=lambda: [
        # English keywords
        "software engineer",
        "software architect",
        "software developer",
        "data engineer",
        "Data analyst",
        "data scientist",
        "BI developer",
        "Cloud Engineer",
        "Cloud architect",
        "DevOps engineer",
        "IT administrator",
        "backend developer",
        "frontend developer",
        "full stack developer",
        "Apps developer",
        "SAP developer",
        "machine learning engineer",
        "AI engineer",
        "cybersecurity engineer",

        # German keywords (Softwareentwickler, etc.)
       "Softwareentwickler",
       "Softwarearchitekt",
       "Softwareentwickler",
       "Data Engineer",
       "Datenanalyst",
       "Data Scientist",
       "BI-Entwickler",
       "Cloud Engineer",
       "Cloud-Architekt",
       "DevOps Engineer",
       "IT-Administrator",
       "Backend Entwickler",
       "Frontend Entwickler",
       "FullStack Entwickler",
       "Apps Entwickler",
       "SAP Entwickler",
       "Machine Learning Engineer",
       "KI-Ingenieur",
       "IT-Security Engineer",
    ])

    # Scraping parameters
    results_per_search: int = 150
    job_type: str = "fulltime"
    distance_km: int = 50
    hours_old: int = 720 * 4  # 120 days
    country: str = "Germany"

    # LinkedIn specific
    linkedin_fetch_description: bool = False

    # Rate limiting
    request_delay: int = 3
    linkedin_delay: int = 6  # LinkedIn is more restrictive
    retry_delay: int = 5
    max_retries: int = 3

    # Deduplication across languages
    remove_duplicates: bool = True  # Remove duplicates found in both EN/DE searches

    # Output directory
    output_dir: Path = field(default_factory=lambda: Path("job_data"))

    def __post_init__(self):
        """Create output directory"""
        self.output_dir.mkdir(parents=True, exist_ok=True)


# ============================================================================
# LOGGING SETUP
# ============================================================================

def setup_logging(config: ScraperConfig) -> logging.Logger:
    """
    Configure the "JobScraper" logger with a log file and a colored console.

    Handlers left over from an earlier call are closed and removed first, so
    calling this repeatedly (e.g. re-running a notebook cell) neither
    duplicates output nor leaks the previously opened log file.

    Args:
        config: Scraper configuration; only ``output_dir`` is read, to decide
            where the timestamped log file is created.

    Returns:
        The "JobScraper" logger: DEBUG and above go to the file, INFO and
        above go to the console through ``ColoredFormatter``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_file = config.output_dir / f"Log_{timestamp}.log"

    logger = logging.getLogger("JobScraper")
    logger.setLevel(logging.DEBUG)

    # Clearing the handler list alone would leave the old log file open;
    # close each stale handler explicitly before dropping it.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # File handler: full detail, plain text
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        '%(asctime)s | %(levelname)-8s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    file_handler.setFormatter(file_formatter)

    # Console handler: INFO and above, colored
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(ColoredFormatter('%(levelname)s %(message)s'))

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger


# ============================================================================
# UTILITY
# ============================================================================

def get_timestamp() -> str:
    """Return the current local time as ``YYYY-MM-DD_HH-MM-SS`` (safe for file names)."""
    now = datetime.now()
    return f"{now:%Y-%m-%d_%H-%M-%S}"


def format_duration(seconds: float) -> str:
    """
    Render a duration given in seconds with one decimal place.

    Values below one minute are shown in seconds ("s"), values below one hour
    in minutes ("m"), everything else in hours ("h").
    """
    # (exclusive upper bound in seconds, divisor, unit suffix)
    for upper_bound, divisor, unit in ((60, 1, "s"), (3600, 60, "m")):
        if seconds < upper_bound:
            return f"{seconds / divisor:.1f}{unit}"
    return f"{seconds / 3600:.1f}h"


# ============================================================================
# MULTI-SITE BILINGUAL JOB SCRAPER
# ============================================================================

class BilingualJobScraper:
    """Fetch raw job data from multiple job boards in English and German"""

    def __init__(self, config: ScraperConfig):
        """Store the configuration, set up logging and zero the per-board bookkeeping."""
        self.config = config
        self.logger = setup_logging(config)

        boards = config.job_boards

        # One list of result frames per board; each board gets its own list.
        self.raw_data: Dict[str, List[pd.DataFrame]] = {name: [] for name in boards}

        # Search and job counters, kept separately for every board.
        counters_by_board = {}
        for name in boards:
            counters_by_board[name] = {
                'successful': 0,
                'failed': 0,
                'jobs_raw': 0,
                'jobs_deduped': 0,
            }

        self.stats = {
            'start_time': None,
            'end_time': None,
            'total_searches': 0,
            'by_board': counters_by_board,
        }

    def run(self, download_in_colab: bool = True) -> Dict[str, Tuple[Path, Path]]:
        """
        Scrape jobs from all configured job boards and save the results.

        Each board is scraped completely first; afterwards the collected
        frames are combined per board, optionally deduplicated on ``job_url``,
        written to CSV and JSON, and a summary is logged.

        Args:
            download_in_colab: When True, every saved CSV/JSON pair is passed
                to ``_download_files`` right after it is written.

        Returns:
            Dictionary mapping board name to (csv_path, json_path). Boards
            that produced no data are not included.

        Raises:
            Exception: Any error raised while scraping or saving is logged
                with its traceback and then re-raised.
        """
        self.stats['start_time'] = datetime.now()

        self._print_header()

        try:
            # Scrape from each job board
            for board in self.config.job_boards:
                self.logger.info(f"\n{'='*80}")
                self.logger.info(f"🌐 Starting scrape from: {board.upper()}")
                self.logger.info(f"{'='*80}")

                self._scrape_board(board)

            # Save data for each board
            saved_files = {}
            for board in self.config.job_boards:
                frames = self.raw_data[board]
                if not frames:
                    self.logger.warning(f"No data collected from {board}")
                    continue

                board_stats = self.stats['by_board'][board]
                combined_df = pd.concat(frames, ignore_index=True)
                board_stats['jobs_raw'] = len(combined_df)

                # Optional deduplication: the same posting can be returned by
                # several keyword/city searches.
                if self.config.remove_duplicates:
                    original_count = len(combined_df)
                    combined_df = combined_df.drop_duplicates(subset=['job_url'], keep='first')
                    duplicates = original_count - len(combined_df)
                    self.logger.info(f"\n🧹 {board.upper()}: Removed {duplicates} duplicates "
                                     f"({original_count} → {len(combined_df)})")

                # Equals jobs_raw when deduplication is switched off.
                board_stats['jobs_deduped'] = len(combined_df)

                csv_path, json_path = self._save_data(combined_df, board)
                saved_files[board] = (csv_path, json_path)

                # Download if in Colab
                if download_in_colab:
                    self._download_files(csv_path, json_path)

            # Final summary
            self.stats['end_time'] = datetime.now()
            self._print_summary(saved_files)

            return saved_files

        except Exception as e:
            self.logger.error(f"Fatal error: {str(e)}", exc_info=True)
            raise

    def _print_header(self):
        """Print startup header"""
        self.logger.info("="*80)
        self.logger.info("üöÄ BILINGUAL GERMAN JOB SCRAPER (EN + DE)")
        self.logger.info("="*80)
        self.logger.info(f"üåê Job Boards: {', '.join(self.config.job_boards)}")
        self.logger.info(f"üìç Cities: {len(self.config.cities)}")
        self.logger.info(f"üîë Keywords: {len(self.config.keywords)} (English + German)")
        self.logger.info(f"üìÖ Date Range: Last {self.config.hours_old//24} days")

        total_searches = len(self.config.job_boards) * len(self.config.cities) * len(self.config.keywords)
        self.stats['total_searches'] = total_searches

        self.logger.info(f"üéØ Total searches: {total_searches}")
        self.logger.info("="*80)

    def _scrape_board(self, board: str):
        """Scrape all cities and keywords for a specific job board"""
        searches_for_board = len(self.config.cities) * len(self.config.keywords)
        current_search = 0

        for city in self.config.cities:
            for keyword in self.config.keywords:
                current_search += 1

                # Detect language for logging
                lang = "üá©üá™ DE" if self._is_german_keyword(keyword) else "üá¨üáß EN"

                self.logger.info(
                    f"üîé [{current_search}/{searches_for_board}] "
                    f"{board.upper()} {lang}: '{keyword}' in {city}"
                )

                # Try with retries
                for attempt in range(self.config.max_retries):
                    try:
                        jobs = self._scrape_single(board, city, keyword)

                        if jobs is not None and len(jobs) > 0:
                            self.logger.info(f"   ‚úì Found {len(jobs)} jobs")
                            self.raw_data[board].append(jobs)
                            self.stats['by_board'][board]['successful'] += 1
                            break
                        else:
                            self.logger.warning(f"   ‚úó No jobs found")
                            self.stats['by_board'][board]['failed'] += 1
                            break

                    except Exception as e:
                        self.logger.error(f"   ‚úó Attempt {attempt + 1} failed: {str(e)}")
                        if attempt < self.config.max_retries - 1:
                            self.logger.info(f"   ‚è≥ Retrying in {self.config.retry_delay}s...")
                            time.sleep(self.config.retry_delay)
                        else:
                            self.stats['by_board'][board]['failed'] += 1

                # Board-specific delay
                delay = self.config.linkedin_delay if board == "linkedin" else self.config.request_delay
                time.sleep(delay)

    def _is_german_keyword(self, keyword: str) -> bool:
        """Check if keyword is in German"""
        german_indicators = [
            'entwickler', 'ingenieur', 'wissenschaftler', 'administrator',
            'spezialist', 'systemadministrator'
        ]
        return any(indicator in keyword.lower() for indicator in german_indicators)

    def _scrape_single(self, board: str, city: str, keyword: str) -> Optional[pd.DataFrame]:
        """
        Execute a single job search on one board.

        Args:
            board: Job board name, passed to ``scrape_jobs`` as the only site.
            city: Search location.
            keyword: Search term.

        Returns:
            Whatever ``scrape_jobs`` returns for the search (the caller treats
            ``None`` and an empty result the same way).
        """
        config = self.config

        # python-jobspy documents ``distance`` in miles, while the
        # configuration stores kilometres; convert instead of passing the
        # number through unchanged (50 km would otherwise become 50 miles).
        km_per_mile = 1.609344
        distance_miles = round(config.distance_km / km_per_mile)

        params = {
            'site_name': [board],
            'search_term': keyword,
            'location': city,
            'results_wanted': config.results_per_search,
            'job_type': config.job_type,
            'distance': distance_miles,
            'verbose': 0
        }

        # Add board-specific parameters
        if board == "indeed":
            params['country_indeed'] = config.country
            # NOTE(review): the python-jobspy README lists hours_old and
            # job_type as mutually exclusive filters for Indeed - confirm
            # which of the two actually takes effect.
            params['hours_old'] = config.hours_old

        elif board == "linkedin":
            params['hours_old'] = config.hours_old
            params['linkedin_fetch_description'] = config.linkedin_fetch_description

        return scrape_jobs(**params)

    def _save_data(self, df: pd.DataFrame, board: str) -> Tuple[Path, Path]:
        """
        Save one board's jobs to a timestamped CSV and JSON file.

        Both files are written to ``self.config.output_dir`` and share the
        stem ``Raw_Jobs_<BOARD>_<timestamp>``.

        Args:
            df: Jobs to write.
            board: Board name; upper-cased for the file name and log lines.

        Returns:
            ``(csv_path, json_path)`` of the files that were written.
        """
        timestamp = get_timestamp()
        file_stem = f"Raw_Jobs_{board.upper()}_{timestamp}"
        csv_path = self.config.output_dir / f"{file_stem}.csv"
        json_path = self.config.output_dir / f"{file_stem}.json"

        self.logger.info(f"\n💾 Saving {board.upper()} data...")

        # Save CSV: quote every non-numeric field, backslash as escape character
        df.to_csv(csv_path, index=False, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\")
        self.logger.info(f"   ✓ CSV saved: {csv_path.name} ({len(df)} jobs)")

        # Save JSON: one object per job, non-ASCII characters kept readable
        df.to_json(json_path, orient='records', indent=2, force_ascii=False)
        self.logger.info(f"   ✓ JSON saved: {json_path.name}")

        return csv_path, json_path

    def _download_files(self, csv_path: Path, json_path: Path):
        """Download files in Colab"""
        try:
            from google.colab import files
            files.download(str(csv_path))
            files.download(str(json_path))
        except ImportError:
            pass
        except Exception as e:
            self.logger.warning(f"   Could not download: {str(e)}")

    def _print_summary(self, saved_files: Dict[str, Tuple[Path, Path]]):
        """
        Log the final run summary and then write the metadata file.

        Reads ``self.stats`` (``start_time`` and ``end_time`` must both be
        set) and reports duration, per-board counters, the total number of
        unique jobs and the saved file names.

        Args:
            saved_files: Mapping of board name to (csv_path, json_path), as
                built by ``run``.
        """
        duration = (self.stats['end_time'] - self.stats['start_time']).total_seconds()
        separator = "=" * 80

        self.logger.info("\n" + separator)
        self.logger.info("✅ SCRAPING COMPLETE!")
        self.logger.info(separator)
        self.logger.info(f"⏱️  Duration: {format_duration(duration)}")

        # Stats by board
        total_jobs = 0
        for board in self.config.job_boards:
            stats = self.stats['by_board'][board]
            jobs = stats['jobs_deduped']
            jobs_raw = stats['jobs_raw']
            total_jobs += jobs

            self.logger.info(f"\n📊 {board.upper()}:")
            self.logger.info(f"   ✓ Successful searches: {stats['successful']}")
            self.logger.info(f"   ✗ Failed searches: {stats['failed']}")
            self.logger.info(f"   📦 Jobs collected: {jobs_raw} raw → {jobs} unique")

        # Uniqueness is per board; the total is the sum over boards.
        self.logger.info(f"\n📦 TOTAL UNIQUE JOBS: {total_jobs}")

        # Files
        self.logger.info("\n📁 SAVED FILES:")
        for board, (csv_path, json_path) in saved_files.items():
            self.logger.info(f"\n{board.upper()}:")
            self.logger.info(f"   📄 {csv_path.name}")
            self.logger.info(f"   📄 {json_path.name}")

        self.logger.info(f"\n📂 Location: {self.config.output_dir.absolute()}")
        self.logger.info(separator)

        # Save overall metadata
        self._save_metadata(total_jobs)

    def _save_metadata(self, total_jobs: int):
        """
        Write a JSON file describing this scraping run.

        The file ``Metadata_<timestamp>.json`` is created in
        ``self.config.output_dir`` and contains the configured boards, cities
        and keywords, the total job count and the collected statistics.

        Args:
            total_jobs: Total number of unique jobs across all boards.
        """
        timestamp = get_timestamp()
        metadata_path = self.config.output_dir / f"Metadata_{timestamp}.json"

        metadata = {
            'scrape_date': datetime.now().isoformat(),
            'job_boards': self.config.job_boards,
            'cities': self.config.cities,
            'keywords': self.config.keywords,
            'total_unique_jobs': total_jobs,
            'bilingual': True,
            'stats': self.stats
        }

        # Explicit UTF-8 with unescaped non-ASCII text, matching the job JSON
        # export, so city names with umlauts stay readable on every platform.
        # default=str covers the datetime values stored in self.stats.
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, default=str, ensure_ascii=False)

        self.logger.info(f"📋 Metadata saved: {metadata_path.name}")


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

if __name__ == "__main__":
    # Entry point: runs the full scrape with the default ScraperConfig.
    # EXAMPLE 1 below is kept commented out as a quick, small-scale alternative.

    # # ========================================================================
    # # EXAMPLE 1: Quick test (2 cities, few keywords, English + German)
    # # ========================================================================
    # print("\n🧪 Quick bilingual test...\n")

    # test_config = ScraperConfig(
    #     job_boards=["indeed"],  # Start with Indeed only
    #     cities=["Berlin", "Munich"],
    #     keywords=[
    #         "software engineer",      # English
    #         "Softwareentwickler",     # German
    #         "data engineer",          # English
    #         "Dateningenieur",         # German
    #     ],
    #     results_per_search=30,
    #     hours_old=168,  # Last 7 days
    #     remove_duplicates=True
    # )

    # scraper = BilingualJobScraper(test_config)
    # files = scraper.run(download_in_colab=True)


    # ========================================================================
    # EXAMPLE 2: Full bilingual run (Indeed + LinkedIn)
    # ========================================================================
    # print("\n🎯 Full bilingual scrape (Indeed + LinkedIn)...\n")

    # Default configuration: all default boards, cities and keywords.
    scraper = BilingualJobScraper(ScraperConfig())
    # `files` maps each board name to its (csv_path, json_path) pair.
    files = scraper.run(download_in_colab=True)




[32m‚úÖ INFO    ‚úì Found 30 jobs[0m
[32m‚úÖ INFO üîé [4/8] INDEED üá©üá™ DE: 'Dateningenieur' in Berlin[0m
[32m‚úÖ INFO üîé [5/8] INDEED üá¨üáß EN: 'software engineer' in Munich[0m
[32m‚úÖ INFO    ‚úì Found 30 jobs[0m
[32m‚úÖ INFO üîé [6/8] INDEED üá©üá™ DE: 'Softwareentwickler' in Munich[0m
[32m‚úÖ INFO    ‚úì Found 16 jobs[0m
[32m‚úÖ INFO üîé [7/8] INDEED üá¨üáß EN: 'data engineer' in Munich[0m
[32m‚úÖ INFO    ‚úì Found 30 jobs[0m
[32m‚úÖ INFO üîé [8/8] INDEED üá©üá™ DE: 'Dateningenieur' in Munich[0m
[32m‚úÖ INFO 
üßπ INDEED: Removed 26 duplicates (140 ‚Üí 114)[0m
[32m‚úÖ INFO 
üíæ Saving INDEED data...[0m
[32m‚úÖ INFO    ‚úì CSV saved: Raw_Jobs_INDEED_2026-01-29_15-38-39.csv (114 jobs)[0m
[32m‚úÖ INFO    ‚úì JSON saved: Raw_Jobs_INDEED_2026-01-29_15-38-39.json[0m
[32m‚úÖ INFO 
[32m‚úÖ INFO ‚úÖ SCRAPING COMPLETE![0m
[32m‚úÖ INFO ‚è±Ô∏è  Duration: 31.0s[0m
[32m‚úÖ INFO 
üìä INDEED:[0m
[32m‚úÖ INFO    ‚úì Successful searches: 6[0m
