# Code for Scraping All Written Reviews on Kununu
This notebook contains code to scrape text reviews from Kununu's employee review pages.

### Workflow
1. **Parsing Individual Review Blocks (`parse_review_block`):**
   Extracts structured data from a single review, including the overall score, title, date, employment details, and the text of each subcategory.

2. **Fetching Reviews (`get_reviews_for_url`):**
   Scrapes all reviews for a given company page by page. The number of pages is computed from the company's review count, assuming 10 reviews per page.

3. **Batch Scraping (`scrape_all_reviews`):**
   Processes reviews for multiple companies in parallel using a thread pool for efficiency. Results are saved periodically to a JSON file.

### Output
- **Structured Review Data:** A JSON file containing all reviews for the specified companies.

### Notes
- The scraper relies on specific HTML structures defined in the `CSS_CLASSES` dictionary. Changes to Kununu’s website may require updates to these selectors.

In [3]:
# !pip install requests pandas tqdm python-dotenv beautifulsoup4
import pandas as pd
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import json
from datetime import datetime
import os
import requests
from dotenv import load_dotenv
from utils import *
from config import *

load_dotenv() # make sure to have a .env file that defines the variable 'SCRAPINGBEE_API_KEY' if using scrapingbee

# Maximum number of worker threads; passed as max_workers to the
# ThreadPoolExecutor created in scrape_all_reviews.
CONCURRENCY = 100

# Keep only the 'url' and 'employees_review_num' columns and drop every row
# in which either of them is missing.
df = pd.read_csv('data/kununu_data.csv', low_memory=False)[['url', 'employees_review_num']].dropna()

In [None]:
def _parse_score(text):
    """Return a score written with a decimal comma or point as float, else None."""
    try:
        return float(text.strip().replace(',', '.'))
    except ValueError:
        return None


def _parse_year_month(date_str):
    """Return ``(year, month)`` from a ``YYYY-MM-DD[T...]`` string, else ``(None, None)``."""
    date_parts = (date_str or '').split('T')[0].split('-')
    try:
        return int(date_parts[0]), int(date_parts[1])
    except (IndexError, ValueError):
        # Empty or malformed attribute: report the date as unknown instead of
        # letting one bad review abort the whole company.
        return None, None


def _parse_recommendation(text):
    """Map the recommendation label to True / False, or None if unrecognised."""
    # NOTE(review): assumes the label reads like "Empfohlen" / "Nicht empfohlen"
    # (or the English equivalents) -- confirm against the live markup.
    normalized = text.strip().lower()
    words = normalized.split()
    if 'nicht' in words or 'not' in words:
        return False
    if 'empfohlen' in normalized or 'recommend' in normalized:
        return True
    return None


def parse_review_block(block, kn_url):
    """Extract structured fields from one review element.

    Args:
        block: Parsed HTML element of a single review. It must support
            ``select_one`` and ``select`` with CSS selectors (presumably a
            BeautifulSoup ``Tag`` -- confirm against ``soup_from_url``).
        kn_url: Company URL the review belongs to; stored unchanged under
            ``'kn_url'``.

    Returns:
        dict with the keys ``kn_url``, ``overall_score``, ``title``, ``year``,
        ``month``, ``recommended``, ``employee_type``, ``position`` and
        ``subcategories``. Every key is always present. A field whose element
        is missing or whose content cannot be parsed is ``None``;
        ``subcategories`` is a (possibly empty) list of single-entry
        ``{title: text}`` dicts.
    """
    review = {'kn_url': kn_url}

    # Overall score
    score_el = block.select_one(CSS_CLASSES["overall_score"])
    review['overall_score'] = _parse_score(score_el.text) if score_el else None

    # Title
    title_el = block.select_one(CSS_CLASSES["title"])
    review['title'] = title_el.text.strip() if title_el else None

    # Date (only year and month are kept)
    date_el = block.select_one(CSS_CLASSES["date"])
    if date_el:
        review['year'], review['month'] = _parse_year_month(date_el.get('datetime', ''))
    else:
        review['year'], review['month'] = None, None

    # Recommendation: always set the key so every review has the same schema.
    rec_el = block.select_one(CSS_CLASSES["recommendation_block"])
    review['recommended'] = _parse_recommendation(rec_el.text) if rec_el else None

    # Employee type and position
    review['employee_type'] = None
    review['position'] = None
    emp_info_el = block.select_one(CSS_CLASSES["employment_info"])
    if emp_info_el:
        emp_type_el = emp_info_el.select_one('b')
        if emp_type_el:
            employee_type = emp_type_el.text.strip()
            review['employee_type'] = employee_type
            # The position is whatever remains once the bold employee type
            # has been removed from the element's text.
            position_text = emp_info_el.text.replace(employee_type, '', 1).strip()
            review['position'] = position_text or None

    # Subcategories: one {title: text} dict per factor that has a title.
    review['subcategories'] = []
    for factor in block.select(CSS_CLASSES["factor"]):
        cat_title_el = factor.select_one(CSS_CLASSES["factor_title"])
        if not cat_title_el:
            continue
        text_el = factor.select_one(CSS_CLASSES["factor_text"])
        cat_text = text_el.text.strip() if text_el else None
        review['subcategories'].append({cat_title_el.text.strip(): cat_text})

    return review

def get_reviews_for_url(kn_url, kn_employees_review_num):
    """Collect the parsed reviews from every review page of one company.

    Args:
        kn_url: Base URL of the company; ``/kommentare`` and a query string
            are appended to it for each page.
        kn_employees_review_num: Number of reviews the company has. The page
            count is this value divided by 10, rounded up.

    Returns:
        list of dicts produced by ``parse_review_block``, in page order.
        Reviews whose ``'title'`` is missing or empty are left out.
    """
    page_count = math.ceil(kn_employees_review_num / 10)
    collected = []
    for page_num in range(1, page_count + 1):
        # NOTE(review): the page number is placed directly before "sort" with
        # no parameter name or "&" separator -- confirm this is the query
        # format the site expects.
        page_soup = soup_from_url(f"{kn_url}/kommentare?{page_num}sort=newest")
        candidates = (
            parse_review_block(element, kn_url)
            for element in page_soup.select(CSS_CLASSES["review_block"])
        )
        # Keep only the reviews that actually carry a title.
        collected.extend(item for item in candidates if item.get('title'))
    return collected

def scrape_all_reviews(df, save_path="data/scraped_reviews.json"):
    """Scrape the reviews of every company in ``df`` using a thread pool.

    Args:
        df: DataFrame with one row per company. The company URL is read from
            the ``kn_url`` column if it exists, otherwise from ``url``; the
            review count is read from ``kn_employees_review_num`` if it
            exists, otherwise from ``employees_review_num``.
        save_path: JSON file the results are written to. It is overwritten
            after every 10,000 successfully scraped companies and once more
            at the end.

    Returns:
        dict mapping each successfully scraped company URL to the list
        returned by ``get_reviews_for_url``. Companies whose scrape raised
        are reported on stdout and left out.

    Raises:
        KeyError: if ``df`` has neither naming variant of a required column.
        OSError: if the final save to ``save_path`` fails.
    """
    # Accept both the kn_-prefixed column names and the unprefixed ones, so a
    # frame carrying only 'url' / 'employees_review_num' works as well.
    url_col = 'kn_url' if 'kn_url' in df.columns else 'url'
    count_col = (
        'kn_employees_review_num'
        if 'kn_employees_review_num' in df.columns
        else 'employees_review_num'
    )

    results = {}
    counter = 0  # Number of companies scraped successfully

    # The context manager closes the progress bar even if an error escapes.
    with tqdm(total=len(df), desc="Scraping reviews") as progress_bar:
        with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
            futures = {
                executor.submit(get_reviews_for_url, url, int(review_num)): url
                for url, review_num in zip(df[url_col], df[count_col])
            }

            for future in as_completed(futures):
                kn_url = futures[future]
                try:
                    results[kn_url] = future.result()
                except Exception as e:
                    # Best effort: one failing company must not stop the run.
                    print(f"Error scraping {kn_url}: {e}")
                else:
                    counter += 1
                    # Checkpoint every 10,000 companies scraped
                    if counter % 10000 == 0:
                        try:
                            with open(save_path, "w") as f:
                                json.dump(results, f)
                        except OSError as e:
                            print(f"Error saving checkpoint to {save_path}: {e}")
                finally:
                    # Count failed companies too, so the bar reaches its total.
                    progress_bar.update(1)

    # Final save at the end
    with open(save_path, "w") as f:
        json.dump(results, f)
    print("Final data saved.")
    return results

# Scrape every company listed in df; the results are written to the function's
# default save_path and also returned.
scrape_all_reviews(df)