# Scraping all Kununu Websites for German Firms

Requires scraping four different pages:
- Main Page: https://www.kununu.com/de/[company name] 
- Total Views: https://www.kununu.com/middlewares/profiles/[company uuid]/statistics
- Applicant Reviews: https://www.kununu.com/de/[company name]/bewerbung
- Employee Reviews: https://www.kununu.com/de/[company name]/kommentare

**Important Note**: This code works as of December 10th, 2024. Kununu may change their website structure, which would require updating the code.
In particular, it is likely that the CSS_CLASSES dictionary in config.py will need to be updated.

In [1]:
# !pip install requests beautifulsoup4 pandas numpy python-dotenv

from urllib.parse import urlencode
from multiprocessing.dummy import Pool as ThreadPool
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from utils import *
from config import *
from dotenv import load_dotenv
import json
import os

# load_dotenv() # make sure to have a .env file that defines the variable 'SCRAPINGBEE_API_KEY' if using scrapingbee

In [2]:
CONCURRENCY = 100  # Number of concurrent requests to make when scraping

# Load every Kununu company-profile link (one URL per line).
pwd = os.getcwd()
with open(pwd + "/data/all_kununu_company_profile_links.txt", "r", encoding="utf-8") as file:
    FileContent = file.read()

# splitlines() copes with both "\n" and "\r\n" line endings. Blank lines
# (for example the one produced by a trailing newline) are dropped so that
# an empty URL is never handed to the scraper.
all_kununu_links = [link.strip() for link in FileContent.splitlines() if link.strip()]

In [3]:
def get_stats_page(uuid: str, verbose: bool = True) -> dict:
    """Fetch the profile middleware endpoint for one company and return selected stats.

    The response text is passed through ``replace_null_with_none``,
    ``convert_keys_to_snake_case`` and ``flatten`` (helpers defined outside
    this block; presumably they parse the JSON body and normalise its keys).
    The ``recommendation_rate`` entries are then exposed under flat
    ``recommendation_rate_*`` keys and the result is restricted to
    ``required_keys``.

    Args:
        uuid: Company uuid taken from the main profile page. A falsy value
            (``None`` or ``""``) means no uuid was found; no request is made
            in that case.
        verbose: Print a note when the request is skipped.

    Returns:
        A dict with exactly the keys listed in ``required_keys``. Keys that
        are not present in the response map to ``None``.
    """
    if not uuid:
        # Without a uuid the URL below would be ".../profiles/None"; skip the
        # request and return the same shape the normal path produces.
        if verbose:
            print("No company uuid available; skipping stats page.")
        return {key: None for key in required_keys}

    response = requests.get(f"https://www.kununu.com/middlewares/profiles/{uuid}")
    data_dict = replace_null_with_none(response.text)
    data_dict = flatten(convert_keys_to_snake_case(data_dict))

    # Manually un-nest recommendation_rate. `or {}` also covers keys that are
    # present but hold None, which `.get(key, {})` alone would not.
    reviews = data_dict.get("reviews") or {}
    recommendation_rate = reviews.get("recommendation_rate") or {}
    data_dict.update(
        {
            "recommendation_rate_percentage": recommendation_rate.get("percentage"),
            "recommendation_rate_total_reviews": recommendation_rate.get("total_reviews"),
            "recommendation_rate_recommended_total_reviews": recommendation_rate.get("recommended_total_reviews"),
            "recommendation_rate_not_recommended_total_reviews": recommendation_rate.get("not_recommended_total_reviews"),
        }
    )
    data_dict.pop("reviews_recommendation_rate", None)

    # Keep only the columns the downstream table expects.
    return {key: data_dict.get(key) for key in required_keys}

def _scrape_satisfied_salary_pct(soup, verbose: bool):
    """Return the satisfied-salary percentage as an int, or NaN if it cannot be parsed."""
    satisfied_salary_section = soup.find(class_=CSS_CLASSES["satisfied_salary"])
    try:
        if not satisfied_salary_section:
            raise ValueError("Satisfied salary section not found in the HTML. Ensure that the page structure has not changed.")
        # The first whitespace-separated token is expected to look like "73%".
        return int(satisfied_salary_section.text.split()[0].strip().replace("%", ""))
    except (ValueError, IndexError) as e:
        if verbose:
            print(f"Failed to retrieve satisfied salary percentage: {e}")
        return np.nan


def _scrape_benefits(soup, verbose: bool) -> dict:
    """Return ``{benefit name: percentage}`` for every parsable item in the benefits section."""
    benefits = {}
    benefits_section = soup.find(class_=CSS_CLASSES["benefits"])
    if not benefits_section:
        if verbose:
            print("Failed to retrieve benefits data: Benefits section not found in the HTML. Ensure that the page structure has not changed.")
        return benefits

    benefit_items = benefits_section.find_all("li", class_=CSS_CLASSES["benefit_items"])
    if not benefit_items and verbose:
        print("Failed to retrieve benefits data: No benefit items found in the benefits section. Verify if benefits are listed properly or the HTML class has been updated.")

    for benefit_item in benefit_items:
        try:
            benefit_name = benefit_item.find("span", class_=CSS_CLASSES["benefit_title"]).text.strip()
            benefit_percentage = (
                benefit_item.find("div", class_=CSS_CLASSES["benefit_percentage"])
                .find("span", class_="legend-regular")
                .text.strip()
                .replace("%", "")
            )
            benefits[benefit_name] = int(benefit_percentage)
        except (AttributeError, ValueError) as inner_e:
            # AttributeError: an expected tag is missing; ValueError: the
            # percentage text is not a number. Skip just this item.
            if verbose:
                print(f"Failed to extract name or percentage for a benefit item: {inner_e}")
    return benefits


def main_page_scrape(url: str, verbose: bool = True) -> dict:
    """Scrape a company's main Kununu page into a flat dictionary.

    Information collected:
    - ``url``: the URL that was scraped
    - ``uuid``: the first ``"uuid":"..."`` value found in the page source, or ``None``
    - ``total_reviews_num``, ``salaries_posted_num``, ``corporate_culture_review_num``:
      the parenthesised counts shown in the tab bar, in that order
    - ``satisfied_salary_pct``: percentage shown in the satisfied-salary section
    - one key per benefit (benefit name -> percentage)
    - one key per rating category (category name -> score)

    Args:
        url: Company profile URL.
        verbose: Print a message whenever a section cannot be parsed. This
            flag only controls printing; the same data is collected either way.

    Returns:
        Dict of scraped values. Counts and the salary percentage are NaN when
        they cannot be retrieved.
    """
    result_dict = {}
    soup = soup_from_url(url)

    result_dict["url"] = url

    # Serialise the document once; the uuid is embedded in the page source.
    page_source = str(soup)
    uuid_marker = '"uuid":"'
    result_dict["uuid"] = page_source.split(uuid_marker)[1].split('"')[0] if uuid_marker in page_source else None

    # Tab labels carry counts such as "(1.234)"; "." is the thousands
    # separator. Only numeric parentheses are matched so that a textual
    # parenthesis cannot break int().
    tabs = soup.find(class_=CSS_CLASSES["tabs"])
    num_revs_ls = [int(x.replace(".", "")) for x in re.findall(r"\((\d[\d.]*)\)", tabs.text)] if tabs else []
    if num_revs_ls:
        # The first count is taken as the total number of reviews, matching
        # the order used for the other two counts.
        result_dict["total_reviews_num"] = num_revs_ls[0]
        result_dict["salaries_posted_num"] = num_revs_ls[1] if len(num_revs_ls) > 1 else np.nan
        result_dict["corporate_culture_review_num"] = num_revs_ls[2] if len(num_revs_ls) > 2 else np.nan
    else:
        if verbose:
            print("Failed to retrieve number of reviews, salaries posted, or corporate culture reviews.")
        result_dict["total_reviews_num"] = np.nan
        result_dict["salaries_posted_num"] = np.nan
        result_dict["corporate_culture_review_num"] = np.nan

    result_dict["satisfied_salary_pct"] = _scrape_satisfied_salary_pct(soup, verbose)
    result_dict.update(_scrape_benefits(soup, verbose))

    # Each factor-score element's parent text starts with a three-character
    # score such as "4,2", followed directly by the category name.
    ratings_raw = [x.parent.text for x in soup.find_all(class_=re.compile(CSS_CLASSES["factor_score"]))]
    categories = [rating[3:] for rating in ratings_raw]
    ratings = [float(rating[:3].replace(",", ".")) for rating in ratings_raw]
    result_dict.update(dict(zip(categories, ratings)))

    return result_dict

def get_applicant_info(url: str, verbose: bool = True) -> dict:
    """Scrape applicant review counts and scores from the ``/bewerbung`` page.

    One request is made for all applicants and one per application outcome
    ("hired", "rejected", "offerDeclined", "deferred"), selected through the
    ``?result=`` query parameter.

    Args:
        url: Company profile URL (without a trailing path).
        verbose: Print a message when an outcome cannot be parsed.

    Returns:
        Dict with ``<outcome>_review_num`` and ``<outcome>_review_score`` for
        ``all_applicants`` and each outcome. Both values are NaN when either
        one cannot be parsed.
    """
    application_outcomes = ["hired", "rejected", "offerDeclined", "deferred"]
    reviews_by_applicants = {}

    for outcome in ["all_applicants"] + application_outcomes:
        query = "" if outcome == "all_applicants" else f"?result={outcome}"
        soup = soup_from_url(f"{url}/bewerbung{query}")
        try:
            count_text = soup.find(class_=CSS_CLASSES["total_reviews"]).text.split(" ")[0]
            # "." is the thousands separator ("1.234"); strip it before int(),
            # as the employee-review scraper does.
            review_num = int(count_text.replace(".", ""))
            review_score = float(soup.find(class_=CSS_CLASSES["aggregation"]).text[:3].replace(",", "."))
        except Exception as e:
            # Best effort: a missing or changed page section must not abort the scrape.
            if verbose:
                print(f"Failed to retrieve review number or score for outcome '{outcome}': {e}")
            review_num = np.nan
            review_score = np.nan

        reviews_by_applicants[f"{outcome}_review_num"] = review_num
        reviews_by_applicants[f"{outcome}_review_score"] = review_score

    return reviews_by_applicants

def get_employee_info(url: str, verbose: bool = True) -> dict:
    """Scrape employee review statistics from the ``/kommentare`` page.

    Args:
        url: Company profile URL (without a trailing path).
        verbose: Print a message when a value cannot be parsed.

    Returns:
        Dict with the per-category review counts (``sehr_gut_reviews``,
        ``gut_reviews``, ``befriedigend_reviews``, ``genuegend_reviews``) and
        the headline values ``employees_review_num``,
        ``employee_review_score`` and ``employee_rec_score``. Values that
        cannot be retrieved are NaN.
    """
    reviews_by_employees = {
        "sehr_gut_reviews": np.nan,
        "gut_reviews": np.nan,
        "befriedigend_reviews": np.nan,
        "genuegend_reviews": np.nan,
    }
    # German category label shown on the page -> output key.
    category_keys = {
        "Sehr gut": "sehr_gut_reviews",
        "Gut": "gut_reviews",
        "Befriedigend": "befriedigend_reviews",
        "Genügend": "genuegend_reviews",
    }
    soup = soup_from_url(f"{url}/kommentare")

    # Headline metrics: all three are reset to NaN if any of them fails.
    try:
        reviews_by_employees["employees_review_num"] = int(soup.find(class_=CSS_CLASSES["total_reviews"]).text.split(" ")[0].replace(".", ""))
        reviews_by_employees["employee_review_score"] = float(soup.find(class_=CSS_CLASSES["employee_score"]).text[:3].replace(",", "."))
        reviews_by_employees["employee_rec_score"] = int(soup.find(class_=CSS_CLASSES["employee_recommendation"]).text.split("%")[0])
    except Exception as e:
        if verbose:
            print(f"Failed to retrieve employee review information: {e}")
        reviews_by_employees["employees_review_num"] = np.nan
        reviews_by_employees["employee_review_score"] = np.nan
        reviews_by_employees["employee_rec_score"] = np.nan

    # Per-category counts are parsed separately so that one unparsable button
    # cannot wipe out the headline metrics scraped above.
    for button in soup.find_all("button", {"aria-label": "Reviews score detail"}):
        try:
            category_name_german = button.find("span", class_="index__category__fvg57").text.strip()
            review_count_text = button.find("span", class_="index__totalReviews__6pCSR").text.split(" ")[0]
            review_count = int(review_count_text.replace(".", "").replace(",", ""))
        except (AttributeError, ValueError) as e:
            # AttributeError: expected span missing; ValueError: count is not numeric.
            if verbose:
                print(f"Failed to retrieve employee review information: {e}")
            continue

        output_key = category_keys.get(category_name_german)
        if output_key is not None:
            reviews_by_employees[output_key] = review_count

    return reviews_by_employees

def get_all_info(url: str, verbose: bool = True) -> pd.DataFrame:
    """Scrape every page for one company and return the result as a one-row DataFrame.

    Combines the main page, applicant reviews, employee reviews and the stats
    endpoint (looked up through the uuid found on the main page). Later
    sources overwrite earlier ones on key collisions.

    Args:
        url: Company profile URL.
        verbose: Forwarded to every scraper to control diagnostic printing.

    Returns:
        DataFrame with a single row. Column names are lower-cased, spaces and
        "-" become "_", "/" is removed, "ä" becomes "a", and the result is
        renamed through ``column_name_mapping``.
    """
    result_dict = main_page_scrape(url, verbose=verbose)
    result_dict.update(get_applicant_info(url, verbose=verbose))
    result_dict.update(get_employee_info(url, verbose=verbose))
    # Forward verbose here too so that one flag controls all four scrapers.
    result_dict.update(get_stats_page(result_dict["uuid"], verbose=verbose))

    df = pd.DataFrame([result_dict])
    df.columns = [
        column.replace(" ", "_").replace("-", "_").replace("/", "").replace("ä", "a").lower()
        for column in df.columns
    ]
    return df.rename(columns=column_name_mapping)


In [None]:
# Scraping all data in parallel and saving to csv
# Links are processed in batches of `window_size`; each batch is written to
# its own CSV so that an interrupted run keeps the batches already finished.
window_size = 5
batch_paths = []
for batch_num, start in enumerate(range(0, len(all_kununu_links), window_size), start=1):
    batch_links = all_kununu_links[start:start + window_size]

    # More threads than links would only sit idle. The context manager
    # releases the pool even if a scrape raises.
    with ThreadPool(min(CONCURRENCY, len(batch_links))) as pool:
        batch_frames = pool.map(get_all_info, batch_links)

    # pool.map returns a list of one-row DataFrames; combine before saving.
    df = pd.concat(batch_frames, ignore_index=True)
    batch_path = f"{pwd}/data/kununu_data_{batch_num}.csv"
    df.to_csv(batch_path, index=False)
    batch_paths.append(batch_path)
    print(f"Saved results to data/kununu_data_{batch_num}.csv")

# Consolidate all results
if batch_paths:
    df = pd.concat([pd.read_csv(path) for path in batch_paths], ignore_index=True)
    df.to_csv(f"{pwd}/data/kununu_data.csv", index=False)
    print("Saved results to data/kununu_data.csv")
else:
    print("No links to scrape; nothing was saved.")