# Scraping all Kununu Websites for German Firms

Information to scrape:

- Firm name
- Firm uuid
- Number of views
- Overall rating
- Percentage of people who would recommend the firm
- Total number of reviews
- Number of salaries posted
- Number of corporate culture reviews
- Ratings for each category
- Number of reviews and scores by applicants
- Number of reviews and scores by employees

These require scraping four different pages:
- Main Page: https://www.kununu.com/de/[company name] 
- Total Views: https://www.kununu.com/middlewares/profiles/[company uuid]/statistics 
- Applicant Reviews: https://www.kununu.com/de/[company name]/bewerbung
- Employee Reviews: https://www.kununu.com/de/[company name]/kommentare

**Important Note**: This code works as of July 12th, 2024. Kununu may change their website structure, which would require updating the code.
In particular, it is likely that the CLASS_IDS dictionary will need to be updated.

In [2]:
# !pip install requests beautifulsoup4 pandas numpy python-dotenv

from urllib.parse import urlencode
from multiprocessing.dummy import Pool as ThreadPool
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from utils import *
from dotenv import load_dotenv
import os

# load_dotenv() # make sure to have a .env file that defines the variable 'SCRAPINGBEE_API_KEY' if using scrapingbee

In [14]:
# Size of the thread pool used when scraping (number of simultaneous requests).
CONCURRENCY = 10

# CSS class strings used to locate each piece of information in the Kununu HTML.
# "factor_score" is a regular-expression prefix (it is passed to re.compile);
# every other entry is passed verbatim as a class attribute value.
CLASS_IDS = dict(
    firm_name="index__title__0q4vx h3-semibold",
    percent_recommend="index__value__o0UJI h2 h3-semibold-tablet",
    overall_rating="index__value__ApL+4 h2 h3-semibold-tablet",
    tabs="index__tabs__lGVpv",
    factor_score="^index__factorScore",
    total_reviews="index__totalReviews__aUzS6 p-small-semibold",
    aggregation="index__aggregation__NhXCC index__center__K0n3a",
    employee_score="h3-semibold index__score__BktQY",
    employee_recommendation="index__recommendation__LS0nx",
)

# importing all Kununu links (one company profile URL per line)
pwd = os.getcwd()
with open(os.path.join(pwd, "data", "all_kununu_company_profile_links.txt"), "r", encoding="utf-8") as file:
    FileContent = file.read()
# Drop blank lines and surrounding whitespace: a trailing newline in the file
# would otherwise produce an empty "URL" that gets sent to the scraper.
all_kununu_links = [link.strip() for link in FileContent.splitlines() if link.strip()]

In [5]:
def _class_text(soup, class_name):
    """Return the text of the first element matching ``class_name``, or None if there is none."""
    element = soup.find(class_=class_name)
    return element.text if element is not None else None


def _number_or_nan(text, cast):
    """Return ``cast(text)``; NaN when ``text`` is None or ``cast`` rejects it with ValueError."""
    if text is None:
        return np.nan
    try:
        return cast(text)
    except ValueError:
        return np.nan


def main_page_scrape(url: str) -> dict:
    """
    Scrape a company's main Kununu page and return the results as a dictionary.

    Keys written, in this order:
    - "firm_name": firm name (None if not found)
    - "url": the url that was passed in
    - "uuid": firm uuid taken from the page source (None if not found)
    - "views_num": number of views, fetched from the statistics endpoint
    - "percent_recommend_overall": percentage of people who would recommend the firm
    - "overall_rating": overall rating
    - "total_reviews_num", "salaries_posted_num", "corporate_culture_review_num":
      the first, second and third parenthesised counts found in the tab bar
    - one key per rating category, named after the category text on the page

    Any numeric value that is missing or cannot be parsed is stored as NaN, so a
    single malformed page does not abort a whole scraping batch.
    """
    result_dict = {}
    soup = soup_from_url(url)
    page_source = str(soup)  # serialise once; only needed for the uuid lookup

    firm_name = _class_text(soup, CLASS_IDS["firm_name"])
    result_dict["firm_name"] = firm_name.replace(",", ".").replace('\xa0', ' ') if firm_name is not None else None
    result_dict["url"] = url
    result_dict["uuid"] = page_source.split('"uuid":"')[1].split('"')[0] if '"uuid":"' in page_source else None

    # Views live on a separate endpoint keyed by uuid; without a uuid there is nothing to ask for.
    result_dict["views_num"] = np.nan
    if result_dict["uuid"] is not None:
        try:
            # The timeout keeps one unresponsive request from blocking a worker thread forever.
            num_views = requests.get(
                f"https://www.kununu.com/middlewares/profiles/{result_dict['uuid']}/statistics",
                timeout=30,
            ).text
            result_dict["views_num"] = int(num_views.split('"totalViews":')[1].split(',')[0])
        except (requests.RequestException, IndexError, ValueError):
            # Network failure, or a response without a parseable "totalViews" field.
            result_dict["views_num"] = np.nan

    percent_text = _class_text(soup, CLASS_IDS["percent_recommend"])
    if percent_text is not None:
        percent_text = percent_text.replace(".", "").replace(",", "").replace("%", "")
    result_dict["percent_recommend_overall"] = _number_or_nan(percent_text, int)

    rating_text = _class_text(soup, CLASS_IDS["overall_rating"])
    if rating_text is not None:
        rating_text = rating_text.replace(",", ".")  # decimal comma -> decimal point
    result_dict["overall_rating"] = _number_or_nan(rating_text, float)

    # The tab bar carries up to three counts in parentheses: reviews, salaries, culture.
    tabs_text = _class_text(soup, CLASS_IDS["tabs"])
    try:
        num_revs_ls = [
            int(x.replace("(", "").replace(")", "").replace(".", ""))
            for x in re.findall(r'\(.*?\)', str(tabs_text))
        ] if tabs_text is not None else []
    except ValueError:
        num_revs_ls = []
    # Pad with NaN so that any count missing from the tab bar is reported as NaN.
    padded_counts = (num_revs_ls + [np.nan] * 3)[:3]
    result_dict["total_reviews_num"] = padded_counts[0]
    result_dict["salaries_posted_num"] = padded_counts[1]
    result_dict["corporate_culture_review_num"] = padded_counts[2]

    # The parent text of each factor-score element is parsed as a three-character
    # score followed by the category name.
    for score_element in soup.find_all(class_=re.compile(CLASS_IDS["factor_score"])):
        rating_raw = score_element.parent.text
        try:
            score = float(rating_raw[:3].replace(",", "."))
        except ValueError:
            # Not in the expected "<score><category>" shape; skip rather than abort.
            continue
        result_dict[rating_raw[3:]] = score

    return result_dict

def get_applicant_info(url: str) -> dict:
    """
    Scrape the number of applicant reviews and their score for a company.

    The ``{url}/bewerbung`` page is fetched once without a filter (reported under
    the prefix "all_applicants") and once per application outcome ("hired",
    "rejected", "offerDeclined", "deferred"), selected via ``?result=<outcome>``.

    Returns a dict with the keys "<prefix>_review_num" and "<prefix>_review_score"
    for every prefix. If either value of a pair cannot be read, both are NaN.
    """
    application_outcomes = ["hired", "rejected", "offerDeclined", "deferred"]
    reviews_by_applicants = {}

    for outcome in ["all_applicants"] + application_outcomes:
        query = "" if outcome == "all_applicants" else "?result=" + outcome
        soup = soup_from_url(f"{url}/bewerbung{query}")
        try:
            # Strip "." before int(), as the other count parsers in this file do;
            # otherwise a count such as "1.234" fails to parse and is lost as NaN.
            review_num = int(soup.find(class_=CLASS_IDS["total_reviews"]).text.split(" ")[0].replace(".", ""))
            review_score = float(soup.find(class_=CLASS_IDS["aggregation"]).text[:3].replace(",", "."))
        except (AttributeError, TypeError, ValueError):
            # Element missing from the page, or its text is not a number.
            review_num = np.nan
            review_score = np.nan
        reviews_by_applicants[f"{outcome}_review_num"] = review_num
        reviews_by_applicants[f"{outcome}_review_score"] = review_score

    return reviews_by_applicants

def get_employee_info(url: str) -> dict:
    """
    Scrape employee review statistics from ``{url}/kommentare``.

    Returns a dict with three keys:
    - "employees_review_num": number of reviews by employees
    - "employee_review_score": review score given by employees
    - "employee_rec_score": percent of employees that would recommend the company

    The three values are read together: if any one of them is missing from the
    page or cannot be parsed, all three are returned as NaN.
    """
    soup = soup_from_url(f"{url}/kommentare")

    try:
        review_num = int(soup.find(class_=CLASS_IDS["total_reviews"]).text.split(" ")[0].replace(".", ""))
        review_score = float(soup.find(class_=CLASS_IDS["employee_score"]).text[:3].replace(",", "."))
        rec_score = int(soup.find(class_=CLASS_IDS["employee_recommendation"]).text.split("%")[0])
    except (AttributeError, TypeError, ValueError):
        # Element missing from the page, or its text is not a number. Only these
        # failures are handled so that KeyboardInterrupt/SystemExit still propagate.
        review_num = review_score = rec_score = np.nan

    return {
        "employees_review_num": review_num,
        "employee_review_score": review_score,
        "employee_rec_score": rec_score,
    }

def get_all_info(url: str) -> dict:
    """
    Scrape everything available for one company and return it as a single dict.

    Combines, in this order, the results of:
    - main_page_scrape(url): firm name, uuid, views, overall rating, recommendation
      percentage, review / salary / corporate-culture counts, per-category ratings
    - get_applicant_info(url): number of reviews and scores by applicants
    - get_employee_info(url): number of reviews and scores by employees

    When two sources produce the same key, the later source wins.
    """
    return {
        **main_page_scrape(url),
        **get_applicant_info(url),
        **get_employee_info(url),
    }

In [6]:
# Maps the normalised scraped column names (lower-cased, umlauts and separators
# replaced) to the final "kn_"-prefixed column names written to the CSV files.
# Every target name must be unique: two source columns renamed to the same
# target would produce duplicate columns in the DataFrame and the CSV output.
column_name_mapping = {
    'firm_name': 'firm_name',
    'url': 'kn_url',
    'uuid': 'uuid',
    'views_num': 'kn_views_num',
    # Main-page recommendation percentage. It previously shared the target name
    # 'kn_employee_rec_score' with 'employee_rec_score' below, creating a
    # duplicate column.
    'percent_recommend_overall': 'kn_percent_recommend_overall',
    'overall_rating': 'kn_overall',
    'total_reviews_num': 'kn_total_reviews_num',
    'salaries_posted_num': 'kn_salaries_posted_num',
    'gehaltsozialleistungen': 'kn_salary_benefits',
    'image': 'kn_image',
    'karriereweiterbildung': 'kn_career_development',
    'arbeitsatmosphare': 'kn_work_atmosphere',
    'kommunikation': 'kn_communication',
    'kollegenzusammenhalt': 'kn_colleague_cohesion',
    'work_life_balance': 'kn_work_life_balance',
    'vorgesetztenverhalten': 'kn_superiors_behavior',
    'interessante aufgaben': 'kn_interesting_tasks',
    'arbeitsbedingungen': 'kn_working_conditions',
    'umwelt_sozialbewusstsein': 'kn_environment_social_awareness',
    'gleichberechtigung': 'kn_equal_rights',
    'umgang mit alteren kollegen': 'kn_dealing_with_older_colleagues',
    'all_applicants_review_num': 'kn_all_applicants_review_num',
    'all_applicants_review_score': 'kn_all_applicants_review_score',
    'hired_review_num': 'kn_hired_review_num',
    'hired_review_score': 'kn_hired_score',
    'rejected_review_num': 'kn_rejected_review_num',
    'rejected_review_score': 'kn_rejected_score',
    'offerdeclined_review_num': 'kn_offer_declined_review_num',
    'offerdeclined_review_score': 'kn_offer_declined_score',
    'deferred_review_num': 'kn_deferred_review_num',
    'deferred_review_score': 'kn_deferred_score',
    'employees_review_num': 'kn_employees_review_num',
    'employee_review_score': 'kn_employee_review_score',
    # Recommendation percentage scraped from the employee reviews page.
    'employee_rec_score': 'kn_employee_rec_score',
    'corporate_culture_review_num': 'kn_corporate_culture_review_num',
}

In [None]:
# Scraping all data in parallel and saving to csv
# Links are processed in batches of `window_size`; each batch is written to its
# own CSV so that a failure late in the run does not lose earlier results.
window_size = 5000
concurrency = CONCURRENCY
# Stepping by window_size yields exactly the non-empty batches. The previous
# `len // window_size + 1` count added an empty trailing batch whenever the
# number of links was a multiple of window_size (including zero), and the
# resulting empty CSV made the consolidation step below fail.
batch_starts = range(0, len(all_kununu_links), window_size)
for i, start in enumerate(batch_starts):
    # The context manager releases the worker threads even if a scrape raises.
    with ThreadPool(concurrency) as pool:
        ratings_list = pool.map(get_all_info, all_kununu_links[start:start + window_size])

    df = pd.DataFrame(ratings_list)
    # Normalise the scraped category names so they match the keys of
    # column_name_mapping: "/" is removed (e.g. "Gehalt/Sozialleistungen" ->
    # "gehaltsozialleistungen"), "-" becomes "_", "ä" becomes "a".
    df.columns = [s.replace("/", "").replace("-", "_").replace("ä", "a").lower() for s in df.columns]
    df.rename(columns=column_name_mapping, inplace=True)
    df.to_csv(f"{pwd}/data/kununu_data_{i+1}.csv", index=False)
    print(f"Saved results to data/kununu_data_{i+1}.csv")

# Consolidate all results
if batch_starts:
    df = pd.concat([pd.read_csv(f"{pwd}/data/kununu_data_{i+1}.csv") for i in range(len(batch_starts))], ignore_index=True)
    df.to_csv(f"{pwd}/data/all_scraped_kununu_data.csv", index=False)
    print("Saved results to data/all_scraped_kununu_data.csv")
else:
    # pd.concat raises on an empty list, so there is nothing to consolidate.
    print("No Kununu links to scrape; nothing was saved.")