# import

In [None]:
# --- IMPORT LIBRARY ---
import pandas as pd
import time
import re
import random
import requests
import time
from tqdm import tqdm
from datetime import datetime
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# scrape company list from glints

In [None]:
# === STEP 1: SCRAPE COMPANY LIST FROM GLINTS ===
# Column order of the company table; also used to keep the schema when nothing
# could be scraped.
COMPANY_COLUMNS = ['Name', 'Location', 'Industry', 'Full URL']
COMPANY_CARD_SELECTOR = 'a[aria-label^="Company card"]'


def _card_text(card, selector):
    """Return the text of the first element inside *card* matching *selector*, or ''."""
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        # Best-effort: a card lacking this field must not abort the whole page.
        return ''


def _card_profile_link(card):
    """Return the first non-empty ``href`` of a ``div[href]`` inside *card*, or ''."""
    try:
        for div in card.find_elements(By.CSS_SELECTOR, 'div[href]'):
            href = div.get_attribute('href')
            if href:
                return href
    except Exception:
        return ''
    return ''


options = Options()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)

company_data = []

try:
    # NOTE: range(1, 2) visits page 1 only; widen the range to scrape more pages.
    for page in range(1, 2):
        try:
            print(f"Scraping page {page}...")
            url = f'https://glints.com/id/companies?countries=ID&page={page}'
            driver.get(url)

            # Wait until at least one company card is present before reading the page.
            WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, COMPANY_CARD_SELECTOR))
            )

            for comp in driver.find_elements(By.CSS_SELECTOR, COMPANY_CARD_SELECTOR):
                company_data.append({
                    'Name': _card_text(comp, 'p[aria-label$="name"]'),
                    'Location': _card_text(comp, 'p[class*="LocationName"]'),
                    'Industry': _card_text(comp, 'p[class*="IndustryName"]'),
                    'Full URL': _card_profile_link(comp),
                })

            time.sleep(2)

        except Exception as e:
            print(f"Gagal scrape page {page}: {e}")
            continue
finally:
    # Always release the browser, even when the run is interrupted.
    driver.quit()

# Passing the columns explicitly keeps df['Name'] valid when no card was scraped.
df = pd.DataFrame(company_data, columns=COMPANY_COLUMNS)

# === STEP 2: CLEAN EMPTY NAMES ===
df = df[df['Name'] != ''].reset_index(drop=True)
df.to_csv('glints_company_list.csv', index=False)


In [None]:
def _to_absolute_glints_url(link):
    """Return *link* as an absolute glints.com URL, or None when there is no link.

    Links that are already absolute are returned unchanged, so re-running this
    cell does not prepend the host a second time.
    """
    if not isinstance(link, str) or not link.strip():
        return None
    if link.startswith(('http://', 'https://')):
        return link
    return 'https://glints.com' + link


df['Full URL'] = df['Full URL'].map(_to_absolute_glints_url)


In [None]:
# Preview the first rows of the company table.
df.head()

# Address lookup

In [None]:
# === STEP 3: SCRAPE ADDRESS FROM COMPANY PROFILE ===
def scrape_address(url):
    """Return the address shown on a company profile page, or "" if none is found.

    A fresh Chrome instance is started for every call. The page's ``<p>``
    elements are scanned in order; the text of the paragraph that directly
    follows the first one containing "Alamat" is returned.

    Args:
        url: Profile page URL to open.

    Returns:
        The address text, or "" when no "Alamat" label is followed by another
        paragraph or when any error occurs (the error is printed).
    """
    driver = None
    try:
        # Empty chromedriver path as found in the notebook (presumably redacted).
        service = Service(r"")
        options = Options()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--start-minimized')
        options.add_argument("--window-position=-32000,-32000")
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "p"))
        )

        paragraphs = driver.find_elements(By.TAG_NAME, "p")
        # Pair every paragraph with its successor: label -> value.
        for label, value in zip(paragraphs, paragraphs[1:]):
            if "Alamat" in label.text:
                return value.text
        return ""
    except Exception as e:
        print(f"Error on {url}: {e}")
        return ""
    finally:
        # Close the browser on every path; previously it leaked on errors.
        if driver is not None:
            driver.quit()



In [None]:
# Create the Address column only when it does not exist yet, so addresses
# collected by an earlier (possibly interrupted) run are kept and the loop below
# only fetches the ones that are still missing.
if "Address" not in df.columns:
    df["Address"] = ""

for i, row in df.iterrows():
    current_address = row['Address']
    needs_address = pd.isna(current_address) or str(current_address).strip() == ""
    if pd.notna(row['Full URL']) and needs_address:
        print(f"Scraping address for: {row['Name']}")
        df.at[i, 'Address'] = scrape_address(row['Full URL'])

print("Alamat selesai dicari.")


In [None]:
# Preview the table after the Address column has been filled.
df.head()

# pencarian sosmed

In [None]:
# === STEP 4: SCRAPE SOCIAL LINKS ===
def scrape_social_links(url):
    """Collect website and social-media links from a company profile page.

    A fresh Chrome instance is started for every call. Each ``<a>`` element is
    classified by its visible text (lower-cased); when several anchors match the
    same category, the last one wins.

    Args:
        url: Profile page URL to open.

    Returns:
        A dict with the keys "Website", "Instagram", "LinkedIn" and "X". A value
        is the anchor's href, or None when no such link was found. On any error
        the message is printed and every value is None, so failed rows read as
        missing data instead of the literal text "NaN".
    """
    social_links = {
        "Website": None,
        "Instagram": None,
        "LinkedIn": None,
        "X": None
    }
    driver = None
    try:
        # Empty chromedriver path as found in the notebook (presumably redacted).
        service = Service(r"")
        options = Options()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument("--window-position=-32000,-32000")
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))

        for a in driver.find_elements(By.TAG_NAME, "a"):
            label = a.text.strip().lower()
            href = a.get_attribute("href")

            if "website" in label:
                social_links["Website"] = href
            elif "instagram" in label:
                social_links["Instagram"] = href
            elif "linkedin" in label:
                social_links["LinkedIn"] = href
            elif label in ["x", "twitter"]:
                social_links["X"] = href

        return social_links

    except Exception as e:
        print(f"Error on {url}: {e}")
        # Discard partial results, as before, but report them as missing (None).
        return dict.fromkeys(social_links)
    finally:
        # Close the browser on every path; previously it leaked on errors.
        if driver is not None:
            driver.quit()


In [None]:
# Start every social column empty, then fill it row by row from the profile page.
_social_columns = ("Website", "Instagram", "LinkedIn", "X")
for _column in _social_columns:
    df[_column] = ""

for idx, profile_url in df["Full URL"].items():
    if pd.isna(profile_url):
        continue  # no profile page to visit
    links = scrape_social_links(profile_url)
    for _column in _social_columns:
        df.at[idx, _column] = links[_column]

df.to_csv("glints_companies_with_socials.csv", index=False)

In [None]:
# Preview the table after the social-link columns have been filled.
df.head()

# pencarian kontak

## Phone number lookup on YellowPages

Find each company's profile page on the YellowPages website.

In [None]:
# === STEP 5: YELLOWPAGES - GET PROFILE LINK ===
def get_yellowpages_link(nama_perusahaan, max_retries=3):
    """Search yellowpages.id for a company and return its profile link.

    The result cards are compared with the requested name using
    ``SequenceMatcher``; the first card whose title is more than 80% similar
    (case-insensitive) and contains a link supplies the return value.

    Args:
        nama_perusahaan: Company name to search for.
        max_retries: Maximum number of requests. Only failed requests (network
            error or non-200 status) are retried; a page that was fetched
            successfully but holds no match is not requested again.

    Returns:
        The ``href`` of the matching card title, or None when the name is
        missing/blank, nothing matches, or every attempt fails.
    """
    # A missing or blank name cannot match anything: skip the wait and the request.
    if not isinstance(nama_perusahaan, str) or not nama_perusahaan.strip():
        return None

    try:
        nama_encoded = quote(nama_perusahaan)
        url = f"https://www.yellowpages.id/listing/places/?q={nama_encoded}"
        headers = {"User-Agent": "Mozilla/5.0"}
        target = nama_perusahaan.lower()

        for _ in range(max_retries):
            # Random pause before every request (presumably to avoid rate limiting).
            time.sleep(random.uniform(10, 20))

            try:
                response = requests.get(url, headers=headers, timeout=20)
            except requests.RequestException as e:
                # A transient network failure should use up one retry, not all of them.
                print(f"Error yellowpages: {e}")
                continue
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            for card in soup.find_all("div", class_="mdc-card mdc-card--outlined"):
                title_tag = card.find("h2", class_="card__title mdc-typography--headline6")
                if not title_tag:
                    continue
                nama_ditemukan = title_tag.text.strip()
                similarity = SequenceMatcher(None, target, nama_ditemukan.lower()).ratio()
                if similarity > 0.8:
                    link_tag = title_tag.find("a")
                    if link_tag:
                        return link_tag.get("href")

            # The page loaded fine; asking again would return the same cards.
            return None
        return None
    except Exception as e:
        print(f"Error yellowpages: {e}")
        return None


In [None]:
# Enable progress_apply on pandas objects, then look up a YellowPages link for
# every company name (get_yellowpages_link is called once per row).
tqdm.pandas() 
df["YellowPages Link"] = df["Name"].progress_apply(get_yellowpages_link)

In [None]:
# Preview the table after the YellowPages Link column has been added.
df.head()

### Get the phone number from each available YellowPages link

In [None]:
# === STEP 6: GET PHONE NUMBER FROM YELLOWPAGES PROFILE ===
# One browser is shared by all rows; the try/finally below guarantees it is
# closed even when the loop is interrupted.
service = Service(r"XXXXXXXXX")
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=service, options=options)

df["Phone Number"] = ""

try:
    for idx, row in df.iterrows():
        link = row['YellowPages Link']
        if pd.isna(link) or link.strip() == '':
            continue  # no YellowPages profile was found for this company

        try:
            driver.get(link)
            time.sleep(2)  # give the page time to render before reading its source
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Keep the first button whose data-number starts with +62.
            for button in soup.find_all('button', class_='phone'):
                number = button.get('data-number')
                if number and number.startswith('+62'):
                    df.at[idx, 'Phone Number'] = number
                    break

        except Exception as e:
            print(f"Error at index {idx}: {e}")
            continue
finally:
    driver.quit()


In [None]:
# Preview the table after the Phone Number column has been filled.
df.head()

## pencarian email

### pencarian email dari website company

In [None]:
# === STEP 7: SCRAPE EMAIL & PHONE FROM WEBSITE CONTACT PAGE ===
def extract_emails_from_website(url):
    """Download *url* and return the e-mail-like strings found in its HTML.

    Args:
        url: Page URL. Anything that is not an http(s) URL string (None, NaN,
            "", placeholder text) is treated as "no website".

    Returns:
        The distinct matches joined with ", " in sorted order, or None when the
        URL is unusable, the request fails, the status is not 200, or nothing
        matches.
    """
    # Skip missing or non-HTTP values up front instead of relying on a swallowed error.
    if not isinstance(url, str) or not url.strip().lower().startswith(("http://", "https://")):
        return None

    try:
        response = requests.get(url, timeout=5)
    except (requests.RequestException, ValueError):
        # ValueError also covers UnicodeError raised for malformed host names.
        return None

    if response.status_code != 200:
        return None

    emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', response.text)
    # Sort so the same page always produces the same string.
    return ', '.join(sorted(set(emails))) if emails else None

# Apply to the Website column: one request per row, result stored in Scraped_Email.
df["Scraped_Email"] = df["Website"].apply(extract_emails_from_website)


In [None]:
# Preview the table after the Scraped_Email column has been added.
df.head()

In [None]:
import pandas as pd

def filter_clean_email_list(email_str):
    """Keep only the plausible e-mail addresses from a comma-separated string.

    An entry is kept when, after trimming and lower-casing, it contains "@",
    ends with one of the accepted suffixes and contains none of the junk
    markers below.

    Args:
        email_str: Comma-separated candidates, or a missing value.

    Returns:
        The distinct surviving addresses joined with ", " in sorted order (so
        the result is reproducible), or None when the input is missing or
        nothing survives.
    """
    if pd.isna(email_str):
        return None

    allowed_suffixes = ('.com', '.co.id', '.id', '.org', '.net')
    # Library names, placeholders and "@<digit>" fragments (presumably
    # package@version strings picked up from scripts in the page source).
    junk_markers = (
        'sentry', 'your-email', 'example', '@3', '@4', '@5',
        'sweetalert', 'core-js', 'lodash', 'bootstrap', 'jquery', 'polyfill', 'react'
    )

    clean_emails = set()
    for email in email_str.split(','):
        email = email.strip().lower()
        if (
            '@' in email
            and email.endswith(allowed_suffixes)
            and not any(marker in email for marker in junk_markers)
        ):
            clean_emails.add(email)

    return ', '.join(sorted(clean_emails)) if clean_emails else None


In [None]:
# Derive Cleaned_Email by filtering each row's Scraped_Email string.
df['Cleaned_Email'] = df['Scraped_Email'].apply(filter_clean_email_list)


In [None]:
# Search with SerpAPI for a contact page on each company's own domain

API_KEY = "XXXXXX"
search_url = "https://serpapi.com/search"

# Make sure the column that stores the resulting URL exists
df["Scraped_URL"] = df.get("Scraped_URL", "")

# Only rows whose Cleaned_Email is still empty are searched
rows_without_email = df[df["Cleaned_Email"].isnull()]

for idx, row in rows_without_email.iterrows():
    website = row["Website"]
    if pd.isnull(website) or website.strip() == "":
        continue

    # Reduce the website to its host part: drop the scheme, then everything after the first "/"
    domain = website.replace("https://", "").replace("http://", "").split("/")[0]
    query = f"site:{domain} contact OR hubungi OR kontak OR call"
    params = {"q": query, "hl": "id", "api_key": API_KEY, "num": 5}

    try:
        response = requests.get(search_url, params=params, timeout=10)
        result_json = response.json()

        if "organic_results" not in result_json:
            print(f"[{idx}] {domain} ⚠️ Tidak ada hasil dari SerpAPI")
        else:
            result_links = (res.get("link", "") for res in result_json["organic_results"])
            # First result that points back at the company's own domain, if any
            link = next((candidate for candidate in result_links if domain in candidate), None)
            if link is None:
                print(f"[{idx}] {domain} ❌ Tidak ada hasil relevan")
            else:
                df.at[idx, "Scraped_URL"] = link
                print(f"[{idx}] {domain} ✅ {link}")

        time.sleep(1.5)  # pause so the API limit is not hit too quickly

    except Exception as e:
        print(f"[{idx}] ⚠️ Error: {e}")
        continue

# # Simpan hasilnya
# df.to_csv("scraped_contact_url_only.csv", index=False)


In [None]:
start_index = 0
# Make sure the Scraped_Email & Scraped_Phone columns exist
if "Scraped_Email" not in df.columns:
    df["Scraped_Email"] = ""
if "Scraped_Phone" not in df.columns:
    df["Scraped_Phone"] = ""

# Only visit rows at or after start_index that have a Scraped_URL but no
# cleaned e-mail yet (raise start_index to resume an interrupted run).
contact_df = df[
    (df.index >= start_index) &
    df["Scraped_URL"].notnull() &
    df["Cleaned_Email"].isnull() &
    (df["Scraped_URL"] != "")
]

# Loop invariants: request headers and compiled patterns.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Numbers starting with +62, 62 or 08. A look-behind replaces the leading \b:
# \b cannot match between a space and "+", so "+62..." used to lose its "+".
phone_pattern = re.compile(r"(?<!\w)(?:\+62|62|08)[0-9]{7,13}\b")

for idx, row in contact_df.iterrows():
    url = row["Scraped_URL"]

    try:
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator=" ", strip=True)

        emails = email_pattern.findall(text)
        phones = phone_pattern.findall(text)

        # Remove duplicates and store
        email_result = ", ".join(sorted(set(emails)))
        phone_result = ", ".join(sorted(set(phones)))

        df.at[idx, "Scraped_Email"] = email_result
        df.at[idx, "Scraped_Phone"] = phone_result

        print(f"[{idx}] ✅ Email: {email_result} | Phone: {phone_result}")
        time.sleep(1.5)

    except Exception as e:
        print(f"[{idx}] ⚠️ Gagal akses {url}: {e}")
        continue

In [None]:
# Preview the table after the contact-page scrape.
df.head()


In [None]:
def extract_valid_emails(text, existing_value=None):
    """Return the usable e-mail addresses found in *text*.

    Args:
        text: Raw text to search.
        existing_value: A previously cleaned value. When it is present and not
            blank it is returned unchanged and *text* is ignored.

    Returns:
        *existing_value* if it is already filled; otherwise the distinct
        lower-cased addresses that contain none of the unwanted fragments,
        joined with ", " in sorted order; None when *text* is missing, blank or
        yields no address.
    """
    already_filled = pd.notna(existing_value) and str(existing_value).strip() != ""
    if already_filled:
        return existing_value

    if pd.isna(text) or str(text).strip() == "":
        return None

    # Substrings that mark a match as a library name, placeholder or no-reply box
    unwanted_fragments = (
        'sentry', 'sweetalert', 'core-js', 'lodash', 'bootstrap', 'jquery',
        'polyfill', 'react', 'popper', 'carousel', 'slick', 'cdn', 'cloudflare',
        'placeholder', 'example', 'youremail', 'your-email', 'no-reply', 'noreply',
    )

    matches = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    kept = {
        match.lower()
        for match in matches
        if not any(fragment in match.lower() for fragment in unwanted_fragments)
    }

    if not kept:
        return None
    return ', '.join(sorted(kept))


In [None]:
# Fill Cleaned_Email from Scraped_Email; rows that already have a cleaned value
# are passed through unchanged by extract_valid_emails.
df["Cleaned_Email"] = df.apply(
    lambda row: extract_valid_emails(row["Scraped_Email"], row["Cleaned_Email"]),
    axis=1
)


In [None]:
# Preview the table after Cleaned_Email has been refreshed.
df.head()

## kontak email dan phone

In [None]:
# Rows with an index below start_index are skipped by the scraping loop.
start_index = 0

# Set up Selenium (headless Chrome shared by the whole loop)
service = Service(r"")
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=service, options=options)

# Sub-pages to try on each website, in order ('' is the home page)
contact_paths = [
    '', 'contact', 'contact-us', 'kontak', 'kontak-kami', 'hubungi', 
    'hubungi-kami', 'about', 'about-us', 'tentang', 'tentang-kami'
]

def extract_email_phone(text):
    """Extract e-mail addresses and phone-like numbers from *text*.

    Args:
        text: Plain text to search.

    Returns:
        A tuple ``(emails, phones)``. Each element is the distinct matches
        joined with ", " in sorted order, or "" when there is no match.
    """
    email_hits = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
    phone_hits = re.findall(r"\+?\d[\d\s\-().]{7,}\d", text)

    # The e-mail pattern allows "." and "-" at the end, so an address that
    # closes a sentence would keep the full stop; trim it.
    emails = {hit.rstrip(".-") for hit in email_hits}

    # Sort so the same page always produces the same strings.
    return ", ".join(sorted(emails)), ", ".join(sorted(set(phone_hits)))

# Scraping loop: for every company that still has no cleaned e-mail, try the
# candidate pages in contact_paths and stop at the first page that yields an
# e-mail or a phone number.
# Textual forms a missing website can take after str(): '', 'nan', 'NaN', 'None'.
missing_website_markers = {'', 'nan', 'none'}

try:
    for index, row in df.iterrows():
        if index < start_index:
            continue  # skip until the requested start index

        if pd.notna(row['Cleaned_Email']):
            continue  # skip rows that already have an e-mail

        website = row['Website']
        if pd.isna(website):
            continue
        base_url = str(website).strip()
        if base_url.lower() in missing_website_markers:
            continue

        found_email = ""
        found_phone = ""

        for path in contact_paths:
            url = base_url.rstrip('/') + '/' + path.lstrip('/')
            print(f"[{index}] Scraping: {url}")

            try:
                driver.get(url)
                time.sleep(3)  # give the page time to render
                page_text = driver.find_element(By.TAG_NAME, "body").text
                email, phone = extract_email_phone(page_text)

                if email:
                    found_email = email
                if phone:
                    found_phone = phone

                if found_email or found_phone:
                    break  # one successful page is enough

            except Exception as e:
                print(f"[{index}] Skip {url}: {e}")
                continue

        df.at[index, 'Scraped_Email'] = found_email
        df.at[index, 'Scraped_Phone'] = found_phone

        # df.to_excel("scraped_autosave.xlsx", index=False)
finally:
    # Done (or interrupted): always release the browser.
    driver.quit()

In [None]:
# Refresh Cleaned_Email from the newly scraped Scraped_Email values; rows that
# already have a cleaned value are passed through unchanged.
df.loc[:, "Cleaned_Email"] = df.apply(
    lambda row: extract_valid_emails(row["Scraped_Email"], row["Cleaned_Email"]),
    axis=1
)


In [None]:
# Show column dtypes and non-null counts of the company table.
df.info()

In [None]:
# Preview the first 50 rows of the company table.
df.head(50)

In [None]:
# Save the company table as CSV (without the index column).
df.to_csv("company_df.csv", index=False)

In [89]:
# Save the same table as an Excel workbook (without the index column).
df.to_excel("company_df.xlsx", index=False)


# pencarian job search

In [None]:
# List the columns available before the job scrape.
df.columns

In [None]:

def scrape_jobs(name, url):
    """Scrape the job cards listed on a company profile page.

    A fresh Chrome instance is started for every call and is always closed
    before returning.

    Args:
        name: Company name, copied into every returned record.
        url: Company profile URL to open.

    Returns:
        A list of dicts with the keys "Company", "Job Title", "Job Link" and
        "Location", with one entry per distinct job link (the first occurrence
        is kept). An empty list is returned when the page cannot be opened or
        the "jobs-section" element does not appear within 10 seconds.
    """
    driver = None
    try:
        service = Service(r"XXXXXXXXXXXXXXX")

        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-position=10000,10000')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "jobs-section"))
        )

        time.sleep(2)  # give the page's JavaScript time to render

        # Keyed by job link: insertion order is kept and setdefault ignores
        # repeats, which removes duplicate cards.
        jobs_by_link = {}

        for job in driver.find_elements(By.CSS_SELECTOR, '[aria-label^="Job:"]'):
            try:
                title_elem = job.find_element(By.CSS_SELECTOR, 'h2 a')
                title = title_elem.text.strip()
                link = title_elem.get_attribute('href')

                location_parts = job.find_elements(
                    By.CSS_SELECTOR, '.CardJobLocation__LocationSpan-sc-v7ofa9-1 a'
                )
                location = ", ".join([part.text.strip() for part in location_parts])

                jobs_by_link.setdefault(link, {
                    "Company": name,
                    "Job Title": title,
                    "Job Link": link,
                    "Location": location
                })
            except Exception as e:
                print(f"Gagal parsing salah satu job di {url}: {e}")

        return list(jobs_by_link.values())

    except Exception as e:
        print(f"Error saat membuka {url}: {e}")
        return []
    finally:
        # Close the browser on every path; previously each timeout leaked one.
        if driver is not None:
            driver.quit()

In [None]:
# Visit every company profile and gather its job cards into one flat list.
all_jobs = []

company_pairs = zip(df['Name'], df['Full URL'])
for company_name, profile_url in tqdm(company_pairs, total=len(df)):
    all_jobs += scrape_jobs(company_name, profile_url)


In [None]:
# Name the columns explicitly so jobs_df still has a 'Job Link' column (used by
# the detail scrape and the merge) when no job was collected.
jobs_df = pd.DataFrame(all_jobs, columns=["Company", "Job Title", "Job Link", "Location"])

In [None]:
# Preview the first rows of the scraped job list.
jobs_df.head()

# detail job search

In [None]:


def scrape_job_detail(job_url):
    """Scrape the detail fields of a single job posting page.

    A fresh Chrome instance is started for every call and is always closed
    before returning. Fields whose element is missing are returned as "".

    Args:
        job_url: URL of the job posting.

    Returns:
        A dict with the keys "Job Category", "Job Function", "Employment Type",
        "Work Place", "Min Education", "Min Experience", "Salary Range",
        "Job Posted", "Last Updated", "Persyaratan", "Skills", "Deskripsi" and
        "Timestamp_Scraped"; an empty dict when the page cannot be scraped
        (the error is printed).
    """
    driver = None
    try:
        # Empty chromedriver path as found in the notebook (presumably redacted).
        service = Service(r"")
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-position=10000,10000')  # like invisible
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(job_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "__next"))
        )
        time.sleep(2)

        def safe_find(selector, multiple=False):
            """Return the matching element(s); "" (or []) when the lookup fails."""
            try:
                if multiple:
                    return driver.find_elements(By.CSS_SELECTOR, selector)
                return driver.find_element(By.CSS_SELECTOR, selector)
            except Exception:
                return [] if multiple else ""

        # NOTE(review): the selectors below contain generated-looking class
        # suffixes (e.g. "larqhx"); they presumably change when the site is rebuilt.
        overview_selector = ".TopFoldsc__JobOverViewInfo-sc-1fbktg5-9.larqhx"

        # Salary
        salary = ""
        salary_elem = safe_find(".TopFoldsc__BasicSalary-sc-1fbktg5-13")
        if salary_elem:
            salary = salary_elem.text.strip()

        # Category & function: first and second link inside the overview rows
        categories = safe_find(overview_selector + " a", multiple=True)
        job_category = categories[0].text.strip() if len(categories) > 0 else ""
        job_function = categories[1].text.strip() if len(categories) > 1 else ""

        # Employment type & work place: third overview row, separated by "·"
        overview_rows = safe_find(overview_selector, multiple=True)
        job_type = ""
        work_place = ""
        if len(overview_rows) > 2:
            texts = overview_rows[2].text.split("·")
            job_type = texts[0].strip()
            work_place = texts[1].strip() if len(texts) > 1 else ""

        # Education & experience: fourth and fifth overview rows
        min_edu = overview_rows[3].text.strip() if len(overview_rows) > 3 else ""
        min_exp = overview_rows[4].text.strip() if len(overview_rows) > 4 else ""

        # Dates: first span = posted, second span = updated
        posted = ""
        updated = ""
        time_section = safe_find(".TopFoldsc__JobOverViewTime-sc-1fbktg5-11")
        if time_section:
            spans = time_section.find_elements(By.TAG_NAME, "span")
            if len(spans) >= 2:
                posted = spans[0].text.strip()
                updated = spans[1].text.strip()

        # Requirements
        req_tags = safe_find(".JobRequirementssc__Tag-sc-15g5po6-3", multiple=True)
        requirements = ", ".join([tag.text.strip() for tag in req_tags])

        # Skills
        skill_tags = safe_find(".SkillsLegacysc__TagOverride-sc-qq2t1c-3", multiple=True)
        skills = ", ".join([tag.text.strip() for tag in skill_tags])

        # Description
        desc_div = safe_find('.DraftjsReadersc__ContentContainer-sc-zm0o3p-0')
        job_desc = desc_div.text.strip() if desc_div else ""

        # Scrape timestamp (local time)
        timestamp_scraped = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        return {
            "Job Category": job_category,
            "Job Function": job_function,
            "Employment Type": job_type,
            "Work Place": work_place,
            "Min Education": min_edu,
            "Min Experience": min_exp,
            "Salary Range": salary,
            "Job Posted": posted,
            "Last Updated": updated,
            "Persyaratan": requirements,
            "Skills": skills,
            "Deskripsi": job_desc,
            "Timestamp_Scraped": timestamp_scraped,
        }

    except Exception as e:
        print(f"Error scraping {job_url}: {e}")
        return {}
    finally:
        # Close the browser on every path; previously it leaked on errors.
        if driver is not None:
            driver.quit()


In [None]:
# Scrape the detail page of every job in jobs_df and tag each result with its link.
results = []
job_count = len(jobs_df)

for i, job_url in tqdm(jobs_df['Job Link'].items(), total=job_count):
    print(f"Scraping ({i+1}/{job_count}): {job_url}")
    detail = scrape_job_detail(job_url)
    detail['Job Link'] = job_url  # merge key for joining back onto jobs_df
    results.append(detail)


In [None]:
job_details_df = pd.DataFrame(results)

# With no results the frame has no columns at all; add the merge key so the
# merge below does not fail with a KeyError.
if "Job Link" not in job_details_df.columns:
    job_details_df["Job Link"] = pd.Series(dtype="object")

# Keep one detail row per link. If the same link was scraped more than once,
# the left merge would otherwise repeat the matching jobs_df rows.
job_details_df = job_details_df.drop_duplicates(subset="Job Link")

final_df_jobs = pd.merge(jobs_df, job_details_df, on="Job Link", how="left")

# Save to file
final_df_jobs.to_csv("job_posts_with_details.csv", index=False)


In [None]:
# Preview the merged job table.
final_df_jobs.head()

In [None]:
# Preview the company table once more.
df.head()