In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from urllib.parse import urlparse, parse_qs, urljoin

# --- CONFIGURATION ---
# Root of the Google Scholar site. Profile page URLs are built from it and
# relative article hrefs are resolved against it.
BASE_URL = "https://scholar.google.com"
# Request headers that main() installs on the shared requests session:
# a desktop Chrome User-Agent and an Indonesian-first Accept-Language.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7"
}

# --- LECTURER LIST ---
# All lecturers are kept together in this single list so the scrape does not
# have to be run twice.
# Each entry carries:
#   "nama"    - lecturer name, copied into every output record as "nama_dosen"
#   "bidang"  - field/department label, copied into every output record
#   "user_id" - value sent as the `user` query parameter of the profile URL
DOSEN_LIST = [
    {"nama": "Dr. Ir. Agus Adhi Nugroho, MT", "bidang": "Teknik Elektro", "user_id": "OfqdgBkAAAAJ"},
    {"nama": "Jenny Putri Hapsari, ST, MT", "bidang": "Teknik Elektro", "user_id": "wDqwEl0AAAAJ"},
    {"nama": "Munaf Ismail, ST, MT", "bidang": "Teknik Elektro", "user_id": "BNKRU4oAAAAJ"},
    {"nama": "Dr. Gunawan, ST, MT", "bidang": "Teknik Elektro", "user_id": "n2YiDYMAAAAJ"},
    {"nama": "Prof. Dr. Ir. Muhamad Hadddin, MT", "bidang": "Teknik Elektro", "user_id": "RuVN8JsAAAAJ"},
    {"nama": "Agus Suprajitno, ST, MT", "bidang": "Teknik Elektro", "user_id": "DOyH7GYAAAAJ"},
    {"nama": "Dr. Eka Nuryanto BS, ST, MT", "bidang": "Teknik Elektro", "user_id": "hpT9eAUAAAAJ"},
    {"nama": "Dr. Bustanul Arifin, ST, MT", "bidang": "Teknik Elektro", "user_id": "kxQ1hjkAAAAJ"},
    {"nama": "Prof. Dr. Sri Artini Dwi Prasetyowati, MSi", "bidang": "Teknik Elektro", "user_id": "MePXtmAAAAAJ"},
    {"nama": "Ir. Arief Marwanto, S.T., M.Eng., Ph.D., IPM", "bidang": "Teknik Elektro", "user_id": "5X_2CsgAAAAJ"},
    {"nama": "Andi Riansyah, ST, M.Kom.", "bidang": "Teknik Informatika", "user_id": "1CWSXJsAAAAJ"},
    {"nama": "Moch. Taufik, ST, MIT.", "bidang": "Teknik Informatika", "user_id": "XB1ZSXwAAAAJ"},
    {"nama": "Dedy Kurniadi, ST, M.Kom.", "bidang": "Teknik Informatika", "user_id": "L1UJvrIAAAAJ"},
    {"nama": "Sam Farisa CH, ST, M.Kom.", "bidang": "Teknik Informatika", "user_id": "CbgxJB8AAAAJ"},
    {"nama": "Badie’ah, ST, M.Kom.", "bidang": "Teknik Informatika", "user_id": "YiFEdIkAAAAJ"},
    {"nama": "Mustafa, ST, MM., M.Kom.", "bidang": "Teknik Informatika", "user_id": "ra_8KNEAAAAJ"},
    {"nama": "Ir. Sri Mulyono, M.Eng.", "bidang": "Teknik Informatika", "user_id": "kTkYTHMAAAAJ"},
    {"nama": "Bagus Satrio WP, S.Kom., M.Cs.", "bidang": "Teknik Informatika", "user_id": "MI2jIdgAAAAJ"},
    {"nama": "Imam Much Ibnu Subroto, ST., M.Sc., Ph.D.", "bidang": "Teknik Informatika", "user_id": "eo5Qe8IAAAAJ"},
    {"nama": "Ida Widihastuti", "bidang": "Teknik Informatika", "user_id": "rP7IWnEAAAAJ"},
]

def get_article_id_and_link(href):
    """Resolve an article href and extract its article id.

    The href is joined onto BASE_URL to form the absolute link. The article
    id is whatever follows the last ":" in the link's ``citation_for_view``
    query parameter.

    Args:
        href: Relative or absolute link taken from a publication row.

    Returns:
        Tuple ``(article_id, full_link)``; ``article_id`` is ``""`` when the
        link has no ``citation_for_view`` parameter.
    """
    absolute_url = urljoin(BASE_URL, href)
    query_params = parse_qs(urlparse(absolute_url).query)

    citation_values = query_params.get("citation_for_view")
    if not citation_values:
        return "", absolute_url

    # Keep only the part after the last colon of the first value.
    _, _, article_id = citation_values[0].rpartition(":")
    return article_id, absolute_url

def scrape_profile(session, user_id, nama_dosen, bidang):
    """Scrape the publication rows of one Google Scholar profile.

    Requests the profile's publication table ``pagesize`` rows at a time
    through ``session`` and parses every ``tr.gsc_a_tr`` row that has a
    title link.

    Args:
        session: Object with a requests-style ``get(url, timeout=...)``
            method; main() passes a shared ``requests.Session``.
        user_id: Value placed in the ``user`` query parameter.
        nama_dosen: Lecturer name copied into every record.
        bidang: Field label copied into every record.

    Returns:
        List of dicts with keys ``nama_dosen``, ``bidang``, ``article_id``,
        ``judul``, ``authors``, ``link_artikel``, ``nama_jurnal``, ``tahun``
        and ``citation``. On HTTP 429 or a failed request the records
        collected so far are returned instead of raising.
    """
    cstart = 0
    pagesize = 100
    results = []

    while True:
        url = f"{BASE_URL}/citations?hl=id&user={user_id}&cstart={cstart}&pagesize={pagesize}"
        print(f"  Fetching: cstart={cstart}...")

        try:
            r = session.get(url, timeout=15)
            if r.status_code == 429:
                print("  !! Kena limit Google Scholar. Harus istirahat dulu nih..")
                return results  # hand back whatever was collected so far

            r.raise_for_status()
        except requests.RequestException as e:
            # Only network/HTTP failures are treated as "stop and keep the
            # partial result"; programming errors still surface.
            print(f"  !! Error pas request: {e}")
            break

        soup = BeautifulSoup(r.text, "html.parser")
        rows = soup.select("tr.gsc_a_tr")

        if not rows:
            break

        collected_before = len(results)

        for row in rows:
            title_tag = row.select_one("a.gsc_a_at")
            if not title_tag:
                continue

            title = title_tag.text.strip()
            href = title_tag.get("data-href") or title_tag.get("href", "")
            article_id, link_artikel = get_article_id_and_link(href)

            gray = row.select("div.gs_gray")
            authors = gray[0].text.strip() if len(gray) > 0 else ""
            nama_jurnal = gray[1].text.strip() if len(gray) > 1 else ""

            # The citation anchor can be present but empty; report "0" in
            # that case too so the column never mixes "" and "0".
            cit_tag = row.select_one("td.gsc_a_c a")
            citation = (cit_tag.text.strip() if cit_tag else "") or "0"

            year_tag = row.select_one("td.gsc_a_y span")
            tahun = year_tag.text.strip() if year_tag else ""

            results.append({
                "nama_dosen": nama_dosen,
                "bidang": bidang,
                "article_id": article_id,
                "judul": title,
                "authors": authors,
                "link_artikel": link_artikel,
                "nama_jurnal": nama_jurnal,
                "tahun": tahun,
                "citation": citation
            })

        # A page that produced no records cannot lead to more data; stopping
        # here also guarantees the loop terminates.
        if len(results) == collected_before:
            break

        # Decide whether another page exists. A disabled pager button ends
        # the scrape. If no pager button is found at all, fall back to page
        # fullness instead of silently stopping after the first page.
        # NOTE(review): "gsc_bpf_more" is assumed to be the id of the
        # "show more" pager on profile pages - confirm against live markup.
        pager_btn = soup.select_one("button#gsc_bpf_next, button#gsc_bpf_more")
        if pager_btn is not None:
            if "disabled" in pager_btn.attrs:
                break
        elif len(rows) < pagesize:
            break

        cstart += pagesize
        # Short randomized delay (2-4 s) between pages to stay polite.
        time.sleep(random.uniform(2, 4))

    return results

def main():
    """Scrape every lecturer in DOSEN_LIST and save the result as CSV.

    One requests session carrying HEADERS is shared by all profiles. After
    each lecturer except the last, the run pauses for a random 5-8 seconds.
    All collected records are written to ``dataset_dosen_all.csv``; nothing
    is written when no record was collected.
    """
    print("memulai scraping data...")
    all_data = []

    # One shared session for every profile request.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        last_index = len(DOSEN_LIST) - 1
        for i, d in enumerate(DOSEN_LIST):
            print(f"[{i+1}/{len(DOSEN_LIST)}] Scraping: {d['nama']}")

            data = scrape_profile(session, d["user_id"], d["nama"], d["bidang"])
            all_data += data

            print(f"  memperoleh {len(data)} publikasi.")

            # No pause is needed once the final lecturer is done.
            if i == last_index:
                continue

            wait_time = random.uniform(5, 8)
            print(f"  Istirahat {wait_time:.1f} detik...\n")
            time.sleep(wait_time)

    # Nothing collected: report it and skip writing the file.
    if not all_data:
        print("\nWaduh, nggak ada data yang berhasil diambil.")
        return

    filename = "dataset_dosen_all.csv"
    pd.DataFrame(all_data).to_csv(filename, index=False)
    print(f"\nSelesai! Total {len(all_data)} data disimpan ke {filename}")

# Start the scrape only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

memulai scraping data...
[1/20] Scraping: Dr. Ir. Agus Adhi Nugroho, MT
  Fetching: cstart=0...
  memperoleh 47 publikasi.
  Istirahat 6.9 detik...

[2/20] Scraping: Jenny Putri Hapsari, ST, MT
  Fetching: cstart=0...
  memperoleh 41 publikasi.
  Istirahat 7.8 detik...

[3/20] Scraping: Munaf Ismail, ST, MT
  Fetching: cstart=0...
  memperoleh 39 publikasi.
  Istirahat 5.6 detik...

[4/20] Scraping: Dr. Gunawan, ST, MT
  Fetching: cstart=0...
  memperoleh 48 publikasi.
  Istirahat 6.4 detik...

[5/20] Scraping: Prof. Dr. Ir. Muhamad Hadddin, MT
  Fetching: cstart=0...
  memperoleh 100 publikasi.
  Istirahat 5.8 detik...

[6/20] Scraping: Agus Suprajitno, ST, MT
  Fetching: cstart=0...
  memperoleh 24 publikasi.
  Istirahat 6.8 detik...

[7/20] Scraping: Dr. Eka Nuryanto BS, ST, MT
  Fetching: cstart=0...
  memperoleh 59 publikasi.
  Istirahat 5.0 detik...

[8/20] Scraping: Dr. Bustanul Arifin, ST, MT
  Fetching: cstart=0...
  memperoleh 62 publikasi.
  Istirahat 5.1 detik...

[9/20] Sc