<a href="https://colab.research.google.com/github/Ayuathm/Job_market_Analysis_AI_SSD/blob/main/NGO_Job_Scraper_All_With_Attachments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌍 South Sudan NGO Forum Job Scraper with Attachments (Full Corpus)
This notebook scrapes **all available job listings** from the [South Sudan NGO Forum](https://comms.southsudanngoforum.org/c/jobs/5), and **downloads all attachments (PDF/DOC/DOCX)** linked in each post.

It mounts Google Drive first, because both the job-list CSV and the downloaded attachments are saved under `/content/drive/MyDrive`.

In [1]:
# ✅ Mount Google Drive to save large results
# The scraper cell below writes its CSV and the attachment folder under
# /content/drive/MyDrive, so this cell has to be run before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
import pandas as pd
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Setup
# Every request to the forum is sent with a browser-style User-Agent.
headers = {"User-Agent": "Mozilla/5.0"}

# Forum endpoints. The doubled braces in the template survive the f-string as a
# literal "{}" placeholder, which is filled in later with the page number.
base_url = "https://comms.southsudanngoforum.org"
category_id = 5
api_url_template = f"{base_url}/c/jobs/{category_id}.json?page={{}}"

# Destination for downloaded attachments (on the mounted Drive); created up
# front, and re-running the cell is harmless if it already exists.
attachment_folder = "/content/drive/MyDrive/Job_posts"
os.makedirs(name=attachment_folder, exist_ok=True)

# Step 1: Scrape Job Listings
# Walks the category's paginated JSON listing and appends one
# {"Job Title", "Job Link"} record per topic to `all_jobs`.
all_jobs = []
seen_topic_ids = set()  # guards against the same topic being listed on two pages
max_pages = 9999  # safety cap only; the loop normally stops at the first empty page
request_timeout = 30  # seconds; requests never times out unless told to

# The forum's URL/JSON layout matches Discourse, whose listing pages are
# numbered from 0 -- counting from 1 would skip the first (newest) page.
for page in range(max_pages):
    print(f"📄 Scraping page {page}...")
    url = api_url_template.format(page)
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as e:
        # Connection errors / timeouts: stop paging but keep what we have so far.
        print(f"❌ Failed to fetch page {page}: {e}")
        break
    if response.status_code != 200:
        print(f"❌ Failed to fetch page {page}: {response.status_code}")
        break
    try:
        data = response.json()
    except ValueError as e:
        # ValueError covers both json.JSONDecodeError and requests' own variant.
        print(f"❌ Failed to fetch page {page}: invalid JSON ({e})")
        break
    topics = data.get("topic_list", {}).get("topics", [])
    if not topics:
        print("✅ No more topics found. Stopping.")
        break
    for topic in topics:
        topic_id = topic["id"]
        if topic_id in seen_topic_ids:
            continue  # already recorded from an earlier page
        seen_topic_ids.add(topic_id)
        title = topic["title"]
        slug = topic["slug"]
        full_url = f"{base_url}/t/{slug}/{topic_id}"
        all_jobs.append({"Job Title": title, "Job Link": full_url})
    time.sleep(1)  # be polite to the server between pages

# Save jobs list
# Turn the collected records into a table and write it to Drive as a CSV
# without the pandas index column, then report how many rows were written.
csv_path = "/content/drive/MyDrive/NGO_Job_Posts.csv"
df = pd.DataFrame(all_jobs)
df.to_csv(path_or_buf=csv_path, index=False)
print(f"✅ Saved {len(df)} job posts to: {csv_path}")

# Step 2: Download Attachments
# Visits every scraped job post, collects links that point at PDF/DOC/DOCX
# files and saves each one into `attachment_folder` unless it is already there.
attachment_extensions = (".pdf", ".doc", ".docx")
download_timeout = 30  # seconds; requests never times out unless told to


def _save_attachment(file_url, file_path):
    """Stream `file_url` into `file_path` without ever leaving a partial file.

    The body is written to a sibling ".part" file and only renamed to
    `file_path` once the download finished, so the "already exists" check in
    the main loop can never mistake a truncated download for a complete one.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
        OSError: if the file cannot be written or renamed.
    """
    tmp_path = file_path + ".part"
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(file_url, headers=headers, stream=True,
                          timeout=download_timeout) as file_resp:
            # Without this an HTML error page would be saved as a ".pdf".
            file_resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in file_resp.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, file_path)
    finally:
        # Only still present if something above failed before the rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


for job in all_jobs:
    post_url = job["Job Link"]
    print(f"🔍 Checking {post_url}")
    try:
        r = requests.get(post_url, headers=headers, timeout=download_timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html.parser")
        for att in soup.find_all("a", href=True):
            file_url = urljoin(base_url, att["href"])
            # Ignore any "?query" / "#fragment" when testing the extension and
            # deriving the local file name.
            bare_url = file_url.split("#", 1)[0].split("?", 1)[0]
            if not bare_url.lower().endswith(attachment_extensions):
                continue
            filename = bare_url.split("/")[-1]
            file_path = os.path.join(attachment_folder, filename)
            if os.path.exists(file_path):
                continue  # downloaded on an earlier run or from another post
            print(f"📥 Downloading: {filename}")
            try:
                _save_attachment(file_url, file_path)
            except (requests.RequestException, OSError) as e:
                # One bad file must not abort the post's remaining attachments.
                print(f"⚠️ Failed to download {file_url}: {e}")
    except Exception as e:
        # Best-effort crawl: report the failing post and move on to the next.
        print(f"⚠️ Failed to fetch/download from {post_url}: {e}")
    time.sleep(1)  # be polite to the server between posts

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
🔍 Checking https://comms.southsudanngoforum.org/t/nrc-south-sudan-job-announcement/63377
📥 Downloading: 580jtt0DKFWEpOBLuXNLkyifr7.pdf
📥 Downloading: 4v2u2m3INmbo4A13u319XcIFKN.pdf
📥 Downloading: gTPLztFKTfi3Xl26gl9y7JffPGR.pdf
🔍 Checking https://comms.southsudanngoforum.org/t/job-advert-finance-admin-manager-hope-restoration-south-sudan/63361
📥 Downloading: csG4GUstCt6eQyYa1yDGsHZorOi.pdf
📥 Downloading: qpvAVEhkTZB0gLi6o4e56hzHuO2.pdf
🔍 Checking https://comms.southsudanngoforum.org/t/advert-for-executive-director/63375
📥 Downloading: r2MoQMg0MdAozlOb2T9PD9ZdZ50.pdf
🔍 Checking https://comms.southsudanngoforum.org/t/external-advert-commodity-tracking-and-reporting-team-leader/63374
📥 Downloading: x9pffDF80SYj9HNA4j4TryNAINw.pdf
📥 Downloading: 6sm9EdtJZZLy9EUEcRNS4lS81pD.doc
🔍 Checking https://comms.southsudanngoforum.org/t/human-resources-officer-eye-media/63373
📥 Downloading: 9QcAhq3Kz8IrFQpqpjq0LuNMP9q.pdf
🔍 Checking htt