# 🧠 Job Extraction Pipeline with Cleaning

### 📥 Step 1: Load libraries and data

In [None]:
import pandas as pd
import re
from dateutil import parser

# Load the extracted job postings into a DataFrame.
# Later cells read the 'text' and 'filename' columns from this frame,
# so the CSV must provide both.
# Replace the file name below if your export is stored under a different name.
df = pd.read_csv('extracted_job_texts.csv')

### 🧹 Step 2: Clean the raw text

In [None]:

def clean_text(text):
    """Normalise one raw job-posting text while keeping its line structure.

    Steps, in order:
      1. Replace scanner artefacts ("CamScanner", "Page <n>") and "@" signs
         (with their surrounding whitespace) by a single space.
      2. Replace every run of non-ASCII characters by a single space.
      3. Collapse runs of horizontal whitespace (spaces, tabs, carriage
         returns, ...) into one space.
      4. Collapse blank lines and trim spaces around line breaks so each
         line break becomes exactly one newline.

    Line breaks are preserved on purpose: the extraction functions split the
    cleaned text on newlines and inspect individual lines.

    Args:
        text: Raw text of one posting. Non-string values (for example the
            NaN pandas uses for an empty CSV cell, or None) are accepted.

    Returns:
        The cleaned text with no leading/trailing whitespace, or an empty
        string when ``text`` is not a string.
    """
    # Empty CSV cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    text = re.sub(r'(CamScanner|Page\s\d+|\s*@\s*)', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII
    # Collapse horizontal whitespace only: [^\S\n] is "whitespace except newline".
    text = re.sub(r'[^\S\n]+', ' ', text)
    # One newline per line break, without spaces hugging it or blank lines.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()

# Apply cleaning: run clean_text over every value of the raw 'text' column and
# store the result in a new 'cleaned_text' column. The raw 'text' column is
# left untouched; the extraction step below reads 'cleaned_text'.
df['cleaned_text'] = df['text'].apply(clean_text)

### 📌 Step 3: Define extraction functions

In [None]:

# Organisations recognised by name, in priority order for a given line.
_KNOWN_ORGS = (
    "Save the Children", "ZOA", "UNICEF", "UNHCR", "WFP", "IRC", "CARE", "World Vision", "Medair", "ACTED",
    "Plan International", "Danish Refugee Council", "Oxfam", "CAFOD", "Caritas", "ADRA", "Cordaid",
    "Concern Worldwide", "Norwegian Refugee Council", "CTG", "GOAL", "GIZ", "Mercy Corps", "INTERSOS",
    "Catholic Relief Services", "UNDP", "FAO", "WHO", "MSF", "IOM", "ACF", "War Child",
)


def _known_org_regex(org):
    """Compile a whole-word matcher for one organisation name.

    All-uppercase acronyms are matched case-sensitively so that ordinary
    words such as "who", "care", "goal" or "acted" are not mistaken for
    WHO, CARE, GOAL or ACTED; other names are matched case-insensitively.
    """
    flags = 0 if org.isupper() else re.IGNORECASE
    return re.compile(rf'\b{re.escape(org)}\b', flags)


_KNOWN_ORG_REGEXES = tuple((org, _known_org_regex(org)) for org in _KNOWN_ORGS)

# "Employer: Acme Relief" style labels; the capture is the organisation name.
_LABELLED_ORG_REGEX = re.compile(
    r'\b(?:Organization|Published by|By|Employer)\b\s*[:\-]?\s*([A-Z][A-Za-z &,.]{3,})'
)


def extract_organization(text):
    """Guess the hiring organisation from the first 30 lines of a posting.

    Three strategies are tried in order, each over all inspected lines
    before the next one starts:
      1. A known organisation name appearing as a whole word.
      2. A label such as "Organization:", "Published by", "By" or
         "Employer:" followed by a capitalised name of at most six words.
      3. A title-cased line of two to five words that contains no digit.

    Args:
        text: Posting text, with lines separated by newlines.

    Returns:
        The canonical known name, the labelled name, the title-cased line,
        or "Unknown" when no strategy matches.
    """
    lines = text.strip().split('\n')[:30]

    for line in lines:
        for org, regex in _KNOWN_ORG_REGEXES:
            if regex.search(line):
                return org

    for line in lines:
        match = _LABELLED_ORG_REGEX.search(line)
        if match:
            candidate = match.group(1).strip()
            # The capture's character class already excludes digits, so only
            # the length of the name needs checking.
            if len(candidate.split()) <= 6:
                return candidate

    for line in lines:
        if 2 <= len(line.split()) <= 5 and line.istitle() and not re.search(r'\d', line):
            return line.strip()

    return "Unknown"

# Each pattern has exactly one capture group: the text following the label.
_JOB_TITLE_REGEXES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\b(?:job title|position|title|vacancy)\b\s*[:\-]?\s*(.+)',
        r'\bwe are looking for\b\s*[:\-]?\s*(.+)',
        r'\brole\b\s*[:\-]?\s*(.+)',
        r'\bjob opening\b\s*[:\-]?\s*(.+)',
        r'\brecruiting\b\s*[:\-]?\s*(.+)',
    )
)


def extract_job_title(text):
    """Extract the job title from the first 30 lines of a posting.

    Each line is tested against a list of label patterns ("Job title:",
    "Position -", "We are looking for", "Role:", "Job opening", "Recruiting").
    Labels must be whole words, so "entitled" does not count as "title".
    The text after the first matching label is stripped of every character
    other than letters, digits, space and ``- _ / ( )``.

    Args:
        text: Posting text, with lines separated by newlines.

    Returns:
        The first sanitised title whose length is between 4 and 99
        characters, or "Unknown" when none is found.
    """
    lines = text.strip().split('\n')[:30]
    for line in lines:
        for regex in _JOB_TITLE_REGEXES:
            match = regex.search(line)
            if not match:
                continue
            # Every pattern exposes the title as group 1; the previous code
            # read group 2, which does not exist for most patterns.
            title = re.sub(r'[^a-zA-Z0-9 \-_/()]', '', match.group(1)).strip()
            if 3 < len(title) < 100:
                return title
    return "Unknown"

def extract_skills(text):
    """List the known skill keywords mentioned in a posting.

    Keywords are matched case-insensitively as whole words, so "java" does
    not match "JavaScript". Multi-word keywords tolerate any whitespace
    (including a line break) between their words.

    Args:
        text: Posting text to scan.

    Returns:
        The matched keywords, lower-case, de-duplicated, sorted
        alphabetically and joined with ", "; or "Not Found" when no keyword
        occurs.
    """
    keywords = [
        "python", "java", "excel", "word", "powerpoint", "sql", "r", "gis", "arcgis",
        "monitoring", "evaluation", "communication", "leadership", "report writing",
        "project management", "budgeting", "data analysis", "problem solving", "teamwork",
        "procurement", "logistics", "negotiation", "networking", "supervision",
        "customer service", "human resources", "training", "coaching", "facilitation",
        "presentation", "graphic design", "research", "documentation", "compliance"
    ]

    def mentions(keyword):
        # In a raw string a word boundary is written \b; the earlier \\b
        # searched for a literal backslash and therefore never matched.
        body = r'\s+'.join(re.escape(part) for part in keyword.split())
        return re.search(rf'\b{body}\b', text, re.IGNORECASE) is not None

    found = sorted({kw for kw in keywords if mentions(kw)})
    return ", ".join(found) if found else "Not Found"

def extract_sector(text):
    """Classify a posting into a sector by keyword lookup.

    Sectors are tested in the order they are declared, and within a sector
    the keywords are tested in order; the first keyword found as a whole
    word (case-insensitive) decides the result.

    Args:
        text: Posting text to scan.

    Returns:
        The name of the first matching sector, or "Unknown" when no keyword
        occurs.
    """
    sectors = {
        "Education": ["school", "education", "teacher", "training"],
        "Health": ["health", "clinic", "medical", "nutrition", "hiv", "malaria"],
        "Humanitarian": ["ngo", "unicef", "humanitarian", "relief", "emergency", "refugee"],
        "Agriculture": ["farm", "agriculture", "livestock", "crop"],
        "Logistics": ["logistics", "transport", "fleet", "supply chain"],
        "Finance": ["finance", "accounting", "audit", "budget", "grants", "payroll"],
        "WASH": ["water", "sanitation", "hygiene"],
        "Protection": ["protection", "gender", "child protection", "gbv"],
        "ICT": ["ict", "information technology", "systems", "database", "network"]
    }
    for sector, keywords in sectors.items():
        for kw in keywords:
            # \b (single backslash in a raw string) is the word boundary; the
            # earlier \\b looked for a literal backslash and never matched.
            if re.search(rf'\b{re.escape(kw)}\b', text, re.IGNORECASE):
                return sector
    return "Unknown"

def extract_location(text):
    """Find the duty location mentioned in a posting.

    First looks for one of the known place names as a whole word
    (case-insensitive), testing the names in list order. If none occurs,
    falls back to the text following a "location", "duty station",
    "based in" or "workplace" label, up to the next line break, full stop
    or comma.

    Args:
        text: Posting text to scan.

    Returns:
        The canonical known place name, the labelled location text, or
        "Unknown" when neither is found.
    """
    known_places = [
        "Juba", "Wau", "Malakal", "Bor", "Yambio", "Rumbek", "Aweil", "Yei", "Torit",
        "Bentiu", "Terekeeka", "Kapoeta", "Maridi", "Tonj", "Abyei", "Pibor", "Akobo",
        "Leer", "Renk", "Kodok", "Paloich", "Tali", "Magwi", "Koch", "Pariang"
    ]
    for place in known_places:
        # \b (single backslash in a raw string) is the word boundary; the
        # earlier \\b looked for a literal backslash and never matched.
        if re.search(rf'\b{re.escape(place)}\b', text, re.IGNORECASE):
            return place

    # The leading \b keeps "allocation" from being read as a "location" label.
    match = re.search(
        r'\b(?:location|duty station|based in|workplace)\b\s*[:\-]?\s*([^\n\r\.,]+)',
        text,
        re.IGNORECASE,
    )
    return match.group(1).strip() if match else "Unknown"

def extract_posting_date(text):
    """Return the first date-like substring found in the text.

    Three layouts are tried one after another over the whole text, and the
    first layout that matches anywhere wins:
      1. "<day> <word> <year>"   e.g. "12 March 2024"
      2. "<word> <day>, <year>"  e.g. "March 5, 2024"
      3. "<yyyy>-<mm>-<dd>"      e.g. "2024-03-05"

    Returns:
        The matched substring exactly as it appears, or "Unknown" when no
        layout matches.
    """
    date_layouts = (
        r'\b(\d{1,2}\s+\w+\s+\d{4})\b',
        r'\b(\w+\s+\d{1,2},\s+\d{4})\b',
        r'\b(\d{4}-\d{2}-\d{2})\b',
    )
    # Lazy generator: later layouts are only searched when earlier ones miss.
    attempts = (re.search(layout, text) for layout in date_layouts)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return "Unknown"
    return first_hit.group(1)

def normalize_date(date_str):
    """Convert a free-form date string to ISO ``YYYY-MM-DD``.

    Parsing is delegated to ``dateutil.parser.parse`` in fuzzy, day-first
    mode, so "05/03/2024" is read as 5 March 2024.

    Args:
        date_str: Date text such as "12 March 2024". The "Unknown" sentinel,
            blank strings and non-string values are accepted and passed
            through as "Unknown" without invoking the parser.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or "Unknown" when the input
        carries no parseable date.
    """
    # Short-circuit inputs that cannot hold a date instead of relying on the
    # fuzzy parser to reject them.
    if not isinstance(date_str, str) or not date_str.strip() or date_str == "Unknown":
        return "Unknown"
    try:
        return parser.parse(date_str, fuzzy=True, dayfirst=True).strftime("%Y-%m-%d")
    except (ValueError, OverflowError, TypeError):
        # ParserError subclasses ValueError. Catching only parse failures
        # lets KeyboardInterrupt/SystemExit propagate, unlike a bare except.
        return "Unknown"

def classify_job_type(text):
    """Classify a posting's contract type by keyword lookup.

    Job types are tested in declaration order and, within a type, keywords
    in order; the first keyword found as a whole word (case-insensitive)
    decides the result. Words inside a keyword may be separated by spaces
    or hyphens in the text, so "full time" also matches "full-time" and
    "long-term" also matches "long term".

    Args:
        text: Posting text to scan.

    Returns:
        One of "Full-time", "Part-time", "Consultancy", "Internship",
        "Volunteer", or "Unclassified" when no keyword occurs.
    """
    job_types = {
        "Full-time": ["full time", "permanent", "long-term", "staff position"],
        "Part-time": ["part time", "temporary", "short-term", "casual"],
        "Consultancy": ["consultant", "consultancy", "contract basis", "individual contractor"],
        "Internship": ["intern", "internship", "trainee"],
        "Volunteer": ["volunteer", "voluntary service"]
    }
    for jtype, keywords in job_types.items():
        for kw in keywords:
            parts = (re.escape(part) for part in re.split(r'[\s\-]+', kw))
            # \b (single backslash in a raw string) is the word boundary; the
            # earlier \\b looked for a literal backslash and never matched.
            pattern = r'\b' + r'[\s\-]+'.join(parts) + r'\b'
            if re.search(pattern, text, re.IGNORECASE):
                return jtype
    return "Unclassified"


### 🔎 Step 4: Apply extraction to the cleaned text

In [None]:

# Each output column is produced by running one or more functions, in order,
# over the cleaned text. Columns are created in the order listed here.
cleaned_series = df["cleaned_text"]
column_steps = [
    ("organization", (extract_organization,)),
    ("job_title", (extract_job_title,)),
    ("skills", (extract_skills,)),
    ("sector", (extract_sector,)),
    ("location", (extract_location,)),
    # The raw date string is extracted first, then normalised.
    ("posting_date", (extract_posting_date, normalize_date)),
    ("job_type", (classify_job_type,)),
]
for column_name, steps in column_steps:
    values = cleaned_series
    for step in steps:
        values = values.apply(step)
    df[column_name] = values

### 💾 Step 5: Export cleaned data

In [None]:

# Columns written to the final CSV, in output order.
export_columns = [
    "filename",
    "organization",
    "job_title",
    "skills",
    "sector",
    "location",
    "posting_date",
    "job_type",
]
output = df[export_columns]
# index=False keeps the DataFrame's row index out of the exported file.
output.to_csv("final_cleaned_job_output.csv", index=False)
print("Export complete.")

Export complete.
