<a href="https://colab.research.google.com/github/ConradKatlegoMogane/DPSA_Circular_data_mining/blob/main/Government_Circulars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ------------------------------------------------------------
# Step 0: Install required libraries
# ------------------------------------------------------------
# PyMuPDF (fitz) → PDF text extraction
# pandas → tabular storage and analysis
%pip install pymupdf pandas


Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6


In [2]:
# ------------------------------------------------------------
# Step 0.1: Import libraries
# ------------------------------------------------------------
import fitz        # PyMuPDF for PDF handling
import re          # Regular expressions for text parsing
import pandas as pd  # Tabular data manipulation


In [3]:
# ------------------------------------------------------------
# Step 1: Load the PDF document and extract text
# ------------------------------------------------------------
doc = fitz.open(r"/content/drive/MyDrive/DPSA_cIRCULARS/PSV CIRCULAR 45 OF 2025.pdf")

# Concatenate text from all pages
text = ""
for page in doc:
    text += page.get_text()

doc.close()  # Always close the document after extraction


In [4]:
# ------------------------------------------------------------
# Step 2: Split text into individual job posts
# ------------------------------------------------------------
# Each post begins with "POST <number>/<year>:"
posts = re.split(r"\nPOST\s+\d+/\d+\s*:\s*", text)[1:]  # skip header


In [5]:
# ------------------------------------------------------------
# Step 3: Extract fields from each post using regex
# ------------------------------------------------------------
data = []
for post_text in posts:
    try:
        # Job title (first line)
        post_match = re.search(r"^(.*?)\n", post_text)

        # Centre
        centre_match = re.search(r"CENTRE\s*:\s*(.*)", post_text)

        # Salary or Stipend
        salary_match = re.search(r"(?:SALARY|STIPEND)\s*:\s*(.*)", post_text)

        # Requirements (until "DUTIES")
        requirements_match = re.search(r"REQUIREMENTS\s*:\s*(.*?)(?:DUTIES\s*:)", post_text, re.DOTALL)

        # Duties (until "ENQUIRIES")
        duties_match = re.search(r"DUTIES\s*:\s*(.*?)(?:ENQUIRIES\s*:)", post_text, re.DOTALL)

        # Enquiries
        enquiries_match = re.search(r"ENQUIRIES\s*:\s*(.*)", post_text)

        # Closing Date (applies to entire document)
        closing_match = re.search(r"CLOSING DATE\s*:\s*(.*)", text)

        # Clean and store values
        post = post_match.group(1).strip() if post_match else ""
        centre = centre_match.group(1).strip() if centre_match else ""
        salary = salary_match.group(1).strip() if salary_match else ""
        requirements = requirements_match.group(1).strip().replace("\n", " ") if requirements_match else ""
        duties = duties_match.group(1).strip().replace("\n", " ") if duties_match else ""
        enquiries = enquiries_match.group(1).strip() if enquiries_match else ""
        closing_date = closing_match.group(1).strip() if closing_match else ""

        # Append structured record
        data.append({
            "Post": post,
            "Centre": centre,
            "Salary": salary,
            "Requirements": requirements,
            "Duties": duties,
            "Enquiries": enquiries,
            "Closing Date": closing_date
        })

    except Exception as e:
        print(f"⚠️ Error processing a post: {e}")
        continue


In [6]:
# ------------------------------------------------------------
# Step 4: Convert extracted data into a DataFrame
# ------------------------------------------------------------
df = pd.DataFrame(data)

# Export to Excel for external use
df.to_excel('dpsa_excel.xlsx')

# Display last record for verification
df.tail(1)


Unnamed: 0,Post,Centre,Salary,Requirements,Duties,Enquiries,Closing Date
467,ASSISTANT DIRECTOR: QUOTATION ADMINISTRATION R...,"Department of Social Development, Western Cape...",R468 459 - R561 894 per annum (Level 09),An appropriate 3-year B-Degree/Advanced Diplom...,Manage and supervise staff regarding the follo...,Ms T Rakiep Tel No: (021) 483 4720,22 December 2025 at 16:00


In [7]:
# ------------------------------------------------------------
# Step 5: Extract structured salary details
# ------------------------------------------------------------
import numpy as np

def extract_salary_details(salary_str):
    """
    Extracts min, max salary, and salary level from a given salary string.
    Returns a tuple: (min_salary, max_salary, salary_level).
    """
    min_salary, max_salary = np.nan, np.nan
    salary_level = None

    if pd.isna(salary_str):
        return (min_salary, max_salary, salary_level)

    original_salary_str = str(salary_str).lower().strip()
    temp_salary_str = original_salary_str

    # Remove parentheses and 'per annum'
    temp_salary_str = re.sub(r'\s*\(.*\)', '', temp_salary_str)
    temp_salary_str = re.sub(r'\s*per\s+annum.*', '', temp_salary_str)

    # Remove 'R' and commas
    temp_salary_str = temp_salary_str.replace('r', '').replace(',', '')

    # Extract numeric ranges
    if '–' in temp_salary_str or '-' in temp_salary_str:
        range_numbers_str = re.findall(r'\d+(?:\s*\d+)*(?:\.\d+)?', temp_salary_str)
        if len(range_numbers_str) >= 2:
            try:
                min_salary = float(range_numbers_str[0].replace('\xa0', ''))
                max_salary = float(range_numbers_str[1].replace('\xa0', ''))
            except ValueError:
                pass
    else:
        numbers_str = ''.join(re.findall(r'\d+(?:\s*\d+)*(?:\.\d+)?', temp_salary_str)).replace('\xa0', '')
        try:
            min_salary = float(numbers_str)
        except (ValueError, TypeError):
            pass

    # Extract salary level
    level_match = re.search(r'level\s*(\d+|[A-Z])', original_salary_str, re.IGNORECASE)
    if level_match:
        salary_level = level_match.group(1).strip()

    return (min_salary, max_salary, salary_level)

# Apply to DataFrame
df[['Min_Salary', 'Max_Salary', 'Salary_Level']] = df['Salary'].apply(extract_salary_details).apply(pd.Series)

# Display results
display(df[['Post', 'Salary', 'Min_Salary', 'Max_Salary', 'Salary_Level']])


Unnamed: 0,Post,Salary,Min_Salary,Max_Salary,Salary_Level
0,SCIENTIST PRODUCTION - GRADE A REF NO: 3/3/1/1...,"R761 157 per annum, (OSD), (all-inclusive sala...",,,
1,SCIENTIST PRODUCTION - GRADE A REF NO: 3/3/1/2...,"R761 157 per annum, (OSD), (all-inclusive sala...",,,
2,ASSISTANT DIRECTOR: SENIOR MANAGEMENT SERVICES...,R468 459 per annum (Level 09),,,09
3,CHIEF MONITORING ANALYST REF NO: 3/3/1/14/2025,R468 459 per annum (Level 09),,,09
4,SENIOR ORGANIZATIONAL DEVELOPMENT PRACTITIONER...,R397 116 per annum (Level 08),,,08
...,...,...,...,...,...
463,PERSONAL ASSISTANT: CORPORATE COMMUNICATION RE...,R325 101 – R382 959 per annum (Level 07),,,07
464,SUPPLY CHAIN MANAGEMENT PRACTITIONER: PROCUREM...,R397 116 - R467 790 per annum (Level 08),,,08
465,CHILD AND YOUTH CARE TEAM LEADER: PROFESSIONAL...,"Grade 1: R203 748 – R230 700 per annum, (OSD a...",1.0,,
466,ASSISTANT DIRECTOR: MONITORING AND REPORTING R...,R468 459 - R561 894 per annum (Level 09),,,09


In [8]:
# ------------------------------------------------------------
# Step 6: Extract NQF levels and qualifications
# ------------------------------------------------------------
def extract_nqf_info(requirements_str):
    """
    Extracts NQF level(s) and qualifications from a requirements string.
    Handles explicit mentions (e.g., 'NQF Level 6') and implicit qualifications (LLB, Grade 12, etc.).
    Returns a tuple: (levels, qualifications).
    """
    if pd.isna(requirements_str):
        return (np.nan, np.nan)

    levels = set()

    # Explicit NQF matches
    explicit_nqf_matches = re.findall(r'(?:NQF\s*[-_]?Level|Level|NQF)\s*(\d+)', requirements_str, re.IGNORECASE)
    for m in explicit_nqf_matches:
        levels.add(int(m))

    # Mapping dictionary
    nqf_map = {
        1: "Grade 9 / GETC (ABET Level 4)",
        2: "Grade 10 / NC(V) Level 2",
        3: "Grade 11 / NC(V) Level 3",
        4: "Grade 12 / NSC or NC(V) Level 4",
        5: "Higher Certificate",
        6: "Diploma / Advanced Certificate",
        7: "Bachelor’s Degree / Advanced Diploma",
        8: "Honours Degree / Postgraduate Diploma / LLB / MBChB / Medical Practitioner",
        9: "Master’s Degree / MMed / Medical Specialist",
        10: "Doctoral Degree (PhD)"
    }

    qualifications = [nqf_map.get(l, "") for l in sorted(list(levels))]
    return (list(levels) if levels else np.nan, qualifications if qualifications else np.nan)

# Apply row-wise
df[['NQF Level', 'Qualifications']] = df['Requirements'].apply(lambda x: pd.Series(extract_nqf_info(x)))


In [10]:
# ------------------------------------------------------------
# Step 7: Display NQF and Qualification counts side by side
# ------------------------------------------------------------
from IPython.display import HTML

def side_by_side(*dfs):
    """Display multiple pandas DataFrames side by side in Jupyter Notebook."""
    html = '<div style="display:flex">'
    for df in dfs:
        html += '<div style="margin-right:2em">'
        html += df.to_html()
        html += '</div>'
    html += '</div>'
    display(HTML(html))

# Show missing percentages
print(f'{((df['Qualifications'].isna().sum()/len(df))*100).round(1)} % Qualifications rows is still empty')
print(f'{((df['NQF Level'].isna().sum()/len(df))*100).round(1)} % NQF Level rows is still empty')
side_by_side(df['NQF Level'].value_counts().reset_index(),
             df['Qualifications'].value_counts().reset_index())
display(df)

70.9 % Qualifications rows is still empty
70.9 % NQF Level rows is still empty


Unnamed: 0,NQF Level,count
0,[7],54
1,[6],50
2,"[6, 7]",11
3,[4],9
4,[8],3
5,"[4, 6, 7]",3
6,[3],2
7,"[48, 7]",1
8,"[4, 6]",1
9,"[4, 5]",1

Unnamed: 0,Qualifications,count
0,[Bachelor’s Degree / Advanced Diploma],54
1,[Diploma / Advanced Certificate],50
2,"[Diploma / Advanced Certificate, Bachelor’s Degree / Advanced Diploma]",11
3,[Grade 12 / NSC or NC(V) Level 4],9
4,[Honours Degree / Postgraduate Diploma / LLB / MBChB / Medical Practitioner],3
5,"[Grade 12 / NSC or NC(V) Level 4, Diploma / Advanced Certificate, Bachelor’s Degree / Advanced Diploma]",3
6,[Grade 11 / NC(V) Level 3],2
7,"[Bachelor’s Degree / Advanced Diploma, ]",1
8,"[Grade 12 / NSC or NC(V) Level 4, Diploma / Advanced Certificate]",1
9,"[Grade 12 / NSC or NC(V) Level 4, Higher Certificate]",1


Unnamed: 0,Post,Centre,Salary,Requirements,Duties,Enquiries,Closing Date,Min_Salary,Max_Salary,Salary_Level,NQF Level,Qualifications
0,SCIENTIST PRODUCTION - GRADE A REF NO: 3/3/1/1...,Gauteng (Pretoria): Head Office,"R761 157 per annum, (OSD), (all-inclusive sala...",The applicant must be in possession of Grade 1...,"Develop and implement methodologies, systems a...",Mr MJ Mudzunga Tel No: (012) 319 6502,22 December 2025 at 16:00,,,,,
1,SCIENTIST PRODUCTION - GRADE A REF NO: 3/3/1/2...,Western Cape (Stellenboch Office),"R761 157 per annum, (OSD), (all-inclusive sala...",The applicant must be in possession of Grade 1...,"Develop and implement methodologies, systems a...",Mr Theo Pongolo Tel No: (021) 809 1640,22 December 2025 at 16:00,,,,,
2,ASSISTANT DIRECTOR: SENIOR MANAGEMENT SERVICES...,Gauteng (Pretoria): Head Office,R468 459 per annum (Level 09),Applicants must be in possession of a Grade 12...,Supervise the allocated resources of the Sub D...,Ms Sharon Waverley Tel No: (012) 312 9758,22 December 2025 at 16:00,,,09,[6],[Diploma / Advanced Certificate]
3,CHIEF MONITORING ANALYST REF NO: 3/3/1/14/2025,Gauteng (Pretoria): Head Office,R468 459 per annum (Level 09),Applicants must be in possession of a Bachelor...,Coordinate organisational performance analysis...,Mr Evans Kgasago Tel No: (012) 312 8063/8068,22 December 2025 at 16:00,,,09,[6],[Diploma / Advanced Certificate]
4,SENIOR ORGANIZATIONAL DEVELOPMENT PRACTITIONER...,Gauteng (Pretoria): Head Office,R397 116 per annum (Level 08),Applicants must be in possession of a National...,Conduct job analysis and job evaluation: ackno...,Ms Lexy Manamela Tel No: (012) 312 9569,22 December 2025 at 16:00,,,08,[6],[Diploma / Advanced Certificate]
...,...,...,...,...,...,...,...,...,...,...,...,...
463,PERSONAL ASSISTANT: CORPORATE COMMUNICATION RE...,"Department of the Premier, Western Cape Govern...",R325 101 – R382 959 per annum (Level 07),Grade 12 (Senior Certificate or equivalent qua...,Executive support and office management: Manag...,Ms F Steyn Tel No: (021) 483 9955,22 December 2025 at 16:00,,,07,[5],[Higher Certificate]
464,SUPPLY CHAIN MANAGEMENT PRACTITIONER: PROCUREM...,"Provincial treasury, Western Cape Government",R397 116 - R467 790 per annum (Level 08),An appropriate 3-year National Diploma/B-Degre...,"Coordinate, review, undertake and implement th...",Mr N Rhapale Tel No: (021) 483 6107,22 December 2025 at 16:00,,,08,,
465,CHILD AND YOUTH CARE TEAM LEADER: PROFESSIONAL...,"Department of Social Development, Western Cape...","Grade 1: R203 748 – R230 700 per annum, (OSD a...",Grade 12 (Senior Certificate or equivalent qua...,Serve as a team leader for child and youth car...,Ms B Nicholas Tel No: (044) 803 7508,22 December 2025 at 16:00,1.0,,,,
466,ASSISTANT DIRECTOR: MONITORING AND REPORTING R...,"Department of Social Development, Western Cape...",R468 459 - R561 894 per annum (Level 09),An appropriate 3-year National Diploma/B-Degre...,Develop and implement Departmental performance...,Ms S Nieftagodien at sihaam.nieftagodien@weste...,22 December 2025 at 16:00,,,09,,
