In [4]:
import pandas as pd
import re
import pdfplumber

In [None]:
# Path of the prospectus PDF that the extraction cells below open with pdfplumber.
file_path = "CS_Prospectus.pdf"

In [7]:
# Helper to extract keyword matches from a single page (the first by default) of a PDF.
def extract_matches_from_pdf_first_page(file_path, keywords, page_index=0, case_sensitive=False):
    """Find the lines on one PDF page that contain any of the given keywords.

    Parameters
    ----------
    file_path
        Location of the PDF; handed unchanged to ``pdfplumber.open``.
    keywords
        Either a single keyword or a list/tuple/set of keywords. Falsy
        entries (``''``, ``None``) are ignored.
    page_index
        Zero-based index of the page to scan. Defaults to the first page.
    case_sensitive
        When False (the default) both keyword and line are lower-cased
        before the substring test.

    Returns
    -------
    list of dict
        One ``{'match_line': str, 'next_line': str}`` entry per matching line,
        in page order. A line is reported once even if several keywords hit
        it. ``next_line`` is the line directly after the match in the page
        text (it may be blank), or ``''`` when the match is the last line.

    Raises
    ------
    IndexError
        If ``page_index`` does not address a page of the PDF.
    """
    # A lone keyword is wrapped so the matching loop always sees a collection.
    if not isinstance(keywords, (list, tuple, set)):
        keywords = [keywords]

    with pdfplumber.open(file_path) as pdf:
        page_count = len(pdf.pages)
        if page_index < 0 or page_index >= page_count:
            raise IndexError(f"page_index {page_index} out of range for {page_count} pages")
        # Fall back to '' when extract_text() yields nothing, so split() is safe.
        text = pdf.pages[page_index].extract_text() or ""

    # Blank lines are kept (as '') so that i + 1 always addresses the line that
    # physically follows a match in the page text.
    lines = [ln.strip() for ln in text.split("\n")]

    # Normalise the keywords once instead of once per line.
    needles = [kw if case_sensitive else kw.lower() for kw in keywords if kw]

    results = []
    for i, ln in enumerate(lines):
        if not ln:
            continue
        haystack = ln if case_sensitive else ln.lower()
        if any(needle in haystack for needle in needles):
            nxt = lines[i + 1] if i + 1 < len(lines) else ""
            results.append({"match_line": ln, "next_line": nxt})
    return results


# Search terms (the "Dr." title plus name fragments) used to pick out lines on the first page.
keywords = [
    "Dr.",
    "Aamir",
    "Farooq",
    "Mudassar",
    "Muhammad",
    "Qaiser Farooq",
    "Samreen",
]

matches = extract_matches_from_pdf_first_page(
    file_path,
    keywords,
    page_index=0,
    case_sensitive=False,
)

# Show every hit together with the line that follows it on the page.
print('\nDetected matches (match_line -> next_line):')
for hit in matches:
    print("- {}  ->  {}".format(hit['match_line'], hit['next_line']))


Detected matches (match_line -> next_line):
- Dr. Hafiz Muhammad Faisal Shehzad  ->  Associate Professor / Chairperson
- Muhammad Ali Block  ->  FACULTY
- Dr. Muhammad Ilyas MATH-5101 URCQ-5102 Multivariable Calculus 3 (3-0)  ->  Professor
- Dr. Saad Razaq following combinations:  ->  Assistant Professor I. Pre-Engineering URCQ-5111 Translation of Holy Quran-I 0 (1-0)
- Dr. Qaiser Abbas II. Pre-Medical (Admitted candidates have to pass 6-credit hours  ->  Total 17(15-9)
- Dr. Fahad Maqbool  ->  Assistant Professor a. Math, Stat, Phy b. Math, Stat, Eco CMPC-5205CMPC-5202 Data Structures 4(3-1)
- Dr. Hussam Ali e. Math, Eco, Comp CMPC-5209CMPC-5204 Computer Org. & Assembly Language 3(2-1)  ->  Assistant Professor
- Aamir Zia IBCC) with at least 50% obtained marks  ->  Lecturer Duration: 4 Years MATH-5102 URCQ-5102 Linear Algebra 3(3-0)
- Farooq Javed Semesters: 8 CMPC-5101 Software Engineering 3(3-0)  ->  Lecturer Degree Requirements: 125-149 Credit Hours
- Mudassar Ali Zaidi  ->  Lectu

In [None]:
# Course codes such as "MATH-5101" that end up on the same extracted line as a name.
_CODE_PATTERN = re.compile(r"\b[A-Z]{2,4}-\d{3,4}\b")

# Fallback cut points, used only when token-by-token name extraction yields nothing.
_CUT_TOKENS = re.compile(
    r"\b(Semester|Semesters|Duration|Total|Code|C/Hr|Course|Semester-?\d+|\d+\s*[(])\b",
    re.IGNORECASE,
)

# Lower-cased words that mark the start of a course title rather than a name.
_SUBJECT_WORDS = frozenset([
    'calculus', 'multivariable', 'software', 'engineering', 'data', 'computer', 'information',
    'algorithms', 'logic', 'design', 'probability', 'statistics', 'database', 'security', 'application',
    'automata', 'communications', 'communication', 'technologies', 'physics', 'geometry', 'translation',
    'holy', 'quran', 'mathematics', 'math', 'chemistry', 'biology', 'economics', 'english', 'analysis',
    'systems', 'advanced', 'topics', 'special', 'functional', 'expository',
])

# Lower-cased words from the surrounding prospectus text that also end a name.
_NAME_STOP_WORDS = ('following', 'combinations', 'with', 'obtained', 'marks', 'semester', 'semesters')

# All-caps tokens that are titles, not acronyms, and therefore may be part of a name.
_TITLE_TOKENS = ('DR', 'MR', 'MS', 'PROF')

_ROMAN_NUMERAL = re.compile(r'(i|ii|iii|iv|v|vi|vii|viii|ix|x)\.?')

# Order matters: the first pattern that matches wins, so every longer designation
# must come before the shorter one it contains (e.g. 'Visiting Professor' before
# 'Professor'), otherwise the longer one can never be reported.
_DESIGNATION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Associate Professor\s*/\s*Chairperson',
        r'Associate Professor',
        r'Assistant Professor',
        r'Visiting Professor',
        r'Visiting Faculty',
        r'Professor',
        r'Lecturer',
        r'Chairperson',
        r'On leave',
    )
]


def _find_designation(*texts):
    """Return the first designation found, trying each text in turn; '' if none matches."""
    for text in texts:
        for pattern in _DESIGNATION_PATTERNS:
            found = pattern.search(text)
            if found:
                return found.group(0).strip()
    return ''


def _ends_name(raw_tok, sanitized):
    """Tell whether a token marks the point where the name stops and other text begins."""
    low = sanitized.lower()
    # Parentheses and digits are tested on the unstripped token, because
    # stripping removes the very characters being looked for.
    if '(' in raw_tok or ')' in raw_tok:
        return True
    if re.search(r"\d", raw_tok):
        return True
    if low in _SUBJECT_WORDS or low in _NAME_STOP_WORDS:
        return True
    # list markers such as "I." / "II."
    if _ROMAN_NUMERAL.fullmatch(low):
        return True
    # upper-case acronyms (e.g. IBCC) are treated as not part of a name
    if sanitized.isupper() and len(sanitized) >= 2 and sanitized not in _TITLE_TOKENS:
        return True
    # single-letter tokens (e.g. the "e" of "e. Math, ...") are list markers too
    return len(sanitized) == 1


def _extract_name(raw):
    """Return the leading run of name-like tokens of ``raw``, with fallbacks when there is none."""
    name_tokens = []
    for raw_tok in _CODE_PATTERN.sub('', raw).split():
        sanitized = raw_tok.strip(' ,;:.()')
        if not sanitized:
            continue
        if _ends_name(raw_tok, sanitized):
            break
        name_tokens.append(sanitized)

    if name_tokens:
        name = ' '.join(name_tokens).strip(' -:;,.')
    else:
        # No usable token: keep whatever precedes the first cut token instead.
        name = _CUT_TOKENS.split(raw)[0].strip(' -:;,.')

    name = re.sub(r"\s{2,}", ' ', name).strip()
    # Last resort: never return an empty name while the raw line has content.
    return name or raw.strip()


def normalize_faculty_matches(matches):
    """Turn raw keyword matches into cleaned faculty name / designation records.

    Parameters
    ----------
    matches
        Iterable of dicts with the keys ``'match_line'`` and ``'next_line'``.
        A missing key or a ``None`` value is treated as an empty string.

    Returns
    -------
    list of dict
        One record per input match, in the same order, with the keys
        ``'name'``, ``'designation'``, ``'raw_match'`` and ``'raw_next_line'``.
        ``designation`` is looked up in the next line first and in the match
        line second; it is ``''`` when neither contains a known designation.
        ``name`` is the leading part of the match line up to the first token
        that does not look like part of a name; punctuation around tokens is
        stripped, so ``"Dr."`` becomes ``"Dr"``.
    """
    results = []
    for m in matches:
        raw = m.get('match_line') or ''
        nxt = m.get('next_line') or ''
        results.append({
            'name': _extract_name(raw),
            'designation': _find_designation(nxt, raw),
            'raw_match': raw,
            'raw_next_line': nxt,
        })
    return results

In [7]:
# Re-scan the first page, then reduce each hit to a name / designation record.
matches = extract_matches_from_pdf_first_page(file_path, keywords, page_index=0)
cleaned = normalize_faculty_matches(matches)

# List the cleaned records, flagging the ones whose designation could not be found.
print('\nCleaned faculty entries:')
for entry in cleaned:
    shown_designation = entry['designation'] if entry['designation'] else '<designation not found>'
    print("- {}  ->  {}".format(entry['name'], shown_designation))


Cleaned faculty entries:
- Dr Hafiz Muhammad Faisal Shehzad  ->  Associate Professor / Chairperson
- Muhammad Ali Block  ->  <designation not found>
- Dr Muhammad Ilyas  ->  Professor
- Dr Saad Razaq  ->  Assistant Professor
- Dr Qaiser Abbas  ->  <designation not found>
- Dr Fahad Maqbool  ->  Assistant Professor
- Dr Hussam Ali  ->  Assistant Professor
- Aamir Zia  ->  Lecturer
- Farooq Javed  ->  Lecturer
- Mudassar Ali Zaidi  ->  Lecturer
- Muhammad Zohaib Nawaz  ->  Lecturer
- Muhammad Fahad  ->  Lecturer
- Qaisar Farooq  ->  Lecturer
- Samreen Razzaq  ->  Lecturer


In [8]:
# Manual corrections to the automatically extracted entries, keyed by name so that
# they keep working if the order or number of extracted rows changes.

# "Muhammad Ali Block" was matched through the keyword "Muhammad" but has no designation,
# so it is removed by name instead of by a hard-coded row position.
NON_FACULTY_NAMES = ['Muhammad Ali Block']

# Designations that are set by hand for specific people.
DESIGNATION_OVERRIDES = {
    'Dr Qaiser Abbas': 'Assistant Professor (on leave)',
    'Muhammad Zohaib Nawaz': 'Lecturer (on leave)',
}

faculty_df = pd.DataFrame(cleaned)[['name', 'designation']]

# Drop the non-faculty rows and renumber what remains.
faculty_df = faculty_df[~faculty_df['name'].isin(NON_FACULTY_NAMES)].reset_index(drop=True)

for faculty_name, corrected_designation in DESIGNATION_OVERRIDES.items():
    faculty_df.loc[faculty_df['name'] == faculty_name, 'designation'] = corrected_designation

faculty_df

Unnamed: 0,name,designation
0,Dr Hafiz Muhammad Faisal Shehzad,Associate Professor / Chairperson
1,Dr Muhammad Ilyas,Professor
2,Dr Saad Razaq,Assistant Professor
3,Dr Qaiser Abbas,Assistant Professor (on leave)
4,Dr Fahad Maqbool,Assistant Professor
5,Dr Hussam Ali,Assistant Professor
6,Aamir Zia,Lecturer
7,Farooq Javed,Lecturer
8,Mudassar Ali Zaidi,Lecturer
9,Muhammad Zohaib Nawaz,Lecturer (on leave)


In [9]:
# Save the faculty table to CSV; index=False leaves the row index out of the file.
faculty_df.to_csv("Faculty.csv", index=False)

- Extracting Programs data

The resulting CSV file, *Programs.csv*, will have the following columns: (ProgramName, Eligibility, Duration, TotalCreditHours)

In [10]:
# Path of the prospectus PDF for the program-extraction cells; same value as the
# file_path assigned near the top of the notebook.
file_path = "CS_Prospectus.pdf"

In [21]:
# Extract the Programs data (e.g., Program Name, Eligibility, Duration, Total Credit Hours).
# The details live in the section that follows each program header; for now only the
# program names are collected.

program_headers = [
    "BS in Computer Science",
    "BS in Artificial Intelligence",
    "BS in Data Science",
    "MS Computer Science",
    "PhD Computer Science",
    "MS Artificial Intelligence"
]
course_code_pattern = r"[A-Z]{4}-\d{4}"  # not used in this cell

# Compile each header once; it is matched literally and case-insensitively.
header_patterns = [
    (header, re.compile(re.escape(header), re.IGNORECASE))
    for header in program_headers
]

programs = []
seen_headers = set()
with pdfplumber.open(file_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text() or ''
        for ln in text.split('\n'):
            for header, pattern in header_patterns:
                # A header is usually mentioned on several lines and pages; record it
                # only the first time so the table has one row per program.
                if header not in seen_headers and pattern.search(ln):
                    seen_headers.add(header)
                    programs.append({'ProgramName': header})

# Naming the column explicitly keeps it present even when nothing was found.
programs_df = pd.DataFrame(programs, columns=['ProgramName'])

In [37]:
# Cut the "Eligibility" section out of the first page of the prospectus.
with pdfplumber.open(file_path) as pdf:
    # Fall back to '' when extract_text() yields nothing, so split() below is safe.
    text = pdf.pages[0].extract_text() or ""

lines = text.split("\n")

# The section starts at the first line that mentions "Eligibility" ...
start = next((i for i, line in enumerate(lines) if "Eligibility" in line), None)
if start is None:
    raise ValueError('No line containing "Eligibility" found on the first page of the PDF')

# ... and ends just before the first later line that mentions "Duration". Searching only
# after `start` avoids an empty slice when "Duration" also occurs earlier on the page;
# if it never occurs, the section runs to the end of the page.
end = next(
    (i for i in range(start + 1, len(lines)) if "Duration" in lines[i]),
    len(lines),
)

eligibility_text = "\n".join(lines[start:end])


In [52]:
# Print the extracted eligibility lines to inspect what the slice captured.
print(eligibility_text)

Eligibility: HSSC (Part-I / II) with at least 50% marks with one of the
CMPC-5208 Computer Networks 3 (2-3)
Dr. Saad Razaq following combinations:
Assistant Professor I. Pre-Engineering URCQ-5111 Translation of Holy Quran-I 0 (1-0)
Dr. Qaiser Abbas II. Pre-Medical (Admitted candidates have to pass 6-credit hours
Total 17(15-9)
Assistant Professor (on leave) courses of mathematics in first two semesters.)
III. General Science Semester-3
Dr. Fahad Maqbool
Assistant Professor a. Math, Stat, Phy b. Math, Stat, Eco CMPC-5205CMPC-5202 Data Structures 4(3-1)
c. Math, Stat, Comp d. Math, Phy, Comp
Dr. Hussam Ali e. Math, Eco, Comp CMPC-5209CMPC-5204 Computer Org. & Assembly Language 3(2-1)
Assistant Professor
IV. A-Levels (with of the contribution of I / II / III and equivalence by CMPC-5207 Artificial Intelligence 3(2-1)
Aamir Zia IBCC) with at least 50% obtained marks
