<a href="https://colab.research.google.com/github/Ayuathm/Job_market_Analysis_AI_SSD/blob/main/Complete_Job_Info_Extraction_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Job Info and Skill Extraction with spaCy in Google Colab

## 🔧 Step 1: Install spaCy and Download the Large Model

In [1]:
# Upgrade spaCy, then download the large English pipeline (en_core_web_lg) that a later cell loads with spacy.load.
!pip install -U spacy
!python -m spacy download en_core_web_lg

Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.8.6
    Uninstalling spacy-3.8.6:
      Successfully uninstalled spacy-3.8.6
Successfully installed spacy-3.8.7
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[

## 📦 Step 2: Load Libraries

In [2]:
import pandas as pd
import re
import spacy
from dateutil import parser
from google.colab import files

## 📁 Step 3: Upload Your CSV File

In [3]:
# Open the Colab upload dialog; the result is iterated below to obtain the uploaded file name.
uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. the dialog was cancelled) would otherwise surface as a
    # bare StopIteration from next(iter(...)), which gives no hint about the cause.
    raise ValueError('No file was uploaded. Re-run this cell and select a CSV file.')
# Only the first uploaded file is read; any additional uploads are ignored.
df = pd.read_csv(next(iter(uploaded)))

Saving extracted_job_texts.csv to extracted_job_texts.csv


## 🧹 Step 4: Clean the Text

In [4]:
def clean_text(text):
    """Normalise raw job-advert text while keeping its line structure.

    Steps, in order:
      1. Replace scanner watermarks ("CamScanner"), "Page N" markers and '@'
         (with surrounding whitespace) by a single space.
      2. Replace runs of non-ASCII characters by a single space.
      3. Collapse runs of horizontal whitespace (spaces, tabs, carriage
         returns, ...) to one space.
      4. Collapse blank lines and trim spaces around line breaks so each
         remaining line break is exactly one '\n'.

    Args:
        text: The raw text. Non-string values (e.g. NaN or None coming from an
            empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string with leading/trailing whitespace removed; '' for
        non-string input.
    """
    # Empty CSV cells arrive as float NaN, which would make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'(CamScanner|Page\s\d+|\s*@\s*)', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Collapse only whitespace that is NOT a newline. Collapsing every \s run
    # would merge "\r\n" or " \n" into a space and glue separate lines together,
    # which breaks the line-based extractors that split on '\n'.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Squash blank lines and drop the spaces hugging each line break.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()

# Add a 'cleaned_text' column by running clean_text over every value of the 'text' column.
df['cleaned_text'] = df['text'].apply(clean_text)

## 📌 Step 5: Load spaCy and Define Extractors

In [5]:
# Load the large English spaCy pipeline once; extract_skills calls this shared `nlp` object.
nlp = spacy.load('en_core_web_lg')

# Lower-case skill keywords that extract_skills looks up in the lower-cased text.
predefined_skills = {
    # Software and technical tools
    'python', 'java', 'excel', 'word', 'powerpoint', 'sql', 'r', 'gis', 'arcgis',
    # Programme and management skills
    'monitoring', 'evaluation', 'communication', 'leadership', 'report writing',
    'project management', 'budgeting', 'data analysis', 'problem solving', 'teamwork',
    # Operations
    'procurement', 'logistics', 'negotiation', 'networking', 'supervision',
    # People-facing skills
    'customer service', 'human resources', 'training', 'coaching', 'facilitation',
    # Miscellaneous
    'presentation', 'graphic design', 'research', 'documentation', 'compliance',
}

def clean_skill_phrase(phrase):
    """Tidy a candidate skill phrase, or reject it.

    The phrase is stripped of surrounding punctuation/spaces and title-cased.
    It is rejected (None is returned) when the result is shorter than three
    characters or when any of its words is a blacklisted document word.

    Args:
        phrase: Candidate phrase text.

    Returns:
        The title-cased phrase, or None if it was rejected.
    """
    blacklist = {'Terms', 'Summary', 'Position', 'Page', 'Section', 'Introduction', 'Document'}
    banned_words = {entry.lower() for entry in blacklist}

    candidate = phrase.strip('.,;:-() ').title()
    too_short = len(candidate) < 3
    if too_short:
        return None

    # Accept only when none of the phrase's words is a banned word.
    if banned_words.isdisjoint(candidate.lower().split()):
        return candidate
    return None

def extract_skills(text):
    """Extract a comma-separated list of skills from a job description.

    Two sources are combined:
      * entries of ``predefined_skills`` found in the lower-cased text, and
      * every spaCy noun chunk that survives ``clean_skill_phrase``.

    Predefined skills are matched against runs of consecutive tokens, so
    multi-word entries such as 'project management' are found as well as
    single-word ones (a single-token lookup can never match them).

    Args:
        text: The (cleaned) job description.

    Returns:
        The title-cased skills, sorted and joined with ', ', or 'Not Found'
        when nothing was extracted.
    """
    doc = nlp(text.lower())
    extracted = set()

    token_texts = [token.text for token in doc]
    # Longest predefined skill, measured in words, bounds the window size.
    max_words = max((len(skill.split()) for skill in predefined_skills), default=1)
    for size in range(1, max_words + 1):
        for start in range(len(token_texts) - size + 1):
            candidate = ' '.join(token_texts[start:start + size])
            if candidate in predefined_skills:
                extracted.add(candidate.title())

    for chunk in doc.noun_chunks:
        cleaned = clean_skill_phrase(chunk.text)
        if cleaned:
            extracted.add(cleaned)
    return ', '.join(sorted(extracted)) if extracted else 'Not Found'

## 🧠 Step 6: Extract Job Title, Location, and Posting Date

In [6]:
def extract_job_title(text):
    """Guess the job title from the first 20 lines of a job description.

    Each line is tested against a list of cue patterns ("Job title: ...",
    "We are looking for ...", "Role: ...", ...). The text following the first
    cue that matches is returned.

    Args:
        text: The (cleaned) job description.

    Returns:
        The stripped text after the matched cue, or 'Unknown' if no cue is
        found in the first 20 lines.
    """
    # Every pattern has exactly ONE capturing group (the title itself). The cue
    # alternation is non-capturing so that group(1) is valid for all patterns;
    # reading group(2) raised IndexError for every pattern but the first.
    patterns = [
        r'(?:job title|position|title|vacancy)[:\-]?\s*(.+)',
        r'we are looking for[:\-]?\s*(.+)',
        r'role[:\-]?\s*(.+)',
        r'job opening[:\-]?\s*(.+)',
        r'recruiting[:\-]?\s*(.+)'
    ]
    lines = text.strip().split('\n')[:20]
    for line in lines:
        for pattern in patterns:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                title = match.group(1).strip()
                # Skip a cue followed only by whitespace instead of returning ''.
                if title:
                    return title
    return 'Unknown'

# Place names searched for by extract_location, in priority order.
known_places = [
    'Juba', 'Wau', 'Malakal', 'Bor', 'Yambio',
    'Rumbek', 'Aweil', 'Yei', 'Torit', 'Bentiu',
    'Terekeeka', 'Kapoeta', 'Maridi', 'Tonj', 'Abyei',
    'Pibor', 'Akobo', 'Leer', 'Renk', 'Kodok',
    'Paloich', 'Tali', 'Magwi', 'Koch', 'Pariang',
]


def extract_location(text):
    """Return the first entry of ``known_places`` that occurs in ``text``.

    Matching is case-insensitive and on whole words. "First" refers to the
    order of ``known_places``, not to the position in the text.

    Args:
        text: The (cleaned) job description.

    Returns:
        The matching place name as spelled in ``known_places``, or 'Unknown'.
    """
    hits = (
        place
        for place in known_places
        if re.search(rf'\b{place}\b', text, re.IGNORECASE)
    )
    return next(hits, 'Unknown')

def extract_posting_date(text):
    """Find the first parseable date in ``text`` and return it as YYYY-MM-DD.

    Three textual shapes are searched, in this priority order:
      1. "12 May 2024"   (day, word, four-digit year)
      2. "May 12, 2024"  (word, day, comma, four-digit year)
      3. "2024-05-12"    (ISO style)

    For each shape every occurrence is tried in document order until one
    parses, so an unparseable look-alike does not hide a real date later on.

    Args:
        text: The (cleaned) job description.

    Returns:
        The date formatted as '%Y-%m-%d', or 'Unknown' if nothing parses.
    """
    date_patterns = [
        r'\b(\d{1,2}\s+\w+\s+\d{4})\b',
        r'\b(\w+\s+\d{1,2},\s+\d{4})\b',
        r'\b(\d{4}-\d{2}-\d{2})\b'
    ]
    for pattern in date_patterns:
        for match in re.finditer(pattern, text):
            try:
                # Strict parsing on purpose: with fuzzy=True a look-alike such
                # as "3 years 2019" is accepted and the missing month is filled
                # in from today's date, producing a bogus, run-dependent result.
                parsed = parser.parse(match.group(1))
            except (ValueError, OverflowError):
                # Not a real date (dateutil's parse errors derive from
                # ValueError); try the next candidate.
                continue
            return parsed.strftime('%Y-%m-%d')
    return 'Unknown'

## ✅ Step 7: Apply All Extractors

In [7]:
# Run each extractor over the cleaned text, one new column per extracted field.
df['skills'] = df['cleaned_text'].apply(extract_skills)
df['job_title'] = df['cleaned_text'].apply(extract_job_title)
df['location'] = df['cleaned_text'].apply(extract_location)
df['posting_date'] = df['cleaned_text'].apply(extract_posting_date)

## 🏢 Step 8: Add Organization, Sector, and Job Type Extraction

In [8]:
def extract_organization(text):
    """Identify the hiring organization from the first 30 lines of the text.

    Each line is scanned for the names in ``known_orgs``; the first name (in
    list order) found on the earliest matching line is returned.

    Names are matched as whole words so that short acronyms are not found
    inside unrelated words (e.g. 'IRC' inside "circumstances"). Acronyms that
    are also everyday English words (WHO, CARE, GOAL, ACTED) must additionally
    appear in upper case; all other names are matched case-insensitively.

    Args:
        text: The (cleaned) job description.

    Returns:
        The organization name as spelled in ``known_orgs``, or 'Unknown'.
    """
    known_orgs = [
        'Save the Children', 'ZOA', 'UNICEF', 'UNHCR', 'WFP', 'IRC', 'CARE', 'World Vision', 'Medair', 'ACTED',
        'Plan International', 'Danish Refugee Council', 'Oxfam', 'CAFOD', 'Caritas', 'ADRA', 'Cordaid',
        'Concern Worldwide', 'Norwegian Refugee Council', 'CTG', 'GOAL', 'GIZ', 'Mercy Corps', 'INTERSOS',
        'Catholic Relief Services', 'UNDP', 'FAO', 'WHO', 'MSF', 'IOM', 'ACF', 'War Child'
    ]
    # Lower-case "who", "care", "goal" and "acted" are ordinary words in almost
    # every advert, so these names only count when written in capitals.
    case_sensitive = {'WHO', 'CARE', 'GOAL', 'ACTED'}
    matchers = [
        (org, re.compile(rf'\b{re.escape(org)}\b', 0 if org in case_sensitive else re.IGNORECASE))
        for org in known_orgs
    ]
    lines = text.strip().split('\n')[:30]
    for line in lines:
        for org, matcher in matchers:
            if matcher.search(line):
                return org
    return 'Unknown'

def classify_sector(text):
    """Assign a sector label to a job description by keyword lookup.

    Sectors are checked in the order they are declared; the first sector with
    at least one keyword present in ``text`` (whole-word, case-insensitive)
    wins.

    Args:
        text: The (cleaned) job description.

    Returns:
        The sector name, or 'Unknown' when no keyword matches.
    """
    sectors = {
        'Education': ['school', 'education', 'teacher', 'training'],
        'Health': ['health', 'clinic', 'medical', 'nutrition', 'hiv', 'malaria'],
        'Humanitarian': ['ngo', 'unicef', 'humanitarian', 'relief', 'emergency', 'refugee'],
        'Agriculture': ['farm', 'agriculture', 'livestock', 'crop'],
        'Logistics': ['logistics', 'transport', 'fleet', 'supply chain'],
        'Finance': ['finance', 'accounting', 'audit', 'budget', 'grants', 'payroll'],
        'WASH': ['water', 'sanitation', 'hygiene'],
        'Protection': ['protection', 'gender', 'child protection', 'gbv'],
        'ICT': ['ict', 'information technology', 'systems', 'database', 'network']
    }

    def mentions(kw):
        # Whole-word, case-insensitive presence test for one keyword.
        return re.search(rf'\b{kw}\b', text, re.IGNORECASE) is not None

    matching_sectors = (
        name
        for name, keywords in sectors.items()
        if any(mentions(kw) for kw in keywords)
    )
    return next(matching_sectors, 'Unknown')

def classify_job_type(text):
    """Assign an employment-type label to a job description by keyword lookup.

    Job types are checked in the order they are declared; the first type with
    at least one keyword present in ``text`` (whole-word, case-insensitive)
    wins. Multi-word keywords match whether their words are separated by
    whitespace or by a hyphen, so 'full time' also finds "full-time" and
    'long-term' also finds "long term".

    Args:
        text: The (cleaned) job description.

    Returns:
        The job-type label, or 'Unclassified' when no keyword matches.
    """
    job_types = {
        'Full-time': ['full time', 'permanent', 'long-term', 'staff position'],
        'Part-time': ['part time', 'temporary', 'short-term', 'casual'],
        'Consultancy': ['consultant', 'consultancy', 'contract basis', 'individual contractor'],
        'Internship': ['intern', 'internship', 'trainee'],
        'Volunteer': ['volunteer', 'voluntary service']
    }
    for jtype, keywords in job_types.items():
        for kw in keywords:
            # Treat spaces and hyphens inside a keyword as interchangeable
            # separators; adverts use "full-time" and "full time" alike.
            words = re.split(r'[\s-]+', kw)
            pattern = r'\b' + r'[\s-]+'.join(re.escape(word) for word in words) + r'\b'
            if re.search(pattern, text, re.IGNORECASE):
                return jtype
    return 'Unclassified'

## 🔄 Step 9: Apply All New Extractors

In [9]:
# Derive organization, sector and job type from the cleaned text, one new column each.
df['organization'] = df['cleaned_text'].apply(extract_organization)
df['sector'] = df['cleaned_text'].apply(classify_sector)
df['job_type'] = df['cleaned_text'].apply(classify_job_type)

## 💾 Step 10: Export All Job Info

In [10]:
# Write the selected columns to a CSV file (without the index column),
# then hand that file to files.download.
df[[
    'filename', 'organization', 'job_title', 'sector', 'job_type',
    'location', 'posting_date', 'skills'
]].to_csv('final_job_data_complete.csv', index=False)
files.download('final_job_data_complete.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>