<a href="https://colab.research.google.com/github/Dorsa77/Dorsa77/blob/main/final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install pdfplumber
!pip install pytesseract
import os
import re
import csv
import json
from datetime import datetime
from dateutil import parser as dateparser
from dateutil.relativedelta import relativedelta

# Colab-friendly Resume Analyzer: mount Google Drive, parse PDFs, save CSV with matching
try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = '/content/drive/MyDrive'
except ImportError:
    BASE_PATH = os.getcwd()

# File and directory paths
PDF_DIR = os.path.join(BASE_PATH, 'resumes')
OUTPUT_DIR = os.path.join(BASE_PATH, 'result')
OUTPUT_CSV = os.path.join(OUTPUT_DIR, 'output.csv')
SKILLS_FILE = os.path.join(BASE_PATH, 'skills.json')
JOB_FILE = os.path.join(BASE_PATH, 'job_requirements.json')

# Imports for parsing
import pdfplumber
import pytesseract
from PIL import Image
import spacy
from spacy.matcher import PhraseMatcher

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Load skill list (dynamic)
def load_skills(path):
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    # default list
    return [
        'python','java','c++','c#','javascript','sql','django','flask',
        'machine learning','data analysis','excel','powerpoint','tensorflow',
        'react','node.js','aws','docker','kubernetes'
    ]
skill_list = load_skills(SKILLS_FILE)

# Build PhraseMatcher for skills
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
patterns = [nlp.make_doc(s) for s in skill_list]
matcher.add('SKILL', None, *patterns)
skill_alias = {'aws': 'AWS (Amazon Web Services)'}

# Load job requirements

def load_job_requirements(path):
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return None

job_req = load_job_requirements(JOB_FILE)

# Date patterns for experience
month = r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|' \
        r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
year = r'\d{4}'
date_patterns = [
    rf'{month} {year}\s*(?:to|-|–)\s*(?:Present|{month} {year})',
    rf'\b{year}\s*(?:to|-|–)\s*(?:Present|{year})\b'
]
combined_dates = re.compile('|'.join(date_patterns), re.IGNORECASE)

# Utility functions

def extract_text(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += (page.extract_text() or '') + '\n'
    if not text.strip():
        for page in pdfplumber.open(pdf_path).pages:
            img = page.to_image(resolution=300).original
            text += pytesseract.image_to_string(img) + '\n'
    return text


def normalize(text):
    return ' '.join(text.split())

# Field extractors

def extract_email(text):
    m = re.search(r'[\w\.-]+@[\w\.-]+', text)
    return m.group(0) if m else ''


def extract_phone(text):
    m = re.search(r'(?:\+?\d{1,3}[\s\.-]?)?(?:\(\d{2,4}\)[\s\.-]?|\d{2,4}[\s\.-]?)?\d{3,4}[\s\.-]?\d{3,4}', text)
    return m.group(0) if m else ''


def extract_name(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return ''


def extract_education(text):
    pats = [r"Bachelor(?:'s)? of [A-Za-z]+", r"Master(?:'s)? of [A-Za-z]+", r"PhD in [A-Za-z]+"]
    ed = set()
    for pat in pats:
        for m in re.findall(pat, text, re.IGNORECASE):
            ed.add(m)
    return '; '.join(ed)


def extract_skills(text):
    doc = nlp(text)
    found = set()
    for _, start, end in matcher(doc):
        skill = doc[start:end].text.lower()
        found.add(skill_alias.get(skill, skill))
    return '; '.join(sorted(found))

# Experience parsing

def parse_dates(line):
    m = combined_dates.search(line)
    if not m:
        return None, None
    date_str = m.group(0)
    parts = re.split(r'(?:to|-|–)', date_str)
    try:
        start = dateparser.parse(parts[0].strip())
    except:
        start = None
    end_str = parts[1].strip()
    if re.search(r'present', end_str, re.IGNORECASE):
        end = datetime.today()
    else:
        try:
            end = dateparser.parse(end_str)
        except:
            end = None
    return start, end


def calc_months(start, end):
    if not start or not end:
        return 0
    delta = relativedelta(end, start)
    return delta.years * 12 + delta.months


def extract_experience(raw):
    lines = raw.split('\n')
    total_months = 0
    details = []
    for ln in lines:
        ln_norm = normalize(ln)
        if combined_dates.search(ln_norm):
            start, end = parse_dates(ln_norm)
            months = calc_months(start, end)
            total_months += months
            if ' at ' in ln_norm:
                title, company = ln_norm.split(' at ', 1)
            elif ' - ' in ln_norm:
                title, company = ln_norm.split(' - ', 1)
            else:
                title, company = '', ''
            company = re.sub(combined_dates, '', company).strip()
            details.append({'line': ln_norm, 'title': title.strip(), 'company': company, 'months': months})
    return details, total_months

# Matching functions

def compute_skill_match(resume_skills, req_skills):
    if not req_skills:
        return None
    res_set = set(s.strip().lower() for s in resume_skills.split(';') if s)
    req_set = set(s.lower() for s in req_skills)
    matched = res_set & req_set
    return len(matched) / len(req_set) * 100


def compute_experience_match(total_months, req_years):
    if req_years is None:
        return None
    return min(total_months / (req_years * 12) * 100, 100)

# Main parsing and matching

def parse_resume(path):
    raw = extract_text(path)
    text = normalize(raw)
    details, total_months = extract_experience(raw)
    record = {
        'file': os.path.basename(path),
        'name': extract_name(text),
        'email': extract_email(text),
        'phone': extract_phone(text),
        'skills': extract_skills(text),
        'education': extract_education(text),
        'total_experience_months': total_months
    }
    if job_req:
        sm = compute_skill_match(record['skills'], job_req.get('required_skills', [])) or 0
        em = compute_experience_match(total_months, job_req.get('min_experience_years')) or 0
        record['skill_match_pct'] = round(sm, 2)
        record['exp_match_pct'] = round(em, 2)
        record['overall_match_pct'] = round((sm + em) / 2, 2)
    return record

# Ensure directories exist
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Process resumes and write CSV
fieldnames = ['file','name','email','phone','skills','education','total_experience_months']
if job_req:
    fieldnames += ['skill_match_pct','exp_match_pct','overall_match_pct']

rows = []
for fn in os.listdir(PDF_DIR):
    if fn.lower().endswith('.pdf'):
        rows.append(parse_resume(os.path.join(PDF_DIR, fn)))

with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f'Done! Results saved to {OUTPUT_CSV}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Done! Results saved to /content/drive/MyDrive/result/output.csv


In [36]:
!pip install streamlit pyngrok pdfplumber pytesseract pillow spacy python-dateutil
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [37]:
# نصب وابستگی‌ها
!pip install pdfplumber pytesseract pillow spacy python-dateutil

# دانلود مدل spaCy
!python -m spacy download en_core_web_sm

# (اختیاری) اگر می‌خواهی مراحل بعدی در Drive ذخیره شوند:
from google.colab import drive
drive.mount('/content/drive')


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
%%writefile resume_parser.py
import os, re, csv, json
from datetime import datetime
from dateutil import parser as dateparser
from dateutil.relativedelta import relativedelta
import pdfplumber, pytesseract, spacy
from spacy.matcher import PhraseMatcher
from PIL import Image

# اگر مونت کردی:
BASE = '/content/drive/MyDrive' if os.path.exists('/content/drive/MyDrive') else os.getcwd()
PDF_DIR = os.path.join(BASE, 'resumes')
OUT_DIR = os.path.join(BASE, 'result')
OUT_CSV = os.path.join(OUT_DIR, 'output.csv')

# spaCy و مهارت‌ها
nlp = spacy.load('en_core_web_sm')
default_skills = ['python','java','c++','c#','javascript','sql','django','flask','machine learning','data analysis','excel','powerpoint','tensorflow','react','node.js','aws','docker','kubernetes']
SKILLS = json.load(open(os.path.join(BASE,'skills.json'))) if os.path.exists(os.path.join(BASE,'skills.json')) else default_skills
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add('SKILL', None, *[nlp.make_doc(s) for s in SKILLS])

# (اختیاری) بارگذاری تنظیمات شغل از JSON
JOB_JSON = os.path.join(BASE,'job_requirements.json')
job_req = json.load(open(JOB_JSON)) if os.path.exists(JOB_JSON) else None

# توابع کمکی...
def extract_text(path):
    txt=''
    with pdfplumber.open(path) as pdf:
        for p in pdf.pages:
            txt += (p.extract_text() or '') + '\n'
    if not txt.strip():
        for p in pdfplumber.open(path).pages:
            img = p.to_image(resolution=300).original
            txt += pytesseract.image_to_string(img) + '\n'
    return txt

def normalize(t): return ' '.join(t.split())
def extract_email(t):
    m=re.search(r'[\w\.-]+@[\w\.-]+',t); return m.group(0) if m else ''
def extract_phone(t):
    m=re.search(r'(?:\+?\d{1,3}[\s\.-]?)?(?:\(\d{2,4}\)[\s\.-]?|\d{2,4}[\s\.-]?)?\d{3,4}[\s\.-]?\d{3,4}',t)
    return m.group(0) if m else ''
def extract_name(t):
    for ent in nlp(t).ents:
        if ent.label_=='PERSON': return ent.text
    return ''
def extract_education(t):
    patterns=[r\"Bachelor(?:'s)? of [A-Za-z]+\",r\"Master(?:'s)? of [A-Za-z]+\",r\"PhD in [A-Za-z]+\"]
    ed=set()
    for p in patterns: ed.update(re.findall(p,t,re.IGNORECASE))
    return '; '.join(ed)
def extract_skills(t):
    doc=nlp(t); found=set()
    for _,s,e in matcher(doc):
        span=doc[s:e].text.lower(); found.add(span)
    return '; '.join(sorted(found))

# تجربه کاری
date_rx = re.compile(r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|\d{4})\s*(?:to|-|–)\s*(?:Present|\d{4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)? \d{4}))',re.IGNORECASE)
def parse_dates(l):
    m=date_rx.search(l)
    if not m: return None,None
    a,b=re.split(r'(?:to|-|–)',m.group(0))
    try: s=dateparser.parse(a.strip())
    except: s=None
    br=b.strip()
    e=datetime.today() if re.search('present',br, re.IGNORECASE) else dateparser.parse(br)
    return s,e
def calc_months(s,e):
    if not s or not e: return 0
    d=relativedelta(e,s); return d.years*12+d.months
def extract_experience(raw):
    rows=[]; total=0
    for line in raw.split('\n'):
        ln=normalize(line)
        if date_rx.search(ln):
            s,e=parse_dates(ln); m=calc_months(s,e); total+=m
            # تفکیک عنوان/شرکت (heuristic)
            if ' at ' in ln: t,c=ln.split(' at ',1)
            elif ' - ' in ln: t,c=ln.split(' - ',1)
            else: t,c='',ln
            rows.append({'line':ln,'title':t.strip(),'company':c.strip(),'months':m})
    return rows,total

# محاسبه مطابقت
def compute_skill_match(rs,qs):
    if not qs: return None
    rset=set(s.strip().lower() for s in rs.split(';') if s)
    qset=set(s.lower() for s in qs)
    return len(rset&qset)/len(qset)*100 if qset else None
def compute_exp_match(det,years):
    if years is None: return None
    tm=sum(d['months'] for d in det)
    return min(tm/(years*12)*100,100)

def parse_resume(path):
    raw=extract_text(path); txt=normalize(raw)
    det,tot=extract_experience(raw)
    rec={'file':os.path.basename(path),'name':extract_name(txt),'email':extract_email(txt),
         'phone':extract_phone(txt),'skills':extract_skills(txt),'education':extract_education(txt),
         'total_experience_months':tot,
         'experience_lines':'; '.join(d['line'] for d in det),
         'titles':'; '.join(d['title'] for d in det),
         'companies':'; '.join(d['company'] for d in det),
         'durations_months':'; '.join(str(d['months']) for d in det)}
    if job_req:
        sm=compute_skill_match(rec['skills'],job_req['required_skills']) or 0
        em=compute_exp_match(det,job_req['min_experience_years']) or 0
        rec['skill_match_pct']=round(sm,2); rec['exp_match_pct']=round(em,2)
        rec['overall_match_pct']=round((sm+em)/2,2)
    return rec

# اطمینان از وجود پوشه‌ها
os.makedirs(PDF_DIR,exist_ok=True); os.makedirs(OUT_DIR,exist_ok=True)

# اجرا و ذخیره CSV
rows=[]
for f in os.listdir(PDF_DIR):
    if f.lower().endswith('.pdf'):
        rows.append(parse_resume(os.path.join(PDF_DIR,f)))
with open(OUT_CSV,'w',newline='',encoding='utf-8') as f:
    w=csv.DictWriter(f,fieldnames=list(rows[0].keys()) if rows else [])
    w.writeheader(); w.writerows(rows)
print("Done!")```

---

## ۳. نمایش خروجی در DataFrame

در سلول آخر بنویس و اجرا کن:

```python
import pandas as pd
df = pd.read_csv(os.path.join(BASE_PATH, 'result', 'output.csv'))
df.head(20)


Overwriting resume_parser.py


In [41]:
import os
import pandas as pd

# مسیر فایل CSV خروجی
base = '/content/drive/MyDrive' if os.path.exists('/content/drive/MyDrive') else os.getcwd()
csv_path = os.path.join(base, 'result', 'output.csv')

# بارگذاری و نمایش چند ردیف اول
df = pd.read_csv(csv_path)
df.head(20)


Unnamed: 0,file,name,email,phone,skills,education,total_experience_months,skill_match_pct,exp_match_pct,overall_match_pct
0,jose.pdf,Juan Jose Carin,juanjose.carin@gmail.com,94041 650-336,AWS (Amazon Web Services); data analysis; mach...,Master of Information,24,66.67,66.67,66.67
1,resume-computer-engineering.pdf,Phone 555-555-5555,email@email.com,555-555-5555,c#; c++,Bachelor of Computer,0,0.0,0.0,0.0
2,Graduate_Software-Engineer-Resume-Example.pdf,STACY A. BILLS,first.last@ndsu.edu,701-555,c++; java,Bachelor of Science; Master of Science,312,0.0,100.0,50.0
