In [4]:
import pandas as pd

jobs_df = pd.read_csv("../data/jobs.csv")
jobs_df.head()


Unnamed: 0,job_id,title,description
0,1,Software Engineer,"Looking for someone skilled in Python, SQL, AW..."
1,2,Frontend Developer,"React, JavaScript, HTML/CSS needed. Bonus: Nod..."
2,3,AI Intern,"Machine learning, deep learning, TensorFlow, a..."


In [6]:
from pdfminer.high_level import extract_text
import os

resume_path = os.path.join("..", "data", "Stefanovic_CVv1.pdf")
text = extract_text(resume_path)
resume_text = text.lower()


In [9]:
import re

# Look for lines starting with "software engineer" and grab following text
experience_sections = re.findall(r'(software engineer.*?)(?=\n\n|\n[a-z]|$)', resume_text, re.DOTALL)

# Preview
for i, section in enumerate(experience_sections):
    print(f"🔹 Experience {i+1}:\n{section.strip()}\n")


🔹 Experience 1:
software engineer

🔹 Experience 2:
software engineering with concentration with ai



In [10]:
structured_experience = []

for section in experience_sections:
    lines = section.strip().split("\n")
    role = lines[0].strip() if lines else ""
    company = lines[1].strip() if len(lines) > 1 else ""
    description = "\n".join(lines[2:]).strip() if len(lines) > 2 else ""
    
    structured_experience.append({
        "role": role,
        "company": company,
        "description": description
    })


In [11]:
import json
print(json.dumps(structured_experience, indent=2))


[
  {
    "role": "software engineer",
    "company": "",
    "description": ""
  },
  {
    "role": "software engineering with concentration with ai",
    "company": "",
    "description": ""
  }
]


In [7]:
skill_keywords = [
    "python", "java", "javascript", "html", "css", "sql", "aws", "azure",
    "machine learning", "deep learning", "tensorflow", "pytorch",
    "git", "linux", "microsoft azure", "flask", "node.js", "docker",
    "data analysis", "nlp", "react", "express", "mongodb", "php"
]

matched_skills = [skill for skill in skill_keywords if skill in resume_text]


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine your resume's structured content into one text blob
resume_doc = " ".join(matched_skills) + " " + " ".join([exp['description'] for exp in structured_experience])

# Combine resume + job descriptions
documents = [resume_doc] + jobs_df['description'].tolist()

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute cosine similarity between resume (index 0) and job descriptions (1+)
similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

# Add scores to DataFrame
jobs_df['match_score'] = similarities

# Sort by best match
top_matches = jobs_df.sort_values(by='match_score', ascending=False)

# Show top 3
top_matches[['title', 'match_score']].head(3)


Unnamed: 0,title,match_score
1,Frontend Developer,0.206373
0,Software Engineer,0.101446
2,AI Intern,0.037303
