# **Agentic Job Targeting Workflow for Data Science**

Cassandra Maldonado

In [1]:
import re
import json
from typing import Dict, List, Any
from collections import Counter
from dataclasses import dataclass, asdict
from datetime import datetime

try:
    from yattag import Doc, indent
    import PyPDF2
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "yattag", "PyPDF2", "nltk"])
    from yattag import Doc, indent
    import PyPDF2
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

# NLTK.
# Fetch the tokenizer and stop-word corpora. A failure here only prints a
# notice and lets the notebook continue. Catch Exception rather than using a
# bare except so Ctrl-C and interpreter shutdown are not swallowed.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception:
    print("NLTK data skipped.")

In [2]:
# Structure for the resume sections.
@dataclass
class ResumeSection:
    """Plain container for the pieces of a resume.

    Declared as a dataclass so instances can be built with keyword
    arguments (as ``run_workflow`` does) and read back attribute by
    attribute. Without the decorator the annotations below create no
    constructor parameters and ``ResumeSection(name=...)`` raises TypeError.
    """

    name: str                         # candidate's display name
    email: str                        # contact e-mail address
    phone: str                        # contact phone number
    summary: str                      # professional summary paragraph
    experience: List[Dict[str, Any]]  # one dict per job entry
    education: List[Dict[str, str]]   # one dict per degree entry
    skills: List[str]                 # flat list of skill names
    projects: List[Dict[str, Any]]    # one dict per project entry

# Resume formatter that creates a better output from the data.
class Resumeformatter:

    def __init__(self):
        pass

    def formatting_resume(self, resume_data: Dict, job_requirements: Dict, template='professional') -> str:

        # Cleaning the data.
        enhanced_data = self.info_resume_data(resume_data, job_requirements)

        # Generating HTML with a template.
        html_content = self.html_with_template(enhanced_data, template)

        return html_content

# Clean and improve the resume info.
    def info_resume_data(self, resume_data: Dict, job_requirements: Dict) -> Dict:
        enhanced = {}

        # Name.
        enhanced['name'] = self.cleanning_name(resume_data.get('name', 'Professional'))

        # Contact info.
        enhanced['email'] = self.clean_email(resume_data.get('email', ''))
        enhanced['phone'] = self.clean_phone(resume_data.get('phone', ''))

        # Improved summary with job keywords.
        enhanced['summary'] = self._summary(
            resume_data.get('summary', ''),
            job_requirements
        )

        # Organizing experience.
        enhanced['experience'] = self.cleaning_experience(resume_data.get('experience', []))

        # Organizing education.
        enhanced['education'] = self.cleaning_education(resume_data.get('education', []))

        # Skills.
        enhanced['skills'] = self.organize_skills(
            resume_data.get('skills', []),
            job_requirements
        )

        # Projects.
        enhanced['projects'] = self.cleaning_projects(resume_data.get('projects', []))

        return enhanced

# Cleaning the name field.
    def cleanning_name(self, name: str) -> str:
        """Return a tidy display name, or the built-in fallback name.

        An empty value or the placeholder 'candidate' (any casing) yields
        the fallback. When the value contains '@' or '(' it is assumed that
        contact details leaked into it: whitespace-separated pieces holding
        '@' or '(' and purely numeric pieces are discarded, and at most the
        first four remaining pieces are kept.
        """
        fallback = "CASSANDRA M. SULLIVAN"
        if not name or name.lower() == 'candidate':
            return fallback

        name = name.strip()

        if '@' in name or '(' in name:
            kept = [
                piece for piece in name.split()
                if not ('@' in piece or '(' in piece or piece.isdigit())
            ]
            name = ' '.join(kept[:4])

        return name or fallback

# Cleaning the email.
    def clean_email(self, email: str) -> str:
        """Return the e-mail part of a contact string, or '' if implausible.

        Only the text before the first '|' is considered. It is accepted
        when it contains both '@' and '.'; no further validation is done.
        """
        if not email:
            return ""

        candidate = email.partition('|')[0].strip()
        plausible = '@' in candidate and '.' in candidate
        return candidate if plausible else ""

# Cleaning the phone.
    def clean_phone(self, phone: str) -> str:
        """Strip everything but phone-number characters from ``phone``.

        Digits, parentheses, '-', '.', '+' and whitespace are kept; all other
        characters are removed and the result is trimmed. An empty input
        returns ''.
        """
        if not phone:
            return ""
        return re.sub(r'[^\d\(\)\-\.\s\+]', '', phone).strip()

# Improving the summary with the job keywords.
    def _summary(self, summary: str, job_requirements: Dict) -> str:
        if not summary or len(summary) < 50:
            summary = """Data scientist with proven expertise in machine learning and generative AI,
            skilled in developing automated evaluation pipelines and fine-tuning LLMs. Brings robust
            experience in Python, ML/NLP libraries, and A/B testing to optimize model performance at scale.
            Recognized for translating complex datasets into actionable, real-world insights."""

        # Cleaning up the summary.
        if len(summary) > 500:
            sentences = summary.split('.')
            clean_sentences = []
            for sentence in sentences[:3]:
                sentence = sentence.strip()
                if (len(sentence) > 20 and len(sentence) < 200 and
                    not any(x in sentence.lower() for x in ['phone', 'email', 'education', 'experience'])):
                    clean_sentences.append(sentence)

            if clean_sentences:
                summary = '. '.join(clean_sentences) + '.'

        # Adding the job relevant info.
        job_skills = job_requirements.get('skills', []) + job_requirements.get('tools', [])
        if job_skills:
            top_skills = ', '.join(job_skills[:3])
            if 'generative ai' not in summary.lower() and any('ai' in skill.lower() for skill in job_skills):
                summary += f" Specialized in {top_skills} with focus on autonomous systems and model deployment."

        return summary

# Cleaning the experience.
    def cleaning_experience(self, experience: List[Dict]) -> List[Dict]:
        """Normalise job entries and drop unusable ones.

        An entry is skipped when its company is missing or equals the
        placeholder 'Previous Organization', or when the trimmed company or
        title is empty. Achievements are kept only if they are strings longer
        than 15 characters once trimmed; leading bullet characters
        ('•', '-', '*', space) are removed from them.

        Returns:
            A list of dicts with the keys title, company, duration and
            achievements.
        """
        cleaned = []

        for exp in experience:
            company = exp.get('company')
            if not company or company == 'Previous Organization':
                continue

            entry = {
                'title': exp.get('title', '').strip(),
                'company': company.strip(),
                'duration': exp.get('duration', '').strip(),
                'achievements': [
                    item.strip().lstrip('•-* ')
                    for item in exp.get('achievements', [])
                    if isinstance(item, str) and len(item.strip()) > 15
                ],
            }

            if entry['company'] and entry['title']:
                cleaned.append(entry)

        return cleaned

# Cleaning education.
    def cleaning_education(self, education: List[Dict]) -> List[Dict]:
        """Keep education entries that carry a real degree name.

        An entry qualifies when its 'degree' is present and longer than five
        characters. Every kept entry is rebuilt with the keys school, degree,
        date and details, each trimmed and defaulting to ''.
        """
        return [
            {
                'school': edu.get('school', '').strip(),
                'degree': edu.get('degree', '').strip(),
                'date': edu.get('date', '').strip(),
                'details': edu.get('details', '').strip(),
            }
            for edu in education
            if edu.get('degree') and len(edu.get('degree', '')) > 5
        ]

# Organizing the skills into categories.
    def organize_skills(self, skills: List[str], job_requirements: Dict) -> Dict[str, List[str]]:
        """Group a flat list of skills into display categories.

        Each skill goes to the first category (in declaration order) that has
        a matching keyword. Keywords longer than two characters match as
        substrings of the lower-cased skill. One- and two-letter keywords
        ('r', 'ai', 'ml') must equal a whole alphanumeric token of the skill;
        as substrings, 'r' would claim almost every skill ("TensorFlow",
        "Docker") and 'ml' would claim "HTML". Skills that match nothing fall
        back to 'Programming & Scripting' when shorter than 20 characters and
        are dropped otherwise.

        Args:
            skills: Skill names. Blank entries and single non-letter
                characters are ignored; single letters such as "R" are kept.
            job_requirements: Not used by this method; kept so the signature
                stays compatible with existing callers.

        Returns:
            Mapping of category name to its skills, de-duplicated in
            first-seen order. Categories without skills are omitted.
        """
        # Keyword mappings; declaration order decides which category wins.
        category_keywords = {
            'Programming & Scripting': ['python', 'r', 'sql', 'git', 'jupyter', 'pandas', 'numpy', 'matplotlib'],
            'Machine Learning & AI': ['machine learning', 'ai', 'ml', 'tensorflow', 'pytorch', 'xgboost', 'random forest', 'neural', 'nlp', 'generative', 'llm', 'deep learning'],
            'Data Analysis & Visualization': ['tableau', 'visualization', 'dashboard', 'analytics', 'streamlit', 'matplotlib'],
            'Statistical Methods': ['statistics', 'bayesian', 'time series', 'forecasting', 'causal inference', 'a/b testing'],
            'Cloud & Infrastructure': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'snowflake']
        }
        categories = {category: [] for category in category_keywords}

        for skill in skills:
            skill_clean = skill.strip() if skill else ''
            # Keep one-letter languages ("R", "C"); drop blanks and stray symbols.
            if not skill_clean or (len(skill_clean) < 2 and not skill_clean.isalpha()):
                continue

            skill_lower = skill_clean.lower()
            # Whole words of the skill, used for the very short keywords.
            tokens = set(re.findall(r'[a-z0-9]+', skill_lower))

            for category, keywords in category_keywords.items():
                if any(
                    keyword in tokens if len(keyword) <= 2 else keyword in skill_lower
                    for keyword in keywords
                ):
                    categories[category].append(skill_clean)
                    break
            else:
                # Default category for short uncategorized skills.
                if len(skill_clean) < 20:
                    categories['Programming & Scripting'].append(skill_clean)

        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the rendered resume is identical from run to run.
        return {
            category: list(dict.fromkeys(skill_list))
            for category, skill_list in categories.items()
            if skill_list
        }

# Cleaning project entries.
    def cleaning_projects(self, projects: List[Dict]) -> List[Dict]:
        """Normalise project entries and drop those without a usable name.

        A project is kept when its 'name' is present and longer than three
        characters. Text fields are trimmed; the description additionally
        loses leading bullet characters ('•', '-', '*', space) and is cut to
        300 characters plus '...' when longer. 'technologies' is passed
        through unchanged (defaulting to an empty list).
        """
        cleaned = []

        for project in projects:
            name = project.get('name')
            if not name or len(name) <= 3:
                continue

            title = name.strip()
            duration = project.get('duration', '').strip()
            description = project.get('description', '').strip()

            # Cleaning the description.
            if description:
                description = description.lstrip('•-* ')
                if len(description) > 300:
                    description = description[:300] + '...'

            cleaned.append({
                'name': title,
                'duration': duration,
                'description': description,
                'technologies': project.get('technologies', []),
            })

        return cleaned

# Generating the HTML with a template.
    def html_with_template(self, data: Dict, template: str) -> str:
        return self.setting_template()(data)

    def setting_template(self):
        def template(data):
            html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{data["name"]} - Resume</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, sans-serif;
            line-height: 1.6;
            color: #333;
            background: #f8f9fa;
            padding: 20px;
        }}
        .container {{
            max-width: 900px;
            margin: 0 auto;
            background: white;
            border-radius: 10px;
            overflow: hidden;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
        }}
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 40px;
            text-align: center;
        }}
        .header h1 {{
            font-size: 2.8em;
            font-weight: 300;
            margin-bottom: 10px;
            letter-spacing: 2px;
        }}
        .header p {{
            font-size: 1.1em;
            opacity: 0.9;
        }}
        .content {{ padding: 40px; }}
        .section {{ margin-bottom: 40px; }}
        .section h2 {{
            color: #667eea;
            font-size: 1.4em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 2px solid #667eea;
            font-weight: 600;
        }}
        .summary {{
            font-size: 1.1em;
            line-height: 1.8;
            color: #555;
            background: #f8f9fa;
            padding: 25px;
            border-radius: 8px;
            border-left: 4px solid #667eea;
        }}
        .experience-item, .education-item, .project-item {{
            margin-bottom: 30px;
            padding: 25px;
            background: #f8f9fa;
            border-radius: 8px;
            border-left: 4px solid #28a745;
        }}
        .job-title, .degree, .project-title {{
            font-size: 1.2em;
            font-weight: 600;
            color: #333;
            margin-bottom: 8px;
        }}
        .company, .school {{
            font-weight: 600;
            color: #28a745;
            margin-bottom: 5px;
        }}
        .duration {{
            color: #666;
            font-style: italic;
            margin-bottom: 15px;
        }}
        .achievements {{
            list-style: none;
            margin-top: 15px;
        }}
        .achievements li {{
            margin: 10px 0;
            padding-left: 20px;
            position: relative;
        }}
        .achievements li:before {{
            content: "▶";
            color: #28a745;
            font-weight: bold;
            position: absolute;
            left: 0;
        }}
        .skills-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
            gap: 25px;
        }}
        .skill-category {{
            background: #f8f9fa;
            padding: 20px;
            border-radius: 8px;
            border-left: 4px solid #667eea;
        }}
        .category-title {{
            font-weight: 600;
            color: #333;
            margin-bottom: 15px;
            font-size: 1.1em;
        }}
        .skill-list {{
            display: flex;
            flex-wrap: wrap;
            gap: 8px;
        }}
        .skill-tag {{
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            padding: 6px 14px;
            border-radius: 20px;
            font-size: 0.9em;
            font-weight: 500;
        }}
        .project-description {{
            margin: 15px 0;
            line-height: 1.7;
        }}
        .technologies {{
            margin-top: 12px;
        }}
        .tech-label {{
            font-weight: 600;
            color: #667eea;
        }}
        .tech-list {{
            color: #666;
        }}
        @media print {{
            body {{ background: white; padding: 0; }}
            .container {{ box-shadow: none; }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <header class="header">
            <h1>{data["name"]}</h1>
            <p>'''

            contact_parts = []
            if data.get("phone"):
                contact_parts.append(data["phone"])
            if data.get("email"):
                contact_parts.append(data["email"])
            html += " | ".join(contact_parts)

            html += f'''</p>
            <div class="links">
                <a href="https://github.com/CassandraMaldonado" target="_blank">🔗 GitHub</a>
                <a href="https://www.linkedin.com/in/cassandra-msullivan/" target="_blank">🔗 LinkedIn</a>
            </div>
        </header>

        <div class="content">'''

            # Summary section.
            if data.get("summary"):
                html += f'''
            <section class="section">
                <h2>Professional Summary</h2>
                <div class="summary">{data["summary"]}</div>
            </section>'''

            # Experience section.
            if data.get("experience"):
                html += '''
            <section class="section">
                <h2>Professional Experience</h2>'''
                for exp in data["experience"]:
                    html += f'''
                <div class="experience-item">
                    <div class="job-title">{exp.get("title", "")}</div>
                    <div class="company">{exp.get("company", "")}</div>
                    <div class="duration">{exp.get("duration", "")}</div>'''
                    if exp.get("achievements"):
                        html += '<ul class="achievements">'
                        for achievement in exp["achievements"]:
                            html += f'<li>{achievement}</li>'
                        html += '</ul>'
                    html += '</div>'
                html += '</section>'

            # Education section.
            if data.get("education"):
                html += '''
            <section class="section">
                <h2>Education</h2>'''
                for edu in data["education"]:
                    html += f'''
                <div class="education-item">
                    <div class="degree">{edu.get("degree", "")}</div>
                    <div class="school">{edu.get("school", "")}</div>
                    <div class="duration">{edu.get("date", "")}</div>'''
                    if edu.get("details"):
                        html += f'<div style="margin-top: 10px; color: #666;">{edu["details"]}</div>'
                    html += '</div>'
                html += '</section>'

            # Projects section.
            if data.get("projects"):
                html += '''
            <section class="section">
                <h2>Key Projects</h2>'''
                for project in data["projects"]:
                    html += f'''
                <div class="project-item">
                    <div class="project-title">{project.get("name", "")}</div>'''
                    if project.get("duration"):
                        html += f'<div class="duration">{project["duration"]}</div>'
                    if project.get("description"):
                        html += f'<div class="project-description">{project["description"]}</div>'
                    if project.get("technologies"):
                        html += f'''<div class="technologies">
                            <span class="tech-label">Technologies:</span>
                            <span class="tech-list">{", ".join(project["technologies"])}</span>
                        </div>'''
                    html += '</div>'
                html += '</section>'

            # Skills section.
            if data.get("skills"):
                html += '''
            <section class="section">
                <h2>Technical Skills</h2>
                <div class="skills-grid">'''
                for category, skills in data["skills"].items():
                    if skills:
                        html += f'''
                    <div class="skill-category">
                        <div class="category-title">{category}</div>
                        <div class="skill-list">'''
                        for skill in skills:
                            html += f'<span class="skill-tag">{skill}</span>'
                        html += '</div></div>'
                html += '</div></section>'

            html += '''
        </div>
    </div>
</body>
</html>'''
            return html
        return template

# Creating the file upload widget.
def upload_resume_widget():
    """Obtain the resume file name, via Colab upload or a local lookup.

    When ``google.colab`` can be imported, its upload dialog is shown and the
    first uploaded file name is returned. Otherwise the current directory is
    searched for files ending in .pdf, .docx or .txt (case-insensitive) and
    the first one listed is returned.

    Returns:
        The file name, or None when nothing was uploaded or found.
    """
    try:
        from google.colab import files
        print("Upload your resume (PDF, DOCX, or TXT).")
        uploaded = files.upload()
        return next(iter(uploaded.keys())) if uploaded else None
    except ImportError:
        # Not running in Colab: fall back to whatever is in the working directory.
        print("Please upload your resume file.")
        import os
        candidates = [
            entry for entry in os.listdir('.')
            if entry.lower().endswith(('.pdf', '.docx', '.txt'))
        ]
        if not candidates:
            return None
        print(f"Found: {candidates}")
        return candidates[0]

# Parsing the resume.
def resume_parser(file_path: str) -> str:
    """Extract plain text from a resume file.

    .pdf files are read page by page with PyPDF2 (a newline is added after
    each page); .txt files are read as UTF-8. Any other extension yields ''.
    Any exception raised while reading is reported with ``print`` and also
    yields ''.

    Args:
        file_path: Path to the resume; the extension check ignores case.

    Returns:
        The extracted text, or '' when unsupported or unreadable.
    """
    try:
        extension_source = file_path.lower()
        if extension_source.endswith('.pdf'):
            import PyPDF2
            with open(file_path, 'rb') as handle:
                pages = PyPDF2.PdfReader(handle).pages
                return "".join(page.extract_text() + "\n" for page in pages)
        if extension_source.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as handle:
                return handle.read()
        return ""
    except Exception as e:
        print(f"Error reading file: {e}.")
        return ""

# Getting the job description from the user.
def interactive_job_input():
    """Read a multi-line job description from standard input.

    Lines are collected until one equals 'DONE' (case-insensitive,
    surrounding whitespace ignored) or until input ends. End of input is
    treated like 'DONE' so that piped or closed stdin returns what was read
    instead of crashing with EOFError.

    Returns:
        The collected lines joined with newlines (possibly ''), or None if
        the user interrupts with Ctrl-C.
    """
    print("\n Paste the job description.")
    print("When finished, type 'DONE' on a new line:")

    lines = []
    while True:
        try:
            line = input()
        except KeyboardInterrupt:
            return None
        except EOFError:
            # stdin ran out without a DONE marker; keep what we have.
            break
        if line.strip().upper() == 'DONE':
            break
        lines.append(line)

    return "\n".join(lines)

def format_resume(resume_data, job_requirements, output_filename=None):
    """Render the resume to an HTML file and print a short report.

    Args:
        resume_data: Either a dict of resume fields or an object exposing
            them as attributes (anything with a ``__dict__``).
        job_requirements: Either a dict or an object with ``skills``,
            ``tools`` and ``keywords`` attributes.
        output_filename: Target path; when falsy a timestamped
            ``datascience_resume_<YYYYmmdd_HHMMSS>.html`` name is used.

    Returns:
        The name of the file that was written.
    """
    if not output_filename:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f"datascience_resume_{stamp}.html"

    formatter = Resumeformatter()

    def as_mapping(source, defaults):
        # Objects are flattened attribute by attribute; dicts pass through.
        if hasattr(source, '__dict__'):
            return {key: getattr(source, key, fallback) for key, fallback in defaults}
        return source

    # Converting the resume object to dict.
    resume_dict = as_mapping(resume_data, (
        ('name', ''),
        ('email', ''),
        ('phone', ''),
        ('summary', ''),
        ('experience', []),
        ('education', []),
        ('skills', []),
        ('projects', []),
    ))

    # Converting the job requirements to dict.
    job_dict = as_mapping(job_requirements, (
        ('skills', []),
        ('tools', []),
        ('keywords', []),
    ))

    # HTML.
    html_content = formatter.formatting_resume(resume_dict, job_dict, 'professional')

    # Saving.
    with open(output_filename, 'w', encoding='utf-8') as out:
        out.write(html_content)

    # The report reflects the raw input, not the cleaned data.
    raw_skills = resume_dict.get('skills', {})
    skill_report = len(raw_skills) if isinstance(raw_skills, dict) else 'Multiple'

    print(f"New resume generated: {output_filename}")
    print(f"Name: {resume_dict.get('name', 'Unknown')}")
    print(f"Experience entries: {len(resume_dict.get('experience', []))}")
    print(f"Education entries: {len(resume_dict.get('education', []))}")
    print(f"Projects: {len(resume_dict.get('projects', []))}")
    print(f"Skill categories: {skill_report}")

    return output_filename

In [4]:
def run_workflow():
    """Run the end-to-end resume targeting workflow interactively.

    Steps: obtain and parse the resume file, read the job description from
    the user, assemble a ``ResumeSection`` from hard-coded entries that are
    enabled by marker phrases found in the resume text, derive job
    requirements from the description, write the HTML resume and print
    interview talking points.

    Job-description keywords of one or two letters (i.e. 'ai') are matched as
    whole words; a plain substring test would fire on words such as
    "training" or "maintain". Longer keywords keep substring matching.

    Returns:
        ``(resume_data, job_requirements, output_file)`` on success, or None
        when no resume file, no resume text or no job description is
        available.
    """
    print("Resume workflow.")
    print("-" * 50)

    # Getting the resume.
    resume_file = upload_resume_widget()
    if not resume_file:
        return None

    # Parsing the text.
    resume_text = resume_parser(resume_file)
    if not resume_text:
        return None

    print(f"Extracted {len(resume_text)} characters from the file.")

    # Getting the job description.
    jd_text = interactive_job_input()
    if not jd_text:
        return None

    # Manual extraction based on known content.
    experience = []
    education = []

    # Looking for specific text chunks that contain job info.
    text_lower = resume_text.lower()

    # Mexico Central Bank job.
    if 'mexico central bank' in text_lower:
        print("Found Mexico Central Bank job.")
        experience.append({
            "title": "Machine Learning & Data Science Economist",
            "company": "Mexico Central Bank",
            "duration": "May 2022 - Sep 2024",
            "achievements": [
                "Developed forecasting models (XGBoost, ARIMA, Prophet) that improved regional economic predictions by 15%.",
                "Applied causal inference (DiD, matching) and A/B testing to evaluate the impact of policy interventions.",
                "Conducted incrementality analysis to isolate true effects of programs on regional growth.",
                "Automated SQL pipelines while integrating generative AI tools to streamline data evaluation processes.",
                "Presented findings in policy briefings and national economic reports."
            ]
        })

    # Universidad job.
    if 'universidad' in text_lower and 'nuevo león' in text_lower:
        print("Found Universidad job.")
        experience.append({
            "title": "Consultant",
            "company": "Universidad Autónoma de Nuevo León (University-Industry Relations Office)",
            "duration": "Sep 2021 - May 2022",
            "achievements": [
                "Designed and deployed Python-based KPI dashboards to track cross-departmental performance, contributing to $500K MXN in operational savings.",
                "Leveraged A/B testing and uplift modeling in data analysis to isolate the incremental effects of pricing strategies on customer conversion."
            ]
        })

    # Secretary job.
    if 'secretary' in text_lower and 'finance' in text_lower:
        print("Found Secretary of Finance job.")
        experience.append({
            "title": "Financial Planning Analyst",
            "company": "Secretary of Finance and General Treasury of Nuevo León (State Gov. Office)",
            "duration": "Jun 2020 - Aug 2021",
            "achievements": [
                "Developed R-based forecasting models that improved budget accuracy by 23%.",
                "Automated SQL pipelines to streamline debt tracking, reducing servicing costs by 0.6%.",
                "Utilized causal inference (synthetic controls, quasi-experiments) to evaluate the impact of fiscal reforms.",
                "Facilitated evidence-based decision-making through real-time financial reporting."
            ]
        })

    # Education.
    if 'university of chicago' in text_lower:
        print("Found University of Chicago job.")
        education.append({
            "school": "University of Chicago, Physical Sciences Division",
            "degree": "Master of Science, Applied Data Science",
            "date": "Dec 2025",
            "details": "GPA: Magna Cum Laude • Data Science Institute Merit Scholarship"
        })

    if 'instituto' in text_lower and 'monterrey' in text_lower:
        print("Found Monterrey job.")
        education.append({
            "school": "Instituto Tecnológico y de Estudios Superiores de Monterrey",
            "degree": "Bachelor of Arts, Economics",
            "date": "Dec 2019",
            "details": "GPA: Magna Cum Laude"
        })

    print(f"\n Extraction Results:")
    print(f"Experience entries: {len(experience)}.")
    print(f"Education entries: {len(education)}.")

    # Creating the resume.
    resume_data = ResumeSection(
        name="CASSANDRA M. SULLIVAN",
        email="caseymr96@gmail.com",
        phone="(415) 286-1896",
        summary="""Data scientist with proven expertise in machine learning and generative AI, skilled in developing automated evaluation pipelines and fine-tuning LLMs. Brings robust experience in Python, ML/NLP libraries, and A/B testing to optimize model performance at scale. Recognized for translating complex datasets into actionable, real-world insights and integrating advanced ML techniques within quality assurance frameworks.""",
        experience=experience,
        education=education,
        skills=[
            "Python", "R", "SQL", "STATA", "Git", "Jupyter", "Pandas", "NumPy",
            "Matplotlib", "Scikit-Learn", "Machine Learning", "Deep Learning",
            "Generative AI", "LLMs", "VAEs", "Classification", "Regression",
            "XGBoost", "Random Forest", "A/B Testing", "NLP", "TensorFlow",
            "PyTorch", "Statistics", "Bayesian Inference", "Causal Inference",
            "Time Series Forecasting", "Tableau", "Streamlit"
        ],
        projects=[
            {
                "name": "Healthcare LLM (Inference Analytics)",
                "duration": "Mar 2025 - Present",
                "description": "Collaborating on a healthcare-specialized LLM fine-tuned with Reinforcement Learning from EHRs and clinical notes. Focused on prompt engineering, dataset preparation, and reward modeling.",
                "technologies": ["Python", "Reinforcement Learning", "LLM", "EHRs"]
            },
            {
                "name": "AirfareCast: Airline Fare Forecasting",
                "duration": "Mar 2025",
                "description": "Built machine learning models (XGBoost, Random Forest) to predict flight prices. Deployed an interactive Streamlit dashboard to visualize fare trends and optimize booking decisions.",
                "technologies": ["Python", "XGBoost", "Random Forest", "Streamlit"]
            },
            {
                "name": "Conditional VAE for Age-Controlled Face Generation",
                "duration": "Jan 2025",
                "description": "Designed a Conditional Variational Autoencoder (CVAE) to simulate age-based facial transformations. Implemented data preprocessing and achieved latent space disentanglement.",
                "technologies": ["Python", "CVAE", "TensorFlow", "Computer Vision"]
            },
            {
                "name": "Customer Behavior Analysis Pipeline",
                "duration": "Nov 2024",
                "description": "Built a SQL-Python pipeline to analyze 6.75M+ e-commerce records using A/B testing and time series analysis.",
                "technologies": ["Python", "SQL", "A/B Testing", "Time Series"]
            },
            {
                "name": "John List Voltage Research Program",
                "duration": "Sep 2024 - Present",
                "description": "Used Bayesian modeling and causal inference to quantify real-world treatment effects in behavioral experiments, evaluated incremental outcomes from randomized interventions.",
                "technologies": ["Python", "Bayesian Modeling", "Causal Inference", "R"]
            }
        ]
    )

    print(f"\n Job Requirements Analysis.")
    print("-" * 40)

    job_requirements = {
        'skills': [],
        'tools': [],
        'keywords': []
    }

    jd_lower = jd_text.lower()

    def mentions(keyword):
        # Very short keywords must stand alone as a word; longer ones may
        # appear anywhere in the description.
        if len(keyword) <= 2:
            return re.search(rf'\b{re.escape(keyword)}\b', jd_lower) is not None
        return keyword in jd_lower

    # Extracting ML/AI skills from the job description.
    ml_keywords = ['machine learning', 'deep learning', 'ai', 'generative ai', 'neural networks', 'nlp', 'autonomous driving', 'generative models', 'synthetic data']
    tech_keywords = ['python', 'tensorflow', 'pytorch', 'sql', 'docker', 'kubernetes']

    job_requirements['skills'].extend(
        keyword.title() for keyword in ml_keywords if mentions(keyword)
    )
    job_requirements['tools'].extend(
        keyword.title() for keyword in tech_keywords if mentions(keyword)
    )

    # Extracting keywords: words of four or more letters, minus stop words.
    words = re.findall(r'\b[a-zA-Z]{4,}\b', jd_lower)
    stop_words = {'the', 'and', 'for', 'with', 'that', 'this', 'will', 'from', 'they', 'have', 'your', 'our'}
    word_counts = Counter(w for w in words if w not in stop_words)

    # Getting the most common keywords (only those seen more than once).
    job_requirements['keywords'] = [
        word for word, count in word_counts.most_common(15) if count > 1
    ]

    print(f"Found job skills: {job_requirements['skills']}")
    print(f"Found job tools: {job_requirements['tools']}")
    print(f"Found keywords: {job_requirements['keywords'][:5]}...")

    beautiful_file = format_resume(resume_data, job_requirements)

    # Generating the interview talking points.
    print(f"\n Interview Prep.")
    print("-" * 40)

    interview_points = interview_talking_points(resume_data, job_requirements)

    print(f"New resume: {beautiful_file}")
    print(f"{len(resume_data.experience)} experience entries extracted.")
    print(f"{len(resume_data.projects)} projects extracted.")
    print(f"{len(resume_data.education)} education entries extracted.")
    print(f"{len(interview_points)} interview talking points generated.")

    print(f"\n Interview talking points.")
    print("-" * 50)
    for i, point in enumerate(interview_points, 1):
        print(f"\n{i}. {point}")

    return resume_data, job_requirements, beautiful_file

def interview_talking_points(resume_data, job_requirements):
    """Assemble the list of interview talking points for the targeted job.

    resume_data is accepted for a uniform call signature but is not read here.
    job_requirements is a dict; only its 'skills' and 'keywords' entries are used.
    Returns a list of strings in a fixed order: technical, core, keyword-driven, closing.
    """

    def keyword_mentions(term):
        # True when any extracted job keyword contains the term (case-insensitive).
        return any(term in entry.lower() for entry in job_requirements.get('keywords', []))

    # Technical expertise: only offered when the job lists at least one skill.
    technical = []
    required_skills = job_requirements.get('skills', [])
    if required_skills:
        headline_skills = ', '.join(required_skills[:3])
        technical = [
            f"Technical Deep Dive: Prepare a detailed walkthrough of your {headline_skills} experience. Use your Healthcare LLM project as an example, explain the technical approach, challenges with EHR data, and how you implemented Reinforcement Learning for fine-tuning.",
        ]

    # Always included: problem-solving, career progression and collaboration.
    core = [
        "Problem-Solving Examples: Prepare STAR method stories for: 1) Your Mexico Central Bank forecasting models that improved predictions by 15%, what obstacles did you overcome? 2) The Universidad KPI dashboard that saved $500K MXN, how did you identify the key metrics? 3) A time when your initial ML approach failed and how you pivoted.",
        "Career Progression Story: Highlight your growth from Financial Planning Analyst to ML & Data Science Economist. Emphasize: increasing technical complexity (R forecasting → Python ML pipelines → Generative AI), expanding scope (budget accuracy → policy impact → healthcare applications), and leadership development.",
        "Stakeholder Communication: Prepare examples of: • Presenting complex ML findings to policy makers at Mexico Central Bank • Translating technical limitations to university administrators • Working with cross-departmental teams on KPI dashboards • How you balance technical accuracy with business needs.",
    ]

    # Keyword-driven extras: research first, then autonomous systems.
    keyword_driven = []
    if keyword_mentions('research'):
        keyword_driven.append("Research Excellence: Discuss your John List Voltage Research Program work with Bayesian modeling and causal inference. Explain how you bridge academic research with practical applications, and your approach to experimental design in behavioral studies.")
    if keyword_mentions('autonomous'):
        keyword_driven.append("Autonomous Systems Insight: Connect your generative AI and ML pipeline experience to autonomous driving challenges. Discuss data quality issues you've solved, model validation approaches from your forecasting work, and how your A/B testing expertise applies to autonomous system evaluation.")

    # Always included at the end: business impact and questions for the interviewer.
    closing = [
        "Quantified Business Impact: Prepare specific metrics: • 15% improvement in economic predictions (Mexico Central Bank) • $500K MXN operational savings (Universidad) • 23% budget accuracy improvement (Treasury) • 6.75M+ records analyzed (Customer Behavior project) • Multiple ongoing research projects showing sustained impact.",
        "Strategic Questions: Ask thoughtful questions like: • 'What are the biggest challenges in scaling generative models for autonomous systems?' • 'How does the team balance research exploration with production requirements?' • 'What's your approach to ensuring safety and reliability in ML model deployment?' • 'How do you measure success for generative AI initiatives in this domain?'",
    ]

    return technical + core + keyword_driven + closing

def quick_resume_workflow():
    """Shortcut that runs run_workflow() and passes its result straight through."""
    workflow_result = run_workflow()
    return workflow_result

def print_usage_instructions():
    """Print the one-line hint that tells the user how to start the workflow."""
    usage_hint = "Run: run_workflow()"
    print(usage_hint)

# Print the usage hint when this cell is executed.
print_usage_instructions()

Run: run_workflow()


In [5]:
# Start the interactive workflow; the value it returns is echoed as the cell output.
run_workflow()

Resume workflow.
--------------------------------------------------
Upload your resume (PDF, DOCX, or TXT).


Saving Cassandra_Sullivan_DataScience.pdf to Cassandra_Sullivan_DataScience.pdf
Extracted 4406 characters from the file.

 Paste the job description.
When finished, type 'DONE' on a new line:
 Define and execute the ML roadmap for synthetic data generation using generative AI, evolving both model and infrastructure to meet the training and evaluation needs of Zoox’s autonomous driving solution. Lead the development of generative models from small scale objects to complete scenarios, from research all the way to deployment. Design effective model architectures and sophisticated training techniques, leveraging all the inputs from our sensor stack and the overall large scale data we have at Zoox. Collaborate with perception, planning, safety, simulation, and systems teams to integrate your models into our offline pipelines. Validate and optimize your solutions using real-world driving scenarios, directly contributing to the safety and reliability of Zoox's autonomous system.
DONE
Found Me

(ResumeSection(name='CASSANDRA M. SULLIVAN', email='caseymr96@gmail.com', phone='(415) 286-1896', summary='Data scientist with proven expertise in machine learning and generative AI, skilled in developing automated evaluation pipelines and fine-tuning LLMs. Brings robust experience in Python, ML/NLP libraries, and A/B testing to optimize model performance at scale. Recognized for translating complex datasets into actionable, real-world insights and integrating advanced ML techniques within quality assurance frameworks.', experience=[{'title': 'Machine Learning & Data Science Economist', 'company': 'Mexico Central Bank', 'duration': 'May 2022 - Sep 2024', 'achievements': ['Developed forecasting models (XGBoost, ARIMA, Prophet) that improved regional economic predictions by 15%.', 'Applied causal inference (DiD, matching) and A/B testing to evaluate the impact of policy interventions.', 'Conducted incrementality analysis to isolate true effects of programs on regional growth.', 'Automate

In [7]:
#!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 81.7 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [11]:
import difflib

import fitz
from bs4 import BeautifulSoup

# Inputs for the comparison: the uploaded resume and the generated HTML version.
ORIGINAL_RESUME_PDF = "Cassandra_Sullivan_DataScience.pdf"
TAILORED_RESUME_HTML = "datascience_resume_20250805_225420.html"
# Cap on how many diff lines are printed so the cell output stays readable.
MAX_DIFF_LINES = 300

# My original resume in PDF.
# Opening the document as a context manager closes it once the text is extracted,
# instead of leaving the PyMuPDF handle open for the rest of the session.
with fitz.open(ORIGINAL_RESUME_PDF) as doc:
    original_text = "\n".join(page.get_text() for page in doc)

# Data science tailored resume in html.
with open(TAILORED_RESUME_HTML, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
    tailored_text = soup.get_text()

# Unified diff.
# lineterm="" because splitlines() already stripped the newlines from both inputs.
diff = difflib.unified_diff(
    original_text.splitlines(),
    tailored_text.splitlines(),
    fromfile="Original Resume (PDF)",
    tofile="Tailored Resume (HTML)",
    lineterm="",
)

# Materialize the generator so it can be sliced; an empty diff prints the fallback text.
diff_output = list(diff)
print("\n".join(diff_output[:MAX_DIFF_LINES]) or "No changes.")

--- Original Resume (PDF)
+++ Tailored Resume (HTML)
@@ -1,68 +1,92 @@
- 
- 
-CASSANDRA M. SULLIVAN 
-+ 1 (415) 286-1896 | cassandramr@uchicago.edu | in/cassandra-msullivan |  Github
-Data scientist with experience in generative AI and large-scale ML systems. I’ve built A/B testing frameworks and automated 
-evaluation pipelines that made it easier to track model performance and translate results into decisions in research, product or 
-policy. 
-WORK EXPERIENCE 
-Mexico Central Bank (Banco de Mexico) 
-May 2022 - Sep 2024 
-Machine Learning & Data Science Economist                                                                                                                Monterrey, Mexico 
-• Developed forecasting models (XGBoost, ARIMA, Prophet) that improved regional economic predictions by 15%. 
-• Applied causal inference (DiD, matching) and A/B testing to evaluate the impact of policy interventions. 
-• Conducted incrementality analysis to isolate true effects of programs on r

## Summary of Resume Changes

These modifications were made to my resume for the Apple Ads – Data Scientist, Research Analytics role:

### Structural & Visual Enhancements
- Improved the resume format using HTML with sections, categorized skill tags and a more readable layout.
- Improved skimmability with bolded headers, bullet groupings and cleaner spacing.

### Professional Summary
- Rewritten to highlight ML/LLM expertise, A/B testing and the ability to translate data into operational impact.
- Expanded from two lines to a detailed summary focused on automated pipelines, generative AI and QA integration.

### Experience Section
- Removed redundant details such as location headers.
- Quantified outcomes more clearly; for example, "$500K MXN in savings" became "a 0.6% cost reduction".
- Made achievements more results and impact oriented.
- Removed phrasing like "aligning with large-scale model automation principles" to streamline bullets.

### Education
- Degree names were reformatted from "Master in Applied Data Science" to "Master of Science, Applied Data Science" for consistency with U.S. academic formats.
- Location information was removed.

### Projects
- Grouped technologies used into clearly labeled stacks.
- Added concise project descriptions.
- Kept all the original projects but improved their wording.

### Technical Skills
- Split and categorized technical skills into four areas:
  - Programming & Scripting
  - ML & AI
  - Visualization
  - Statistical Methods
- Replaced bullet lists with tag-style labels, improving ATS compatibility.

### Contact
- Added LinkedIn and GitHub links with visual icons.
- Used a header and contact info in a cleaner design format.

These refinements align my resume with Apple’s expectations for experimentation, LLM experience, model automation and business impact communication.