# CoverLetter AI Generator

This notebook integrates LangGraph's `StateGraph` with the Groq API to generate personalized cover letters.

## Setup

In [1]:
# @title Install required packages
# NOTE(review): every install redirects stdout and stderr to /dev/null, so a failed
# install is silent here; the first visible symptom would be an ImportError in the
# imports cell. Packages are also unpinned (langgraph is force-upgraded with -U),
# so results may differ between runs — consider pinning versions.
!pip install -qq gradio > /dev/null 2>&1
!pip install -qqU langgraph > /dev/null 2>&1
!pip install -qq pymupdf python-docx pandas openpyxl typing-extensions> /dev/null 2>&1
!pip install -qq groq > /dev/null 2>&1

# Suppress Python warnings for the whole kernel session.
# NOTE(review): `logging` is imported but no logger is configured in this cell.
import warnings, logging
warnings.filterwarnings("ignore")

print("✅ Environment ready (quiet mode)")

✅ Environment ready (quiet mode)


In [2]:
# @title Imports
import os
os.environ["USER_AGENT"] = "CoverLetter_AI/1.0 (contact: pricotu@outlook.com)"
from dataclasses import dataclass
from typing import List, Optional, TypedDict, Any
from typing_extensions import TypedDict, NotRequired
from pathlib import Path
import re
import tempfile
import pandas as pd
import json

# LangGraph imports
from langgraph.graph import StateGraph, START, END

# Groq import
from groq import Groq

# Gradio
import gradio as gr

# File libs
import fitz
import docx

In [3]:
# @title Configuration
@dataclass
class Config:
    """Settings shared by the Groq client and the graph nodes.

    The two string defaults are read from the environment once, when this class
    body is executed; the numeric defaults are fixed literals.
    """
    # API key from GROQ_API_KEY, or the literal 'API_KEY_HERE' when it is unset.
    GROQ_API_KEY: str = os.environ.get('GROQ_API_KEY', 'API_KEY_HERE')
    # Model name from GROQ_MODEL, with a built-in default.
    MODEL_NAME: str = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile')
    # Default completion budget used when a caller does not pass max_tokens.
    MAX_TOKENS: int = 2500
    # Sampling temperature sent with every completion request.
    TEMPERATURE: float = 0.7


config = Config()
print('Config loaded. Model:', config.MODEL_NAME)

Config loaded. Model: llama-3.3-70b-versatile


## Backend

In [4]:
# @title Groq client wrapper (simple string output)
class GroqClient:
    """Minimal text-in/text-out wrapper around the Groq chat-completions API.

    When no usable API key is configured the client stays offline
    (``self.client is None``) and ``generate`` returns a placeholder string
    built from the prompt instead of calling the API.
    """

    # Key values that mean "not configured". 'API_KEY_HERE' is the literal
    # fallback Config uses when GROQ_API_KEY is absent from the environment;
    # unless it is treated as missing, the offline branch can never be reached.
    _UNSET_KEYS = ('', 'API_KEY_HERE')

    def __init__(self, api_key: Optional[str] = None):
        """Create the client.

        Args:
            api_key: Explicit Groq API key. Falls back to ``config.GROQ_API_KEY``
                when omitted or empty.

        Raises:
            ImportError: If a key is available but ``Groq`` is ``None``.
        """
        self.api_key = (api_key or config.GROQ_API_KEY or '').strip()
        if self.api_key in self._UNSET_KEYS:
            print('[GroqClient] Warning: GROQ_API_KEY not set; using placeholder responses.')
            self.client = None
        else:
            if Groq is None:
                raise ImportError('groq package not installed or importable.')
            self.client = Groq(api_key=self.api_key)

    def generate(self, prompt: str, system_prompt: Optional[str] = None, max_tokens: Optional[int] = None) -> str:
        """Send one chat request and return only the reply text.

        Args:
            prompt: User message.
            system_prompt: Optional system message placed before the user message.
            max_tokens: Completion budget; ``config.MAX_TOKENS`` when omitted.

        Returns:
            The model's reply. In offline mode, ``'[groq placeholder] '`` followed
            by the first 400 characters of ``prompt``. On any failure, a string
            starting with ``'Error generating response: '`` — this method does not
            raise, so callers must inspect the text to detect errors.
        """
        try:
            messages = []
            if system_prompt:
                messages.append({'role': 'system', 'content': system_prompt})
            messages.append({'role': 'user', 'content': prompt})
            if self.client is None:
                # placeholder response for offline testing
                return '[groq placeholder] ' + prompt[:400]
            response = self.client.chat.completions.create(
                model=config.MODEL_NAME,
                messages=messages,
                max_tokens=max_tokens or config.MAX_TOKENS,
                temperature=config.TEMPERATURE
            )
            content = response.choices[0].message.content
            # Honour the `-> str` contract even if the API returns no content,
            # so callers can safely call .strip() on the result.
            return content if content is not None else ''
        except Exception as e:
            return f'Error generating response: {str(e)}'

# Module-level client used by the LLM-backed helpers and graph nodes below.
# No key is passed, so GroqClient falls back to config.GROQ_API_KEY.
llm = GroqClient()


In [5]:
# @title File processing utilities
class FileProcessor:
    """Static helpers that turn uploaded files into plain text or a DataFrame."""

    @staticmethod
    def extract_text_from_pdf(path: str) -> str:
        """Return the text of every page of a PDF with whitespace runs collapsed to single spaces."""
        doc = fitz.open(path)
        try:
            text = '\n'.join(page.get_text() for page in doc)
        finally:
            # Release the document handle even if a page fails to extract.
            doc.close()
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def extract_text_from_docx(path: str) -> str:
        """Return the paragraph text of a .docx file with whitespace runs collapsed to single spaces."""
        doc = docx.Document(path)
        text = '\n'.join(p.text for p in doc.paragraphs)
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def extract_text(path: str):
        """Extract text from a .pdf, .docx or .txt file, chosen by extension.

        Args:
            path: File path; may be empty or None.

        Returns:
            A ``(text, file_name)`` tuple. ``('', '')`` when ``path`` is falsy and
            ``('', file_name)`` for any other extension. Text files are returned
            as-is (no whitespace collapsing, unlike PDF/DOCX).
        """
        if not path:
            return '', ''
        p = Path(path)
        ext = p.suffix.lower()
        if ext == '.pdf':
            return FileProcessor.extract_text_from_pdf(path), p.name
        if ext == '.docx':
            return FileProcessor.extract_text_from_docx(path), p.name
        if ext == '.txt':
            # Context manager closes the handle; errors='replace' keeps a CV saved
            # in a non-UTF-8 encoding from aborting the whole pipeline.
            with open(path, 'r', encoding='utf-8', errors='replace') as handle:
                return handle.read(), p.name
        return '', p.name

    @staticmethod
    def load_excel_data(path: str) -> pd.DataFrame:
        """Load every sheet of an Excel workbook into one DataFrame.

        Each row gets a ``__sheet__`` column naming the sheet it came from.

        Returns:
            The concatenated sheets, or an empty DataFrame when ``path`` is falsy,
            does not exist, the workbook has no sheets, or reading fails (the
            error is printed, not raised).
        """
        if not path or not os.path.exists(path):
            return pd.DataFrame()

        try:
            all_sheets = pd.read_excel(path, sheet_name=None)
            dfs = []

            for sheet, df in all_sheets.items():
                df = df.copy()
                df['__sheet__'] = sheet
                dfs.append(df)

            if dfs:
                return pd.concat(dfs, ignore_index=True)
            return pd.DataFrame()
        except Exception as e:
            print(f"Error loading Excel file: {e}")
            return pd.DataFrame()

In [6]:
# @title Function to clean Skill Lists
def clean_and_parse_skills(text: str) -> List[str]:
    """Turn an LLM reply into a clean list of skill names.

    Known introductory phrases are removed, the remainder is split on commas and
    newlines, bullet characters are stripped, and leftover preamble fragments are
    discarded.

    Args:
        text: Raw model reply; may be empty or None.

    Returns:
        At most 30 skills, in the order they appear. Entries shorter than 2 or
        longer than 59 characters are dropped.
        NOTE(review): the length filter also drops one-letter skills such as
        "C" or "R" — confirm whether that is intended.
    """
    if not text:
        return []

    cleanup_phrases = [
        "Here are the extracted skills as a comma-separated list:",
        "Here are the professional skills extracted",
        "Here are the skills:",
        "Skills:",
        "The skills are:"
    ]

    cleaned_text = text.strip()
    for phrase in cleanup_phrases:
        cleaned_text = cleaned_text.replace(phrase, "").strip()

    # Split on commas AND newlines so a reply mixing both styles does not yield
    # entries with embedded line breaks.
    candidates = re.split(r'[,\n]', cleaned_text)

    # Preamble detection works on whole words: a plain prefix test on "the" or
    # "here" would also discard real skills such as "Theano" or "Thermodynamics".
    preamble = re.compile(r'^(?:here|the|these|there|extract\w*)\b', re.IGNORECASE)

    cleaned_skills = []
    for raw in candidates:
        skill = raw.strip().strip('•').strip('-').strip()
        if 1 < len(skill) < 60 and not preamble.match(skill):
            cleaned_skills.append(skill)

    return cleaned_skills[:30]

In [7]:
# @title Define the shared State schema for StateGraph
class State(TypedDict, total=False):
    """Shared state handed from node to node in the LangGraph workflow.

    Every key is optional; nodes read with ``state.get(...)`` and return dicts
    holding only the keys they produce.
    """
    # Inputs set by run_pipeline and values derived by the nodes
    cv_text: NotRequired[str]  # text extracted from the uploaded CV
    candidate_name: NotRequired[str]  # NOTE: run_pipeline stores the CV *file name* here
    job_text: NotRequired[str]  # raw job posting pasted into the UI
    job_description: NotRequired[str]  # written by job_extraction_node
    company: NotRequired[str]  # written by job_extraction_node
    title: NotRequired[str]  # written by job_extraction_node
    applications_df: NotRequired[Any]  # pandas DataFrame
    skills_summary: NotRequired[str]  # not written by any node in this notebook
    cv_skills: NotRequired[list]  # list of str from cv_skills_extractor_node
    job_skills: NotRequired[list]  # list of str from job_skills_extractor_node
    matched_skills: NotRequired[list]  # dicts parsed from the matcher's JSON reply
    unmatched_skills: NotRequired[list]  # dicts, or plain job-skill strings when matching fails
    matching_score: NotRequired[int]  # not written by any node in this notebook
    history_note: NotRequired[str]  # written by history_node
    cover_letter: NotRequired[str]  # written by writer_node
    cv_relevant_sections: NotRequired[str]  # Output from cv_section_extractor_node
    overall_skill_match: NotRequired[int]   # Output from skill_matching_node; computed with round(..., 2), so it can be a float


In [8]:
# @title Agent functions

def extract_candidate_info(cv_text: str) -> str:
    """Return the candidate's name (ideally as 'Name - Title') for a CV.

    The LLM is asked first, using only the first 1000 characters of the CV. If
    that call raises, or returns an empty, error or offline-placeholder string,
    the first of the CV's first five lines that is 6-49 characters long is used
    instead.

    NOTE: a function with this same name is defined again later in the notebook
    (after run_pipeline); once that cell runs, the later definition is the one
    in effect.

    Args:
        cv_text: Full CV text; may be empty or None.

    Returns:
        The extracted name, or "Name not found".
    """
    if not cv_text:
        return "Name not found"

    system = "Extract the candidate's full name and professional title from this CV. Return in format: 'Name - Title'"
    prompt = f"Extract name and title from this CV:\n{cv_text[:1000]}"

    try:
        response = (llm.generate(prompt, system_prompt=system, max_tokens=100) or '').strip()
    except Exception:
        response = ''

    # GroqClient.generate reports failures (and offline mode) as text instead of
    # raising, so those strings must be recognised here or they would be shown
    # as the candidate's name.
    failure_prefixes = ('Error generating response:', '[groq placeholder]')
    if response and not response.startswith(failure_prefixes):
        return response

    # Fallback: look for a name-sized line near the top of the CV
    for line in cv_text.split('\n')[:5]:
        candidate = line.strip()
        if 5 < len(candidate) < 50:
            return candidate
    return "Name not found"


###############################################################################
def job_extraction_node(state: State) -> State:
    """Extract company, title and description from the raw job posting.

    The LLM is asked for a JSON object. The object is located inside the reply
    even when the model wraps it in prose or code fences. If no usable object is
    found, a heuristic takes the first non-blank line as the company, the second
    as the title and the rest as the description.

    Reads: ``job_text``.
    Returns: a dict with ``company``, ``title`` and ``job_description``.
    """
    job_text = state.get('job_text', '') or ''

    system = '''Extract company name, job title, and clean description from the job posting.
    Return as JSON with keys: company, title, description. Return valid JSON only.'''

    prompt = f"Job posting text:\n{job_text[:8000]}"

    resp = llm.generate(prompt, system_prompt=system, max_tokens=800)

    # Try to parse a JSON object out of the response
    parsed = None
    try:
        json_match = re.search(r'\{.*\}', resp or '', re.DOTALL)
        parsed = json.loads(json_match.group(0) if json_match else resp)
    except (TypeError, ValueError):
        parsed = None

    if isinstance(parsed, dict):
        # `or` (rather than a .get default) also covers keys present with null/empty values.
        return {
            'company': str(parsed.get('company') or 'Unknown Company').strip(),
            'title': str(parsed.get('title') or 'Unknown Position').strip(),
            'job_description': str(parsed.get('description') or job_text[:2000]).strip()
        }

    # Fallback to simple heuristics, skipping blank lines so a posting that
    # starts with an empty line does not produce an empty company name.
    raw_lines = job_text.split('\n')
    content_idx = [i for i, line in enumerate(raw_lines) if line.strip()]

    company = raw_lines[content_idx[0]].strip() if content_idx else 'Unknown Company'
    title = raw_lines[content_idx[1]].strip() if len(content_idx) > 1 else 'Unknown Position'
    description = ''
    if len(content_idx) > 2:
        description = '\n'.join(raw_lines[content_idx[1] + 1:]).strip()
    if not description:
        description = job_text[:2000].strip()

    return {
        'company': company,
        'title': title,
        'job_description': description
    }


###############################################################################
def history_node(state: State) -> State:
    """Look up earlier applications to the current company.

    Reads: ``applications_df`` (DataFrame or None) and ``company``.
    Returns: a dict with a single ``history_note`` string describing up to three
    matching rows, or explaining why no lookup was possible.

    The company column is the first string-named column whose name contains
    'company', 'business', 'employer' or 'organization'; if none does, the first
    string-named column is used. Matching is a case-insensitive literal
    substring test.
    """
    df = state.get('applications_df')
    company = str(state.get('company') or '').strip()

    if df is None or df.empty:
        return {'history_note': 'No application history provided'}

    # An empty search string is contained in every value and would report the
    # whole history as matches.
    if not company:
        return {'history_note': 'No company name available to check application history'}

    # Find company-like columns
    cols = [c for c in df.columns if isinstance(c, str)]
    keywords = ('company', 'business', 'employer', 'organization')
    company_cols = [c for c in cols if any(word in c.lower() for word in keywords)]

    if not company_cols:
        company_cols = cols[:1]

    if not company_cols:
        return {'history_note': 'No company column found in application history'}

    company_col = company_cols[0]

    # Search for company matches
    try:
        # regex=False: names such as "C++ Corp" or "Acme (UK)" contain regex
        # metacharacters and must be matched literally.
        mask = df[company_col].astype(str).str.contains(company, case=False, na=False, regex=False)
        matches = df[mask]

        if matches.empty:
            return {'history_note': f'No previous applications found for {company}'}

        history_text = f'Found {len(matches)} previous application(s) to {company}:\n'
        for _, row in matches.head(3).iterrows():
            date = row.get('Date', 'Unknown date')
            response = row.get('Response', 'No response recorded')
            history_text += f"- {date}: {response}\n"

        return {'history_note': history_text}
    except Exception as e:
        return {'history_note': f'Error checking application history: {str(e)}'}



###############################################################################
def writer_node(state: State) -> State:
    """Draft the cover letter from the accumulated state.

    Reads ``company``, ``title``, ``job_description`` and ``cv_text`` (the last
    two cut to 1600 characters), the first ten ``matched_skills`` and
    ``history_note``. Returns a dict with the model's reply under ``cover_letter``.
    """
    company = state.get('company', 'Unknown Company')
    title = state.get('title', 'Unknown Position')
    job_desc = state.get('job_description', '')[:1600]
    cv_text = state.get('cv_text', '')[:1600]
    history = state.get('history_note', '')

    # Matched skills may be dicts (from the matcher) or plain values; reduce
    # the top ten to their names.
    top_skills = [
        entry.get('skill', str(entry)) if isinstance(entry, dict) else str(entry)
        for entry in state.get('matched_skills', [])[:10]
    ]
    skills_line = ', '.join(top_skills)

    system_msg = '''You are an expert career consultant. Write a professional cover letter using the AIDA framework (Attention, Interest, Desire, Action).
    Keep it concise (3-4 paragraphs, 250-350 words). Make it personalized and compelling.'''

    user_msg = f"""Write a cover letter for:
Position: {title}
Company: {company}

Job Description: {job_desc}

Candidate Background: {cv_text}

Key Matching Skills: {skills_line}

Application History: {history}

Create a personalized, professional cover letter that highlights relevant experience and enthusiasm for the role."""

    return {'cover_letter': llm.generate(user_msg, system_prompt=system_msg, max_tokens=1000)}


###############################################################################
def cv_section_extractor_node(state: State) -> State:
    """Ask the model to pull the skills-relevant sections out of the CV.

    Reads ``cv_text`` (sent in full) and returns the model's labelled reply
    under ``cv_relevant_sections``.
    """
    resume = state.get('cv_text', '')

    instructions = """Extract the most relevant sections from this CV for skills analysis:
    1. Skills/Technical Skills section
    2. Work Experience section
    3. Education/Certifications section
    4. Soft Skills section

    Return only the text content from these sections, clearly labeled."""

    request = f"""Extract and return the following sections from this CV:

{resume}

Format as:
SKILLS SECTION:
[content]

WORK EXPERIENCE:
[content]

CERTIFICATIONS:
[content]

SOFT SKILLS:
[content]"""

    sections = llm.generate(request, system_prompt=instructions, max_tokens=1500)
    return {'cv_relevant_sections': sections}


###############################################################################
def job_skills_extractor_node(state: State) -> State:
    """List the skills a job description asks for.

    Reads the first 6000 characters of ``job_description``, asks the model for a
    comma-separated list and returns the parsed result under ``job_skills``.
    """
    posting = state.get('job_description', '')[:6000]

    instructions = """Extract 25-30 required professional skills from this job description. Focus on:
    1. Technical requirements (programming languages, tools, frameworks)
    2. Methodologies (Agile, DevOps, etc.)
    3. Domain expertise
    4. Soft skills requirements

    Return ONLY a clean comma-separated list. No introductory text."""

    request = f"""Extract required skills from this job description:

{posting}

Return exactly as: skill1, skill2, skill3, etc."""

    raw_reply = llm.generate(request, system_prompt=instructions, max_tokens=400)

    # Normalise the free-text reply into a list of skill names
    return {'job_skills': clean_and_parse_skills(raw_reply)}


###############################################################################
def cv_skills_extractor_node(state: State) -> State:
    """List the skills evidenced by the extracted CV sections.

    Reads ``cv_relevant_sections``, asks the model for a comma-separated list
    and returns the parsed result under ``cv_skills``.
    """
    sections = state.get('cv_relevant_sections', '')

    instructions = """Extract 40 - 50 professional skills from these CV sections. Look for:
    1. Explicitly listed skills
    2. Technologies mentioned in work experience
    3. Programming languages and tools
    4. Certifications and training
    5. Soft skills mentioned

    Return ONLY a clean comma-separated list. No introductory text."""

    request = f"""Extract skills from these relevant CV sections:

{sections}

Return exactly as: skill1, skill2, skill3, etc."""

    raw_reply = llm.generate(request, system_prompt=instructions, max_tokens=800)

    # Normalise the free-text reply into a list of skill names
    return {'cv_skills': clean_and_parse_skills(raw_reply)}


###############################################################################
def skill_matching_node(state: State) -> State:
    """Score each required job skill against the candidate's CV.

    Reads ``job_skills``, ``cv_relevant_sections`` and ``cv_skills``; asks the
    model for a JSON array of per-skill assessments.

    Returns a dict with:
        matched_skills: assessment dicts the model flagged as a match.
        unmatched_skills: the remaining assessment dicts.
        overall_skill_match: matched / required skills as a percentage,
            rounded to 2 decimals and capped at 100.
    If the reply cannot be parsed, every job skill is reported as unmatched
    (as plain strings) with a score of 0 and the error is printed.
    """
    job_skills = state.get('job_skills', [])
    cv_sections = state.get('cv_relevant_sections', '')
    cv_skills = state.get('cv_skills', [])

    skills_str = "\n".join([f"- {s}" for s in job_skills])

    prompt = f"""You are an expert recruiter analyzing skill matches between job requirements and a candidate's CV.

JOB REQUIRED SKILLS:
{skills_str}

CANDIDATE'S RELEVANT CV SECTIONS:
{cv_sections}

CANDIDATE'S EXTRACTED SKILLS:
{', '.join(cv_skills)}

For each job skill, determine:
1. Does the candidate have this skill (based on CV evidence)?
2. Similarity score (0-100) based on direct mentions, related experience, or transferable skills
3. Brief reason explaining the match/mismatch

Return valid JSON array:
[
  {{"skill": "Python", "similarity": 95, "match": true, "reason": "Listed in skills section and used in multiple projects"}},
  {{"skill": "Kubernetes", "similarity": 30, "match": false, "reason": "No direct mention, but has Docker experience"}}
]"""

    def _is_match(entry: dict) -> bool:
        """Interpret the 'match' flag, tolerating string values such as "false"."""
        flag = entry.get("match", False)
        if isinstance(flag, str):
            return flag.strip().lower() in ("true", "yes")
        return bool(flag)

    try:
        response = (llm.generate(prompt, max_tokens=1200) or '').strip()
        if not response.startswith('['):
            # Extract the JSON array if the model added surrounding text
            json_match = re.search(r'\[.*\]', response, re.DOTALL)
            if json_match:
                response = json_match.group(0)

        results = json.loads(response)
        if not isinstance(results, list):
            raise ValueError('expected a JSON array of skill assessments')

        # Ignore stray non-object entries instead of discarding the whole reply.
        results = [r for r in results if isinstance(r, dict)]

        matched = [r for r in results if _is_match(r)]
        unmatched = [r for r in results if not _is_match(r)]

        # Cap at 100: the model may return more (or duplicate) entries than
        # there are required skills.
        if job_skills:
            overall_match = min(100.0, round((len(matched) / len(job_skills)) * 100, 2))
        else:
            overall_match = 0

        return {
            "matched_skills": matched,
            "unmatched_skills": unmatched,
            "overall_skill_match": overall_match
        }

    except Exception as e:
        print(f"Error in skill matching: {e}")
        return {
            "matched_skills": [],
            "unmatched_skills": job_skills,
            "overall_skill_match": 0
        }

In [9]:
# @title Build StateGraph for the Agents
# Graph whose nodes all read from and write to the shared State schema.
builder = StateGraph(State)

# Register each processing function under the name the edges below refer to.
builder.add_node("job_extraction", job_extraction_node)
builder.add_node("cv_section_extraction", cv_section_extractor_node)
builder.add_node("job_skills_extraction", job_skills_extractor_node)
builder.add_node("cv_skills_extraction", cv_skills_extractor_node)
builder.add_node("skill_matching", skill_matching_node)
builder.add_node("history_check", history_node)
builder.add_node("cover_writer", writer_node)

# Wire the nodes into one linear chain (no branching or parallel steps):
#   START -> job_extraction -> cv_section_extraction -> job_skills_extraction
#         -> cv_skills_extraction -> skill_matching -> history_check
#         -> cover_writer -> END
# Order matters because later nodes read keys written by earlier ones, e.g.
# job_skills_extraction reads 'job_description' and history_check reads
# 'company', both produced by job_extraction.
builder.add_edge(START, "job_extraction")
builder.add_edge("job_extraction", "cv_section_extraction")
builder.add_edge("cv_section_extraction", "job_skills_extraction")
builder.add_edge("job_skills_extraction", "cv_skills_extraction")
builder.add_edge("cv_skills_extraction", "skill_matching")
builder.add_edge("skill_matching", "history_check")
builder.add_edge("history_check", "cover_writer")
builder.add_edge("cover_writer", END)

# Compile into the runnable object that run_pipeline calls via workflow.invoke(...)
workflow = builder.compile()
print("LangGraph workflow compiled successfully")


LangGraph workflow compiled successfully


In [10]:
# @title MAIN PIPELINE
def _upload_path(upload) -> str:
    """Return the filesystem path of an upload given as a path string or an object with ``.name``."""
    if not upload:
        return ''
    return str(getattr(upload, 'name', upload))


def _skill_names(skill_data) -> list:
    """Reduce skill entries (assessment dicts or plain values) to display strings."""
    names = []
    for item in skill_data or []:
        if isinstance(item, dict):
            # Prefer the usual keys; fall back to the dict's repr if neither is usable.
            item = item.get('skill') or item.get('name') or item
        names.append(str(item).strip())
    return names


def run_pipeline(cv_file, excel_file, job_text_input):
    """Run the full cover-letter workflow for one set of UI inputs.

    Args:
        cv_file: Uploaded CV (path string or object exposing ``.name``).
        excel_file: Optional uploaded application-history workbook, same forms.
        job_text_input: Job posting text pasted by the user.

    Returns:
        A 4-tuple ``(summary, skills_summary, cover_letter, download)`` matching
        the four UI outputs. ``download`` is always None. On invalid input or any
        failure the first element carries the error message and the next two are
        empty strings; this function does not raise.
    """
    try:
        print("🚀 Starting pipeline...")

        # Validate both required inputs before doing any file or LLM work.
        if not cv_file:
            print("⚠️ No CV file provided")
            return "Error: No CV file provided", "", "", None

        job_text = (job_text_input or "").strip()
        print(f"📝 Job text: {len(job_text)} characters")
        if not job_text:
            return "Error: No job description provided", "", "", None

        # Initialize state
        initial_state = State()

        # Process CV file
        cv_text, cv_filename = FileProcessor.extract_text(_upload_path(cv_file))
        print(f"📄 CV processed: {len(cv_text)} characters from {cv_filename}")
        if not cv_text.strip():
            # Unsupported extension or a file with no extractable text (e.g. a scanned PDF).
            return f"Error: Could not extract any text from {cv_filename}", "", "", None
        initial_state['cv_text'] = cv_text
        # The State key is named candidate_name, but the value stored is the CV file name.
        initial_state['candidate_name'] = cv_filename

        # Process Excel file
        if excel_file:
            df = FileProcessor.load_excel_data(_upload_path(excel_file))
            initial_state['applications_df'] = df
            print(f"📊 Excel processed: {len(df)} rows")
        else:
            initial_state['applications_df'] = pd.DataFrame()
            print("⚠️ No Excel file provided")

        initial_state['job_text'] = job_text

        # Run the workflow
        print("🔄 Running LangGraph workflow...")
        final_state = workflow.invoke(initial_state)

        # Extract results
        company = final_state.get('company', 'Unknown')
        title = final_state.get('title', 'Unknown')
        cv_filename = final_state.get('candidate_name', 'Unknown')
        overall_skill_match = final_state.get('overall_skill_match', 0)
        history_note = final_state.get('history_note', 'No history checked')
        cover_letter = final_state.get('cover_letter', 'No cover letter generated')

        # All four lists are coerced to strings so the joins below cannot fail.
        matched_skills = _skill_names(final_state.get('matched_skills', []))
        unmatched_skills = _skill_names(final_state.get('unmatched_skills', []))
        cv_skills = _skill_names(final_state.get('cv_skills', []))
        job_skills = _skill_names(final_state.get('job_skills', []))

        # Extract candidate name from CV
        try:
            candidate_name = extract_candidate_info(final_state.get('cv_text', ''))
        except Exception:
            candidate_name = "Name not found"

        # Create summary
        summary = f"""📋 APPLICATION SUMMARY

📄 CV File: {cv_filename}
👤 Candidate: {candidate_name}
🏢 Company: {company}
💼 Position: {title}
📊 Skills Match: {overall_skill_match}%

📈 Application History:
{history_note}

🎯 Ready for application submission!"""

        # Create skills summary
        skills_summary = f"""🎯 SKILLS ANALYSIS

✅ YOUR SKILLS ({len(cv_skills)}):
{', '.join(cv_skills) if cv_skills else 'None extracted'}

🎯 JOB REQUIREMENTS ({len(job_skills)}):
{', '.join(job_skills) if job_skills else 'None extracted'}

💪 MATCHED SKILLS ({len(matched_skills)}):
{', '.join(matched_skills) if matched_skills else 'None matched'}

⚠️ SKILLS TO DEVELOP ({len(unmatched_skills)}):
{', '.join(unmatched_skills[:20]) if unmatched_skills else 'None identified'}

📊 OVERALL MATCH: {overall_skill_match}% ({len(matched_skills)}/{len(job_skills)} skills)"""

        print("✅ Pipeline completed successfully!")

        return summary, skills_summary, cover_letter, None

    except Exception as e:
        error_msg = f"❌ Pipeline error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:")
        traceback.print_exc()
        return error_msg, "", "", None

# Helper function for candidate name extraction
def extract_candidate_info(cv_text: str) -> str:
    """Return the candidate's name (ideally as 'Name - Title') for a CV.

    The LLM is asked first, using only the first 1000 characters of the CV. If
    that call raises, or returns an empty, error or offline-placeholder string,
    the first of the CV's first five lines that is 6-49 characters long is used
    instead.

    NOTE: a function with this same name is also defined earlier, in the
    "Agent functions" cell; this later definition replaces it once this cell runs.

    Args:
        cv_text: Full CV text; may be empty or None.

    Returns:
        The extracted name, or "Name not found".
    """
    if not cv_text:
        return "Name not found"

    system = "Extract the candidate's full name and professional title from this CV. Return in format: 'Name - Title'"
    prompt = f"Extract name and title from this CV:\n{cv_text[:1000]}"

    try:
        response = (llm.generate(prompt, system_prompt=system, max_tokens=100) or '').strip()
    except Exception:
        response = ''

    # GroqClient.generate reports failures (and offline mode) as text instead of
    # raising, so those strings must be recognised here or they would be shown
    # as the candidate's name.
    failure_prefixes = ('Error generating response:', '[groq placeholder]')
    if response and not response.startswith(failure_prefixes):
        return response

    # Fallback: look for a name-sized line near the top of the CV
    for line in cv_text.split('\n')[:5]:
        candidate = line.strip()
        if 5 < len(candidate) < 50:
            return candidate
    return "Name not found"

## Interface

In [11]:
# Gradio UI
def create_interface():
    """Assemble the Gradio Blocks app and return it without launching it.

    Layout: two file uploads beside a job-description box, a generate button,
    summary and skills panels side by side, the cover-letter box, and a hidden
    download slot. The button runs ``run_pipeline`` on the three inputs and
    writes its four return values to the four outputs.
    """
    soft_theme = gr.themes.Soft()

    with gr.Blocks(title="🤖 AI Cover Letter Generator", theme=soft_theme) as demo:
        gr.Markdown("""
        # 🤖 AI Cover Letter Generator
        ### Powered by LangGraph & Groq API

        Upload your CV and job application history, then paste the job description to generate a personalized cover letter.
        """)

        # Inputs: uploads on the left, job text on the right
        with gr.Row():
            with gr.Column(scale=1):
                cv_upload = gr.File(label="📄 Upload CV (PDF/DOCX/TXT)", file_types=['.pdf', '.docx', '.txt'])
                excel_upload = gr.File(label="📊 Upload Application History (XLSX)", file_types=['.xlsx', '.xls'])

            with gr.Column(scale=2):
                job_input = gr.Textbox(
                    label="📝 Job Description",
                    placeholder="Paste the Company, Position and complete job description here...",
                    lines=12,
                )

        generate_btn = gr.Button("🚀 Generate Cover Letter", variant="primary", size="lg")

        # Outputs: two read-only panels, then the letter itself
        with gr.Row():
            summary_output = gr.Textbox(label="📋 Summary", lines=10, interactive=False)
            skills_output = gr.Textbox(label="🎯 Skills Analysis", lines=10, interactive=False)

        cover_output = gr.Textbox(label="✍️ Generated Cover Letter", lines=15, show_copy_button=True)
        download_output = gr.File(label="💾 Download Cover Letter", visible=False)

        # Event handler
        pipeline_inputs = [cv_upload, excel_upload, job_input]
        pipeline_outputs = [summary_output, skills_output, cover_output, download_output]
        generate_btn.click(fn=run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

    return demo


## Main execution

In [12]:
# Build the UI, then start the Gradio server.
# share=True makes Gradio publish a temporary public *.gradio.live URL (shown in
# this cell's output), so anyone holding the link can reach the app and submit CVs.
# NOTE(review): show_api=False presumably hides the auto-generated API page — confirm
# against the installed Gradio version.
demo = create_interface()
demo.launch(share=True, show_api=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4a300e4c6a3bf24033.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


