In [21]:
import os
import re
import pathlib as Path
from dotenv import load_dotenv
from typing import List, Literal, Optional
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
import pickle

In [2]:

def extract_latex_dependencies(main_file_path):
    """Collect the .tex files imported by a main LaTeX file, tagged by resume section.

    The file is scanned in document order for ``\\resumesection{Name}``,
    ``\\input{file}``, ``\\include{file}`` and ``\\subimport{dir}{file}``.
    Every import is recorded together with the most recent ``\\resumesection``
    name seen before it ("Uncategorized" when none has been seen yet).

    LaTeX comments (an unescaped ``%`` up to the end of the line) are removed
    before scanning, so commented-out imports are not reported.

    Args:
        main_file_path: Path to the main .tex file. Imported paths are resolved
            relative to this file's directory.

    Returns:
        A list of ``{"section": str, "full_path": str}`` dicts, one per import,
        in document order. ``full_path`` is normalised and always ends in ".tex".
        If the main file does not exist, a dict ``{"error": "<message>"}`` is
        returned instead of a list (kept for backward compatibility) -- callers
        should check for it before iterating.
    """
    base_dir = os.path.dirname(main_file_path)

    try:
        with open(main_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        return {"error": f"File not found at {main_file_path}"}

    # Strip LaTeX comments so that e.g. "% \input{old}" is not treated as a live
    # import. A '%' preceded by an odd number of backslashes is an escaped
    # percent sign ("\%") and is kept; an even run ("\\%") is a line break
    # followed by a real comment, so the backslashes are kept and the comment dropped.
    content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', content)

    # Regex captures: command, arg1, optional arg2
    # Matches: \resumesection{Name}, \input{file}, \subimport{dir}{file}
    pattern = re.compile(r'\\(input|include|subimport|resumesection)\{([^}]+)\}(?:\{([^}]+)\})?')

    dependencies = []
    current_section = "Uncategorized"  # Default if no section is found before import

    for match in pattern.finditer(content):
        command = match.group(1)
        arg1 = match.group(2).strip()
        arg2 = match.group(3).strip() if match.group(3) else None

        # A section header only updates the running state; it produces no entry.
        if command == 'resumesection':
            current_section = arg1
            continue

        # \subimport{dir}{file} joins both arguments; every other import command
        # uses only its first argument.
        if command == 'subimport' and arg2:
            combined_path = os.path.join(base_dir, arg1, arg2)
        else:
            combined_path = os.path.join(base_dir, arg1)

        full_path = os.path.normpath(combined_path)
        if not full_path.endswith('.tex'):
            full_path += ".tex"

        dependencies.append({"section": current_section, "full_path": full_path})

    return dependencies

In [4]:

# --- Schema 1: For files where we ALREADY know the section ---
# Structured-output schema used by the "known section" chain in
# enrich_file_metadata (passed to llm.with_structured_output).
# NOTE(review): documented with comments rather than a class docstring on the
# assumption that pydantic would fold a docstring into the schema sent to the
# model -- confirm before adding one.
class TitleOnly(BaseModel):
    # Required field (`...` means no default); the description text is part of
    # the schema, so editing it changes what the model is asked for.
    title: str = Field(
        ..., 
        description="A concise, identity-focused title for this file's content (e.g., 'AI Researcher - Univ of Isfahan'). Do not include the section header."
    )

# --- Schema 2: For 'Uncategorized' files where we need BOTH ---
# Structured-output schema used by the "Uncategorized" chain in
# enrich_file_metadata: the model must return a title and pick one category.
# NOTE(review): documented with comments rather than a class docstring on the
# assumption that pydantic would fold a docstring into the schema sent to the
# model -- confirm before adding one.
class TitleAndCategory(BaseModel):
    # Required free-text title.
    title: str = Field(
        ..., 
        description="A concise, identity-focused title for this file's content."
    )
    # Required; the Literal restricts the answer to this closed set of labels.
    # enrich_file_metadata copies the chosen value into both the "category" and
    # the "section" keys of the entry.
    category: Literal["CONTACT_INFORMATION","PROFILE", "WORK_EXPERIENCE", "EDUCATION", "PROJECTS", "SKILLS", "OTHER"] = Field(
        ...,
        description="The standardized category of this section determined from the content."
    )

def enrich_file_metadata(
    file_list: List[dict],
    gemini_api_key: Optional[str] = None,
    min_request_interval: float = 0.0,
) -> List[dict]:
    """Add an LLM-generated title (and, if needed, a category) to each file entry.

    For every entry the referenced file is read and its first 3000 characters
    are sent to Gemini:

    * entries whose ``section`` is "Uncategorized" get ``title`` and
      ``category``, and ``section`` is overwritten with the category;
    * all other entries get ``title`` only (no ``category`` key is added).

    Entries whose file is missing, unreadable or blank get
    ``title = "Empty File"`` and are not sent to the model. Entries whose LLM
    call fails get an ``error`` key with the exception text and no title.

    The input dicts are never modified; the result contains shallow copies.

    Args:
        file_list: Entries as produced by ``extract_latex_dependencies``
            (dicts with ``section`` and ``full_path``).
        gemini_api_key: API key. When omitted, ``.env`` is loaded and the
            ``GOOGLE_API_KEY`` environment variable is used.
        min_request_interval: Minimum number of seconds between the end of one
            LLM request and the start of the next. The default of 0 disables
            throttling; e.g. 12.0 keeps a run under a 5 requests/minute quota.

    Returns:
        A new list with one enriched dict per input entry, in input order.

    Raises:
        ValueError: If no API key can be found, or if ``file_list`` is a dict
            (e.g. the ``{"error": ...}`` result of ``extract_latex_dependencies``).
    """
    import time  # local import: only needed for the optional throttle

    # Fail early with a readable message instead of an AttributeError deep in
    # the loop when the upstream parser returned its error dict.
    if isinstance(file_list, dict):
        raise ValueError(
            f"file_list must be a list of entries, got a dict: {file_list.get('error', file_list)}"
        )

    # 1. API Key Setup
    if not gemini_api_key:
        load_dotenv()
        gemini_api_key = os.getenv("GOOGLE_API_KEY")
        if not gemini_api_key:
            raise ValueError("GEMINI API KEY not found")

    # 2. Initialize Model (temperature 0 for repeatable titles)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0,
        api_key=gemini_api_key
    )

    # 3. Create TWO distinct chains

    # Chain A: For known sections (extracts Title only)
    structured_llm_title = llm.with_structured_output(TitleOnly)
    prompt_title = ChatPromptTemplate.from_messages([
        ("system", "You are a resume parser. Extract a concise title from the LaTeX content."),
        ("human", "Context: Section is known as '{section}'.\n\nRaw Content:\n{content}")
    ])
    chain_title = prompt_title | structured_llm_title

    # Chain B: For Uncategorized sections (extracts Title + Category)
    structured_llm_full = llm.with_structured_output(TitleAndCategory)
    prompt_full = ChatPromptTemplate.from_messages([
        ("system", "You are a resume parser. The section is unknown. Analyze content to determine the Category and Title."),
        ("human", "Context: Section is Uncategorized.\n\nRaw Content:\n{content}")
    ])
    chain_full = prompt_full | structured_llm_full

    max_chars = 3000  # only the head of each file is sent to the model
    enriched_list = []
    last_request_end = None  # monotonic timestamp of the previous LLM call
    print(f"Processing {len(file_list)} files...")

    for entry in file_list:
        # Work on a copy in every branch so the caller's dicts stay untouched.
        new_entry = entry.copy()
        full_path = new_entry.get("full_path")
        current_section = new_entry.get("section", "Uncategorized")

        # --- Read Content ---
        raw_content = ""
        if full_path and os.path.exists(full_path):
            try:
                with open(full_path, "r", encoding="utf-8") as f:
                    raw_content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {full_path}: {e}")

        # Missing, unreadable and blank files are all labelled the same way and
        # never reach the model.
        if not raw_content.strip():
            new_entry["title"] = "Empty File"
            enriched_list.append(new_entry)
            continue

        # --- Optional throttle between consecutive LLM requests ---
        if min_request_interval > 0 and last_request_end is not None:
            remaining = min_request_interval - (time.monotonic() - last_request_end)
            if remaining > 0:
                time.sleep(remaining)

        # --- Conditional Logic ---
        try:
            if current_section == "Uncategorized":
                # PATH 1: Use Full Chain (Get Title + Category)
                response = chain_full.invoke({
                    "content": raw_content[:max_chars]
                })
                new_entry["title"] = response.title
                new_entry["category"] = response.category
                # We update the section key to match the found category
                new_entry["section"] = response.category
                print(f"Filled Uncategorized -> [{response.category}] {response.title}")
            else:
                # PATH 2: Use Title Chain (Get Title Only)
                response = chain_title.invoke({
                    "section": current_section,
                    "content": raw_content[:max_chars]
                })
                new_entry["title"] = response.title
                # The existing 'section' is kept and no 'category' key is added.
                print(f"Processed Known -> {response.title}")
        except Exception as e:
            # Best effort: record the failure on a fresh copy of the entry and
            # continue with the remaining files.
            print(f"LLM Error on {full_path}: {e}")
            new_entry = entry.copy()
            new_entry["error"] = str(e)
        finally:
            last_request_end = time.monotonic()

        enriched_list.append(new_entry)

    return enriched_list

In [5]:

# Main resume .tex file whose imports are parsed by the cells below.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing it.
path = "/Users/Bardia/Rsm/Resume_Bardia_Azami/resume-general/Bardia-Azami-Resume.tex"
# Last expression of the cell: displays True/False as a sanity check that the file exists.
os.path.exists(path)

True

In [14]:
# Parse the main file into a list of {"section", "full_path"} entries.
# NOTE: extract_latex_dependencies returns a dict {"error": ...} instead of a
# list when the file is not found, so check `dep` before passing it on.
dep = extract_latex_dependencies(path)

In [7]:
# Enrich every entry with an LLM-generated title (and a category for
# "Uncategorized" entries). No key is passed, so GOOGLE_API_KEY is read from
# the environment / .env. Each non-empty file triggers one Gemini request.
dep_comp = enrich_file_metadata(dep)
# Dumps the whole enriched list as plain text.
print(dep_comp)

Processing 12 files...
Filled Uncategorized -> [PROFILE] Contact Information
Filled Uncategorized -> [PROFILE] Professional Summary
Processed Known -> Qualifications Summary
Processed Known -> Computer Vision Engineer - Behyar Sanaat Sepahan
Processed Known -> AI Researcher - Univ of Isfahan


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash
Please retry in 45.228732333s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 5


Processed Known -> Machine Learning Engineer - Ottawa Housing Demand Analysis


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash
Please retry in 43.122599293s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 5


Processed Known -> AI Agent Developer – RAG-Based QA System (WW2 Dataset)


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash
Please retry in 30.899107966s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 5


Processed Known -> Robotics Engineer - Algonquin College
Processed Known -> Technical Skills
Processed Known -> AI & Software Developer - Algonquin College


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash
Please retry in 13.052678811s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 5


Processed Known -> First place at Sharif DataDays 2022


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash
Please retry in 10.951690194s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 5


LLM Error on /Users/Bardia/Rsm/Resume_Bardia_Azami/components/awards/Torob.tex: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash
Please retry in 7.079537883s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 20
}
, retry_delay {
  seconds: 7
}
]
[{'section': 'PROFILE

In [11]:
# Manual correction of the first entry's section after the LLM labelled it PROFILE.
# NOTE(review): only "section" is changed; the entry's "category" key keeps the
# LLM's value, so the two keys disagree afterwards -- confirm this is intended.
# The hard-coded index 0 also depends on the order of imports in the main file.
dep_comp[0]["section"] = "CONTACT_INFORMATION"

In [12]:
# Display the enriched metadata list to check the manual section override.
dep_comp

[{'section': 'CONTACT_INFORMATION',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/components/background.tex',
  'title': 'Contact Information',
  'category': 'PROFILE'},
 {'section': 'PROFILE',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/components/Profile.tex',
  'title': 'Professional Summary',
  'category': 'PROFILE'},
 {'section': 'HIGHLIGHT OF QUALIFICATIONS',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/components/Qualifications/QualificationsHighlight.tex',
  'title': 'Qualifications Summary'},
 {'section': 'TECHNICAL EXPERIENCE',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/components/experiences/BSS.tex',
  'title': 'Computer Vision Engineer - Behyar Sanaat Sepahan'},
 {'section': 'TECHNICAL EXPERIENCE',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/components/experiences/Researcher-UI.tex',
  'title': 'AI Researcher - Univ of Isfahan'},
 {'section': 'TECHNICAL EXPERIENCE',
  'full_path': '/Users/Bardia/Rsm/Resume_Bardia_Azami/compon

In [25]:
# Directory the enriched metadata is pickled into by the next cell.
# NOTE(review): absolute, machine-specific path; the name `cach_dir` (sic) is
# referenced by the cell that writes the pickle, so rename both together.
cach_dir = "/Users/Bardia/Coding/LangChain/Code_Rsm_Adj/cache"
# Last expression of the cell: displays True/False as a sanity check that the directory exists.
os.path.exists(cach_dir)

True

In [27]:

# Persist the enriched metadata so the LLM calls do not have to be repeated.
# `resume_metadata` is just another name for the same list object as `dep_comp`
# (no copy is made).
resume_metadata = dep_comp
file_path = os.path.join(cach_dir, "resume_metadata.pkl")

# Binary mode is required by pickle. NOTE: only unpickle this file from a
# trusted location -- pickle.load can execute arbitrary code.
with open(file_path, "wb") as f:
    pickle.dump(resume_metadata, f)