Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 99 additions & 69 deletions apps/discord_bot/src/five08/discord_bot/cogs/crm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from five08.discord_bot.config import settings
from five08.clients import espo
from five08.skills import normalize_skill_list
from five08.resume_extractor import ResumeExtractedProfile, ResumeProfileExtractor
from five08.discord_bot.utils.audit import DiscordAuditLogger
from five08.discord_bot.utils.role_decorators import (
require_role,
Expand Down Expand Up @@ -1107,6 +1108,12 @@ def __init__(self, bot: commands.Bot) -> None:
self.espo_api = EspoAPI(api_url, settings.espo_api_key)
# Store base URL for profile links
self.base_url = settings.espo_base_url.rstrip("/")
self.resume_extractor = ResumeProfileExtractor(
api_key=settings.openai_api_key,
base_url=settings.openai_base_url,
model=settings.openai_model,
)
self._resume_profile_cache: tuple[int, ResumeExtractedProfile] | None = None
self.audit_logger = DiscordAuditLogger(
base_url=settings.audit_api_base_url,
shared_secret=settings.api_shared_secret,
Expand Down Expand Up @@ -2883,52 +2890,51 @@ async def _create_migadu_mailbox(
except aiohttp.ClientError as exc:
raise ValueError(f"Migadu API request failed: {exc}") from exc

def _extract_resume_contact_hints(
self, file_content: bytes
) -> dict[str, list[str]]:
"""Extract basic contact-identifying signals from resume bytes."""
text = file_content.decode("utf-8", errors="ignore")
if not text:
return {"emails": [], "github_usernames": [], "linkedin_urls": []}

# Keep this lightweight; heuristics are only used for contact targeting.
snippet = text[:12000]
email_re = re.compile(
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
flags=re.IGNORECASE,
)
github_re = re.compile(
r"(?:https?://)?(?:www\.)?github\.com/([A-Za-z0-9-]{1,39})",
flags=re.IGNORECASE,
)
linkedin_re = re.compile(
r"(?:https?://)?(?:[\w.-]+\.)?linkedin\.com/in/[A-Za-z0-9\\-_%]+/?",
flags=re.IGNORECASE,
)

email_matches: list[str] = []
for email in email_re.findall(snippet):
candidate = str(email).strip().lower()
if candidate and candidate not in email_matches:
email_matches.append(candidate)
def _extract_resume_contact_hints(self, file_content: bytes) -> dict[str, Any]:
"""Extract contact-identifying signals and shared resume fields from bytes."""
profile = self._extract_resume_profile(file_content)
return {
"emails": [profile.email] if profile.email else [],
"github_usernames": [profile.github_username]
if profile.github_username
else [],
"linkedin_urls": [profile.linkedin_url] if profile.linkedin_url else [],
"phone": profile.phone,
"name": profile.name,
"address_country": profile.address_country,
"seniority_level": profile.seniority_level,
"skills": profile.skills,
}

github_matches: list[str] = []
for username in github_re.findall(snippet):
candidate = str(username).strip().lower()
if candidate and candidate not in github_matches:
github_matches.append(candidate)
def _extract_resume_profile(self, file_content: bytes) -> Any:
"""Extract resume profile fields and cache per-file-content results."""
cache = self._resume_profile_cache
cache_key = hash(file_content)
if cache and cache[0] == cache_key:
return cache[1]

linkedin_matches: list[str] = []
for linkedin_url in linkedin_re.findall(snippet):
candidate = str(linkedin_url).strip().lower().rstrip("/")
if candidate and candidate not in linkedin_matches:
linkedin_matches.append(candidate)
text = file_content.decode("utf-8", errors="ignore")
profile = self.resume_extractor.extract(text)
self._resume_profile_cache = (cache_key, profile)
return profile
Comment on lines +2909 to +2919
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check ResumeProfileExtractor.extract() signature and if it's sync or async
find . -name "resume_extractor.py" -type f | head -5

Repository: 508-dev/508-workflows

Length of output: 113


🏁 Script executed:

# Search for ResumeProfileExtractor and extract method definition
rg -n "class ResumeProfileExtractor|def extract" packages/shared/src/five08/resume_extractor.py -A 2

Repository: 508-dev/508-workflows

Length of output: 349


🏁 Script executed:

# Find all calls to _extract_resume_profile in crm.py to understand the call chain
rg -n "_extract_resume_profile" apps/discord_bot/src/five08/discord_bot/cogs/crm.py -B 2 -A 2

Repository: 508-dev/508-workflows

Length of output: 631


🏁 Script executed:

# Check if the methods calling _extract_resume_profile are async
rg -n "async def upload_resume|async def _infer_contact_from_resume|async def _extract_resume_contact_hints" apps/discord_bot/src/five08/discord_bot/cogs/crm.py -B 1 -A 3

Repository: 508-dev/508-workflows

Length of output: 441


🏁 Script executed:

# Verify the current state of _extract_resume_profile matches the snippet
sed -n '2909,2919p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 562


🏁 Script executed:

# Find all callers of _extract_resume_contact_hints
rg -n "_extract_resume_contact_hints" apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 363


🏁 Script executed:

# Check the implementation of _infer_contact_from_resume to see what it calls
sed -n '3068,3150p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 3293


🏁 Script executed:

# Check upload_resume to see the call chain
sed -n '5006,5100p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 3770


🏁 Script executed:

# Check if upload_resume calls _infer_contact_from_resume
rg -n "_infer_contact_from_resume" apps/discord_bot/src/five08/discord_bot/cogs/crm.py -B 2 -A 2

Repository: 508-dev/508-workflows

Length of output: 565


🏁 Script executed:

# Verify the resume_extractor.extract implementation to confirm it does blocking I/O
sed -n '186,250p' packages/shared/src/five08/resume_extractor.py

Repository: 508-dev/508-workflows

Length of output: 2817


🏁 Script executed:

# Check the methods containing the other calls to _extract_resume_contact_hints (lines 2957 and 2967)
sed -n '2940,2980p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 1765


🏁 Script executed:

# Check if _extract_resume_name_hint or _build_resume_create_contact_payload are called from async functions
rg -n "_extract_resume_name_hint|_build_resume_create_contact_payload" apps/discord_bot/src/five08/discord_bot/cogs/crm.py -B 2 -A 2

Repository: 508-dev/508-workflows

Length of output: 1486


🏁 Script executed:

# Find the context around line 1024 - what class/method is this in
sed -n '1000,1030p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py

Repository: 508-dev/508-workflows

Length of output: 1386


🏁 Script executed:

# Find callers of _build_resume_link_contact_payload
rg -n "_build_resume_link_contact_payload" apps/discord_bot/src/five08/discord_bot/cogs/crm.py -B 2 -A 2

Repository: 508-dev/508-workflows

Length of output: 47


🏁 Script executed:

# Check what class contains confirm_create to understand the context better
sed -n '990,1015p' apps/discord_bot/src/five08/discord_bot/cogs/crm.py | head -30

Repository: 508-dev/508-workflows

Length of output: 1046


Synchronous extractor call can block the Discord event loop.

Line 2917 calls self.resume_extractor.extract(text) synchronously. This method makes a blocking OpenAI API call that stalls the Discord bot's event loop when invoked from async command handlers like upload_resume() and the confirm_create() button handler.

⚙️ Suggested fix direction
-    def _extract_resume_profile(self, file_content: bytes) -> Any:
+    async def _extract_resume_profile(self, file_content: bytes) -> ResumeExtractedProfile:
         """Extract resume profile fields and cache per-file-content results."""
         cache = self._resume_profile_cache
         cache_key = hash(file_content)
         if cache and cache[0] == cache_key:
             return cache[1]

         text = file_content.decode("utf-8", errors="ignore")
-        profile = self.resume_extractor.extract(text)
+        profile = await asyncio.to_thread(self.resume_extractor.extract, text)
         self._resume_profile_cache = (cache_key, profile)
         return profile
-    def _extract_resume_contact_hints(self, file_content: bytes) -> dict[str, Any]:
+    async def _extract_resume_contact_hints(self, file_content: bytes) -> dict[str, Any]:
         """Extract contact-identifying signals and shared resume fields from bytes."""
-        profile = self._extract_resume_profile(file_content)
+        profile = await self._extract_resume_profile(file_content)

Also update callers of _extract_resume_contact_hints to use await.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/discord_bot/src/five08/discord_bot/cogs/crm.py` around lines 2909 -
2919, the synchronous call to self.resume_extractor.extract(text) in
_extract_resume_profile blocks the event loop. Change _extract_resume_profile
into an async method that offloads the blocking work to a worker thread (e.g.,
asyncio.to_thread, or asyncio.get_running_loop().run_in_executor) so that
self.resume_extractor.extract(text) runs without blocking, and keep the existing
caching logic (cache_key, self._resume_profile_cache) but adapt it to async.
Then make _extract_resume_contact_hints async as well and update every caller in
the chain (e.g., upload_resume(), the confirm_create() button handler,
_infer_contact_from_resume, and the currently synchronous helpers
_extract_resume_name_hint and _build_resume_create_contact_payload) to await it,
so the extractor runs without stalling the Discord event loop.


return {
"emails": email_matches,
"github_usernames": github_matches,
"linkedin_urls": linkedin_matches,
}
def _extract_resume_name_fallback(self, file_content: bytes) -> str:
"""Simple name heuristic fallback when extraction did not return a name."""
text = file_content.decode("utf-8", errors="ignore")
lines = [line.strip() for line in text.splitlines() if line.strip()]
for line in lines[:40]:
candidate = line.strip()
if not candidate:
continue
if len(candidate) < 2:
continue
if "@" in candidate or "http" in candidate.lower():
continue
if not any(char.isalpha() for char in candidate):
continue
if len(candidate.split()) >= 1 and len(candidate) <= 70:
return candidate
return "Unknown Contact"

def _format_inferred_attempts(self, attempts: list[dict[str, Any]] | None) -> str:
if not attempts:
Expand All @@ -2948,23 +2954,11 @@ def _format_inferred_attempts(self, attempts: list[dict[str, Any]] | None) -> st

def _extract_resume_name_hint(self, file_content: bytes) -> str:
"""Best-effort contact name extraction from resume text."""
text = file_content.decode("utf-8", errors="ignore")
lines = [line.strip() for line in text.splitlines() if line.strip()]
for line in lines[:40]:
candidate = line.strip()
if not candidate:
continue
if len(candidate) < 2:
continue
if "@" in candidate or "http" in candidate.lower():
continue
if not any(char.isalpha() for char in candidate):
continue
# Prefer short, title-like lines at the top as candidate names.
if len(candidate.split()) >= 1 and len(candidate) <= 70:
return candidate

return "Unknown Contact"
hints = self._extract_resume_contact_hints(file_content)
extracted_name = str(hints.get("name") or "").strip()
if extracted_name:
return extracted_name
return self._extract_resume_name_fallback(file_content)

def _build_resume_create_contact_payload(
self, file_content: bytes
Expand All @@ -2973,18 +2967,45 @@ def _build_resume_create_contact_payload(
hints = self._extract_resume_contact_hints(file_content)
name = self._extract_resume_name_hint(file_content)
contact_name = name if name != "Unknown Contact" else "Resume Candidate"
emails = hints.get("emails", [])
github_usernames = hints.get("github_usernames", [])
linkedin_urls = hints.get("linkedin_urls", [])
skills = hints.get("skills", [])
if not isinstance(emails, list):
emails = []
if not isinstance(github_usernames, list):
github_usernames = []
if not isinstance(linkedin_urls, list):
linkedin_urls = []
if not isinstance(skills, list):
skills = []

payload: dict[str, str] = {"name": contact_name}
if hints["emails"]:
primary_email = hints["emails"][0]
if emails:
primary_email = emails[0]
if primary_email.endswith("@508.dev"):
payload["c508Email"] = primary_email
else:
payload["emailAddress"] = primary_email
if hints["github_usernames"]:
payload["cGitHubUsername"] = hints["github_usernames"][0]
if hints["linkedin_urls"]:
payload["cLinkedInUrl"] = hints["linkedin_urls"][0]
if github_usernames:
payload["cGitHubUsername"] = github_usernames[0]
if linkedin_urls:
payload["cLinkedInUrl"] = linkedin_urls[0]
phone = hints.get("phone")
if isinstance(phone, str) and phone.strip():
payload["phoneNumber"] = phone.strip()
address_country = str(hints.get("address_country", "")).strip()
if address_country:
payload["addressCountry"] = address_country
seniority = str(hints.get("seniority_level", "")).strip()
if seniority:
payload["cSeniority"] = seniority
if skills:
normalized_skills = [
str(item).strip() for item in skills if str(item).strip()
]
if normalized_skills:
payload["skills"] = ", ".join(normalized_skills)

return payload

Expand Down Expand Up @@ -3050,7 +3071,10 @@ async def _infer_contact_from_resume(
"""Infer target contact from resume identifiers."""
hints = self._extract_resume_contact_hints(file_content)
attempts: list[dict[str, Any]] = []
for email in hints["emails"]:
emails = hints.get("emails", [])
if not isinstance(emails, list):
emails = []
for email in emails:
attempts.append({"method": "email", "value": email})
contacts = await self._search_contact_for_linking(email)
if len(contacts) == 1:
Expand All @@ -3067,7 +3091,10 @@ async def _infer_contact_from_resume(
"attempts": attempts,
}

for github_username in hints["github_usernames"]:
github_usernames = hints.get("github_usernames", [])
if not isinstance(github_usernames, list):
github_usernames = []
for github_username in github_usernames:
attempts.append({"method": "github", "value": github_username})
contacts = await self._search_contacts_by_field(
field="cGitHubUsername", value=github_username
Expand All @@ -3086,7 +3113,10 @@ async def _infer_contact_from_resume(
"attempts": attempts,
}

for linkedin_url in hints["linkedin_urls"]:
linkedin_urls = hints.get("linkedin_urls", [])
if not isinstance(linkedin_urls, list):
linkedin_urls = []
for linkedin_url in linkedin_urls:
attempts.append({"method": "linkedin", "value": linkedin_url})
contacts = await self._search_contacts_by_field(
field="cLinkedInUrl", value=linkedin_url
Expand Down
3 changes: 3 additions & 0 deletions apps/discord_bot/src/five08/discord_bot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ class Settings(SharedSettings):
migadu_api_user: str | None = None
migadu_api_key: str | None = None
migadu_mailbox_domain: str = "508.dev"
openai_api_key: str | None = None
openai_base_url: str | None = None
openai_model: str = "gpt-4o-mini"

# Kimai time tracking settings
kimai_base_url: str
Expand Down
8 changes: 6 additions & 2 deletions apps/worker/src/five08/worker/crm/intake_form_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import requests

from five08.clients.espo import EspoAPI, EspoAPIError
from five08.resume_extractor import ResumeProfileExtractor
from five08.worker.config import settings
from five08.worker.crm.document_processor import DocumentProcessor
from five08.worker.crm.resume_profile_processor import ResumeProfileExtractor
from five08.worker.crm.skills_extractor import SkillsExtractor
from five08.worker.masking import mask_email

Expand Down Expand Up @@ -74,7 +74,11 @@ def __init__(self) -> None:
api_url = settings.espo_base_url.rstrip("/") + "/api/v1"
self.api = EspoAPI(api_url, settings.espo_api_key)
self.document_processor = DocumentProcessor()
self.resume_extractor = ResumeProfileExtractor()
self.resume_extractor = ResumeProfileExtractor(
api_key=settings.openai_api_key,
base_url=settings.openai_base_url,
model=settings.resolved_resume_ai_model,
)
self.skills_extractor = SkillsExtractor()

def process_intake(self, *, payload: Mapping[str, Any]) -> dict[str, Any]:
Expand Down
Loading