In [26]:
# Warning control: silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
from crewai import Agent, Task, Crew
from crewai_tools import SerperDevTool, ScrapeWebsiteTool, FirecrawlCrawlWebsiteTool
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# NOTE(review): presumably this is where the Serper/Firecrawl keys come from — confirm.
load_dotenv(find_dotenv())

# Shared search and scrape tool instances passed to the agents and tasks below.
search_tool = SerperDevTool()
scrape_tool = ScrapeWebsiteTool()

In [None]:
import os
from utils import get_openai_api_key

# Fetch the OpenAI key through the project-local helper in `utils`.
openai_api_key = get_openai_api_key()

# Export the key as OPENAI_API_KEY in this process's environment.
os.environ["OPENAI_API_KEY"] = openai_api_key 

In [None]:
# LLM settings bundle (key, model name, sampling temperature, token cap).
# NOTE(review): no other cell shown here reads `llm_config` — confirm whether
# it was meant to be handed to the agents.
llm_config = dict(
    api_key=openai_api_key,
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=500,
)

In [29]:
# Seed profile of the person being researched. Later cells read these keys
# when building the crawl tools and agent prompts, and the whole dict is
# passed to crew.kickoff().
# NOTE(review): the github/medium values look like placeholders — confirm.
user_details = dict(
    user_name="Diluksha Perera",
    email="dilukshakaushal@gmail.com",
    educational_background="KDU BS in Data science",
    professional_background="data Engineer at bistec",
    skills=["Python", "SQL"],
    linkedin="https://www.linkedin.com/in/DilukshaPerera",
    github="https://github.com/johndoe",
    medium="https://medium.com/@johndoe",
)

In [None]:
def _profile_url(base_url, value):
    """Return a crawlable profile URL for ``value``.

    ``value`` may already be a full URL, in which case it is returned
    unchanged, or a bare handle, which is appended to ``base_url`` after
    stripping any leading "@" or "/". The previous code always prefixed the
    base URL, so the full URLs stored in ``user_details`` produced doubled
    addresses such as "https://www.github.com/https://github.com/...".
    """
    value = value.strip()
    if value.startswith(("http://", "https://")):
        return value
    return base_url + value.lstrip("@/")


# `search_tool` and `scrape_tool` are already created in the first cell, so
# they are not re-instantiated here.

# One Firecrawl crawler per profile site.
# NOTE(review): the crewai_tools docs show `url=` as the constructor argument;
# confirm that `website=` is actually honoured rather than silently ignored.
firegit = FirecrawlCrawlWebsiteTool(
    website=_profile_url("https://www.github.com/", user_details['github'])
)
firelink = FirecrawlCrawlWebsiteTool(
    website=_profile_url("https://www.linkedin.com/in/", user_details['linkedin'])
)
firemedium = FirecrawlCrawlWebsiteTool(
    website=_profile_url("https://www.medium.com/@", user_details['medium'])
)

In [32]:
# Data collector agent: captures and validates the user's self-reported details.
# NOTE(review): `instructions` is not an Agent attribute I can find in the
# CrewAI docs — confirm it reaches the prompt instead of being dropped.
profile_finder = Agent(
    role="User Information Collection Specialist",
    goal=f"Gather accurate and comprehensive initial user information for {user_details['user_name']} through web form interface",
    backstory="You're an expert data collection specialist with years of experience in user onboarding and "
              "information gathering. Your meticulous attention to detail ensures that all essential user "
              "information is captured correctly the first time. You understand the importance of building "
              "user trust through transparent data collection practices and clear communication.",
    instructions=[
        # Fixed: the old f-string rendered as "skil['Python', 'SQL']s" and
        # listed the field values without saying what each one was.
        f"1. Present a clean, user-friendly interface asking for {user_details['user_name']}'s name, "
        f"email ({user_details['email']}), "
        f"educational background ({user_details['educational_background']}), "
        f"professional background ({user_details['professional_background']}), "
        f"and skills ({', '.join(user_details['skills'])}).",
        "2. Validate all input fields for proper formatting and completeness.",
        "3. Clearly explain how the collected information will be used.",
        "4. Store the collected information securely for use by other agents.",
        "5. Ensure all information is properly structured for downstream analysis."
    ],
    tools=[search_tool, scrape_tool],
    allow_delegation=False,
    verbose=True
)

In [33]:
# General web research agent: looks the user up by name with the shared
# search and scrape tools.
# NOTE(review): confirm that Agent actually consumes `instructions`.
web_researcher = Agent(
    role="General Web Research Specialist",
    goal=f"Discover and collect all publicly available information about {user_details['user_name']} through general web searches",
    tools=[search_tool, scrape_tool],
    allow_delegation=False,
    verbose=True,
    backstory=(
        "You're a digital detective with unparalleled skills in uncovering information across the internet. "
        "With your background in OSINT (Open Source Intelligence), you've developed techniques to efficiently "
        "filter through vast amounts of online data to find relevant information about individuals. Your "
        "research provides the foundational context that other specialists build upon."
    ),
    instructions=[
        f"1. Conduct comprehensive web searches based on {user_details['user_name']}'s information.",
        "2. Focus searches on professional, educational, and skill-related information.",
        "3. Prioritize credible sources and verify information across multiple sites when possible.",
        "4. Structure the collected information in a standardized format.",
        "5. Flag any inconsistencies or contradictions in discovered information.",
        "6. Provide confidence scores for each piece of information discovered.",
    ],
)

In [34]:
# LinkedIn agent: prompt names the user and their LinkedIn URL; gets the
# LinkedIn Firecrawl tool in addition to the shared search/scrape tools.
linkedin_specialist = Agent(
    role="LinkedIn Profile Analysis Expert",
    goal=f"Extract and analyze complete professional information from {user_details['user_name']}'s LinkedIn presence at {user_details['linkedin']}",
    tools=[search_tool, scrape_tool, firelink],
    allow_delegation=False,
    verbose=True,
    backstory=(
        "You've spent your career analyzing professional networks and digital career footprints. Having worked "
        "in talent acquisition and professional network analysis, you understand how to interpret LinkedIn "
        "profiles to identify career trajectories, skill development patterns, and professional accomplishments. "
        "Your expertise helps create comprehensive professional profiles beyond what's immediately visible."
    ),
    instructions=[
        f"1. Access {user_details['user_name']}'s LinkedIn profile at {user_details['linkedin']} using appropriate scraping tools or APIs.",
        f"2. Extract {user_details['user_name']}'s complete work history, including job titles, companies, durations, and responsibilities.",
        f"3. Collect {user_details['user_name']}'s all educational credentials including degrees, institutions, and graduation dates.",
        f"4. Gather {user_details['user_name']}'s certifications, projects, research publications, and other professional achievements.",
        f"5. Analyze {user_details['user_name']}'s connections and endorsements to identify key skill areas.",
        "6. Document any articles, posts, or comments that demonstrate thought leadership.",
        "7. Organize the information chronologically and by relevance to create a comprehensive professional narrative.",
    ],
)


In [35]:
# GitHub agent: assesses technical ability from the user's repositories.
# The goal and first instruction now name the person and the profile URL,
# matching linkedin_specialist; previously the prompt only said "the user",
# and no task text carries kickoff placeholders to fill that in.
github_analyst = Agent(
    role="GitHub Repository and Code Analysis Specialist",
    goal=f"Analyze {user_details['user_name']}'s GitHub presence at {user_details['github']} to determine technical capabilities, coding patterns, and project history",
    backstory="As a veteran software engineer and code reviewer, you've developed a keen eye for recognizing coding styles, "
              "technical capabilities, and development patterns. Having reviewed thousands of repositories across different "
              "domains, you can quickly assess a developer's technical proficiency, preferred technologies, and coding best "
              "practices through their GitHub contributions.",
    instructions=[
        f"1. Access {user_details['user_name']}'s GitHub profile at {user_details['github']} and all public repositories.",
        "2. Analyze repository metadata (stars, forks, watchers, issues, pull requests).",
        "3. Identify primary programming languages and technology stacks used.",
        "4. Evaluate code quality, documentation practices, and adherence to best practices.",
        "5. Assess contribution frequency, commit patterns, and project maintenance.",
        "6. Analyze README files and documentation for communication clarity.",
        "7. Review issues and pull requests for collaboration style and problem-solving approach.",
        "8. Identify key projects that demonstrate technical expertise and development philosophy."
    ],
    tools=[search_tool, scrape_tool, firegit],
    allow_delegation=False,
    verbose=True
)

In [36]:
# Medium/content agent: analyses the user's published writing.
# Two fixes, both for consistency with the LinkedIn and GitHub agents:
#   * the prompt now names the person and their Medium URL;
#   * the Medium Firecrawl tool is added to `tools` (it was created earlier
#     but never handed to any agent).
content_analyst = Agent(
    role="Medium Content and Technical Writing Expert",
    goal=f"Discover and analyze {user_details['user_name']}'s written content, starting from {user_details['medium']}, to identify expertise areas and thought leadership",
    backstory="You've built your career analyzing technical content and identifying subject matter expertise from "
              "written work. With a background in content strategy and technical publishing, you can quickly identify "
              "an author's knowledge domains, communication style, and thought leadership based on their published "
              "articles. Your analysis reveals expertise that might not be explicitly stated in professional profiles.",
    instructions=[
        f"1. Locate all Medium articles and blog posts authored by {user_details['user_name']} ({user_details['medium']}).",
        "2. Analyze article topics, categories, and tags to identify primary focus areas.",
        "3. Assess writing style, technical depth, and communication clarity.",
        "4. Identify recurring themes and expertise domains across multiple articles.",
        "5. Evaluate audience engagement metrics (claps, comments, shares).",
        "6. Extract key technologies, methodologies, or frameworks mentioned in articles.",
        "7. Create a content timeline to track evolution of expertise and interests.",
        "8. Compare content focus areas with stated skills and professional background."
    ],
    tools=[search_tool, scrape_tool, firemedium],
    allow_delegation=False,
    verbose=True
)


In [37]:
# Synthesis agent: merges and cross-checks what the other agents collected.
# NOTE(review): none of the tasks shown in this notebook set
# agent=data_synthesizer — confirm whether a task was meant to use it.
data_synthesizer = Agent(
    role="Cross-Source Data Integration Analyst",
    goal="Integrate and analyze all collected user data to create a unified, validated user profile",
    tools=[search_tool, scrape_tool],
    allow_delegation=False,
    verbose=True,
    backstory=(
        "You're a data synthesis expert with extensive experience in cross-referencing information from "
        "multiple sources to create cohesive user profiles. Your background in data science and pattern "
        "recognition allows you to identify connections between disparate pieces of information and resolve "
        "conflicts between data sources. Your work transforms raw data collections into meaningful insights "
        "about individuals."
    ),
    instructions=[
        "1. Consolidate all information collected by previous agents (initial form, general web search, LinkedIn, GitHub, Medium).",
        "2. Cross-reference data points across sources to validate accuracy and resolve contradictions.",
        "3. Identify patterns and connections between professional history, technical skills, and content creation.",
        "4. Create a timeline of skill development and career progression.",
        "5. Detect any gaps or inconsistencies that require further investigation.",
        "6. Assign confidence scores to each consolidated data point.",
        "7. Generate insights about the user's expertise evolution, skill specializations, and professional focus areas.",
        "8. Prepare a comprehensive profile with validated information ready for structured formatting.",
    ],
)

In [38]:
# Persona agent: turns the synthesized profile into structured JSON.
# Its role string is what the export cell matches on to find the JSON output.
persona_generator = Agent(
    role="User Persona Creation and JSON Formatting Specialist",
    goal="Generate a comprehensive, structured JSON representation of the user's complete professional persona",
    tools=[search_tool, scrape_tool],
    allow_delegation=False,
    verbose=True,
    backstory=(
        "You have specialized in translating complex user data into structured, machine-readable formats for "
        "AI systems and databases. With your background in data architecture and user profiling, you excel at "
        "organizing information into logical hierarchies that capture the full spectrum of an individual's "
        "professional identity. Your JSON schemas are renowned for their comprehensiveness and utility."
    ),
    instructions=[
        "1. Transform the synthesized user profile into a standardized JSON structure.",
        "2. Ensure all key categories are properly represented (personal info, education, professional experience, skills, etc.).",
        "3. Organize information hierarchically with appropriate nesting and relationships.",
        "4. Include metadata such as confidence scores and information sources.",
        "5. Follow consistent naming conventions and data formatting throughout the JSON structure.",
        "6. Verify schema completeness and structural integrity before finalizing.",
        "7. Generate a human-readable summary of the key persona attributes alongside the JSON output.",
        "8. Format the JSON output for maximum readability and usability by downstream systems.",
    ],
)

# tasks

In [39]:
# First task in the pipeline: collect and validate the user's self-reported
# details. Run by profile_finder; the other tasks list it as their upstream.
task_initial_data_collection = Task(
    agent=profile_finder,
    tools=[search_tool, scrape_tool],
    description="\n".join([
        "1. Present a clean, user-friendly form interface for data collection.",
        "2. Collect and validate user's full name, email address, educational background.",
        "3. Gather professional background summary and key skills.",
        "4. Validate all input fields for proper formatting.",
        "5. Explain data usage transparently to build user trust.",
        "6. Format and sanitize collected data for downstream processing.",
    ]),
    expected_output="\n".join([
        "A structured dictionary containing:",
        "- First and last name (as separate fields)",
        "- Validated email address",
        "- Educational background summary",
        "- Professional background summary",
        "- Self-reported skills list (array format)",
        "- All fields properly sanitized and formatted",
    ]),
)

In [40]:
# General web search task, run by web_researcher.
# `context` is the Task field CrewAI uses to feed an earlier task's output
# into this one; the previous `dependencies=` keyword is not a Task attribute,
# so the intended link to the initial data collection was not being applied.
task_web_search = Task(
    description=(
        "1. Conduct comprehensive web searches about the user combining name with professional keywords.\n"
        "2. Discover professional profiles, academic publications, and conference appearances.\n"
        "3. Find company mentions, news articles, and other relevant information.\n"
        "4. Cross-reference findings with user-provided details for validation.\n"
        "5. Document all discovered web mentions and their sources.\n"
        "6. Evaluate confidence levels for each piece of information found."
    ),
    expected_output=(
        "A detailed report containing:\n"
        "- Web mentions categorized by source type (news, academic, professional)\n"
        "- Links to relevant webpages with user mentions\n"
        "- Summary of discovered education, professional history, and skills\n"
        "- Confidence scores for each information piece\n"
        "- Discrepancies between user-provided and discovered information\n"
        "- Additional information sources for further investigation"
    ),
    tools=[search_tool, scrape_tool],
    agent=web_researcher,
    context=[task_initial_data_collection]
)

In [41]:
# LinkedIn analysis task, run by linkedin_specialist with the LinkedIn
# Firecrawl tool. Upstream output is wired through `context` (the CrewAI Task
# field for this); the previous `dependencies=` keyword is not a Task
# attribute, so the link to the initial data collection was not applied.
task_linkedin_analysis = Task(
    description=(
        "1. Access and analyze the user's LinkedIn profile using appropriate scraping tools.\n"
        "2. Extract complete work history with positions, companies, and durations.\n"
        "3. Collect educational credentials and certifications.\n"
        "4. Gather skills, endorsements, and recommendations.\n"
        "5. Extract projects, publications, patents, and professional work.\n"
        "6. Analyze connection patterns and industry distribution.\n"
        "7. Collect authored content including articles and posts.\n"
        "8. Document awards, honors, and recognitions."
    ),
    expected_output=(
        "A comprehensive LinkedIn analysis containing:\n"
        "- Chronological work history with company details\n"
        "- Educational credentials and certifications\n"
        "- Skills and endorsements with metrics\n"
        "- Projects and publications\n"
        "- Professional recommendations\n"
        "- Authored content analysis\n"
        "- Network analysis and industry focus\n"
        "- Professional achievements and awards"
    ),
    tools=[search_tool, scrape_tool, firelink],
    agent=linkedin_specialist,
    context=[task_initial_data_collection]
)

In [42]:
# GitHub analysis task, run by github_analyst with the GitHub Firecrawl tool.
# Upstream output is wired through `context` (the CrewAI Task field for
# this); the previous `dependencies=` keyword is not a Task attribute, so the
# link to the initial data collection was not applied.
task_github_analysis = Task(
    description=(
        "1. Access and analyze all public repositories in user's GitHub profile.\n"
        "2. Extract repository metadata including stars, forks, watchers, and contributors.\n"
        "3. Identify primary programming languages and technology stacks with usage percentages.\n"
        "4. Evaluate code quality, organization, documentation, and best practices.\n"
        "5. Analyze contribution patterns including frequency and time distribution.\n"
        "6. Review collaboration patterns from issues, pull requests, and code reviews.\n"
        "7. Categorize projects by type and domain focus.\n"
        "8. Document notable technical achievements and innovative solutions.\n"
        "9. Track evolution of technical skills based on repository timeline."
    ),
    expected_output=(
        "A comprehensive GitHub analysis containing:\n"
        "- Repository list with complete metadata\n"
        "- Technology stack analysis with usage metrics\n"
        "- Code quality assessment report\n"
        "- Contribution pattern analysis\n"
        "- Collaboration style evaluation\n"
        "- Project categorization and domain expertise\n"
        "- Technical achievement highlights\n"
        "- Skill evolution timeline\n"
        "- Example code snippets demonstrating expertise"
    ),
    tools=[search_tool, scrape_tool, firegit],
    agent=github_analyst,
    context=[task_initial_data_collection]
)

In [43]:
# Medium/content analysis task, run by content_analyst.
# Two fixes:
#   * upstream output is wired through `context` (the CrewAI Task field for
#     this) instead of `dependencies=`, which is not a Task attribute;
#   * the Medium Firecrawl tool is added, mirroring how the LinkedIn and
#     GitHub tasks receive their site-specific crawler.
task_medium_analysis = Task(
    description=(
        "1. Discover and analyze all content published by the user on Medium and other platforms.\n"
        "2. Identify and catalog all articles authored by the user.\n"
        "3. Analyze topics, technical depth, and writing style of each article.\n"
        "4. Track audience engagement metrics where available.\n"
        "5. Identify recurring themes and technical focus areas.\n"
        "6. Assess thought leadership indicators and expertise evolution.\n"
        "7. Compare content focus with stated professional background.\n"
        "8. Document technologies and methodologies frequently discussed.\n"
        "9. Extract notable quotes demonstrating expertise.\n"
        "10. Create timeline of expertise development through content."
    ),
    expected_output=(
        "A comprehensive content analysis report containing:\n"
        "- Complete article catalog with metadata\n"
        "- Topic categorization and expertise domains\n"
        "- Technical depth assessment by domain\n"
        "- Writing style analysis and engagement metrics\n"
        "- Evolution of expertise timeline\n"
        "- Key technologies and frameworks coverage\n"
        "- Notable demonstrations of thought leadership\n"
        "- Alignment with professional background\n"
        "- Sources and confidence scores for findings"
    ),
    tools=[search_tool, scrape_tool, firemedium],
    agent=content_analyst,
    context=[task_initial_data_collection]
)

In [44]:
# Persona-building task: organises everything gathered upstream into JSON.
# All five upstream tasks are passed through `context` (the CrewAI Task field
# for feeding earlier outputs in); the previous `dependencies=` keyword is
# not a Task attribute, so those outputs were not being wired in as intended.
# NOTE(review): the description reads like synthesis work, yet the task is
# assigned to persona_generator rather than data_synthesizer — confirm. The
# JSON export cell matches on persona_generator's role, so check it too if
# the agent is changed.
task_build_persona = Task(
    description=(
        "1. Collect and organize all analyzed data into defined categories.\n"
        "2. Structure information into Personal Information section.\n"
        "3. Organize Educational Background details chronologically.\n"
        "4. Format Professional Experience with company details and achievements.\n"
        "5. Categorize and rank Skills with proficiency levels.\n"
        "6. Document Coding Behavior patterns and preferences.\n"
        "7. Summarize Writing Topics and expertise areas.\n"
        "8. Convert the organized information into a structured JSON format."
    ),
    expected_output=(
        "A comprehensive JSON string containing:\n"
        "- Personal Information (name, contact, social profiles)\n"
        "- Educational Background with timeline\n"
        "- Professional Experience with detailed history\n"
        "- Skills categorized by domain and proficiency\n"
        "- Coding Behavior analysis and patterns\n"
        "- Writing Topics and thought leadership areas\n"
        "- All data properly formatted and nested"
    ),
    tools=[search_tool, scrape_tool],
    agent=persona_generator,
    context=[
        task_initial_data_collection,
        task_web_search,
        task_linkedin_analysis,
        task_github_analysis,
        task_medium_analysis,
    ]
)

In [45]:
# Final task: emit the complete persona as a standardized JSON document.
# Every earlier task is passed through `context` (the CrewAI Task field for
# feeding earlier outputs in); the previous `dependencies=` keyword is not a
# Task attribute, so those outputs were not being wired in as intended.
task_persona_generation = Task(
    description=(
        "1. Structure all collected and validated information into standardized JSON schema.\n"
        "2. Format personal information including name and contact details.\n"
        "3. Organize educational experiences chronologically.\n"
        "4. Structure professional experiences with detailed company information.\n"
        "5. Categorize skills inventory with proficiency levels.\n"
        "6. Document significant projects and technologies used.\n"
        "7. Include publications and written works.\n"
        "8. Add professional certifications and credentials.\n"
        "9. Structure online presence data including GitHub and Medium profiles.\n"
        "10. Categorize expertise areas with confidence levels."
    ),
    expected_output=(
        "A complete JSON document containing:\n"
        "- Personal information (name, contact, profiles)\n"
        "- Educational background with timeline\n"
        "- Professional experience with detailed history\n"
        "- Categorized skills with proficiency levels\n"
        "- Projects and publications\n"
        "- Online presence analysis\n"
        "- Expertise areas with confidence scores\n"
        "- Generation metadata and sources"
    ),
    tools=[search_tool, scrape_tool],
    agent=persona_generator,
    context=[
        task_initial_data_collection,
        task_web_search,
        task_linkedin_analysis,
        task_github_analysis,
        task_medium_analysis,
        task_build_persona,
    ]
)

In [46]:
# Assemble the crew from the agents and tasks defined above; tasks are
# listed in pipeline order, from initial collection to final persona JSON.
crew = Crew(
    agents=[
        profile_finder,
        web_researcher,
        linkedin_specialist,
        github_analyst,
        content_analyst,
        data_synthesizer,
        persona_generator,
    ],
    tasks=[
        task_initial_data_collection,
        task_web_search,
        task_linkedin_analysis,
        task_github_analysis,
        task_medium_analysis,
        task_build_persona,
        task_persona_generation,
    ],
    verbose=True,
)



In [47]:
# Run the crew with user_details as the kickoff inputs and print the result.
# NOTE(review): none of the task descriptions shown contain {placeholder}
# fields, so these inputs are presumably not interpolated into task text — confirm.
result = crew.kickoff(user_details)
print(result)

[1m[95m# Agent:[00m [1m[92mUser Information Collection Specialist[00m
[95m## Task:[00m [92m1. Present a clean, user-friendly form interface for data collection.
2. Collect and validate user's full name, email address, educational background.
3. Gather professional background summary and key skills.
4. Validate all input fields for proper formatting.
5. Explain data usage transparently to build user trust.
6. Format and sanitize collected data for downstream processing.[00m


[1m[95m# Agent:[00m [1m[92mUser Information Collection Specialist[00m
[95m## Using tool:[00m [92mSearch the internet with Serper[00m
[95m## Tool Input:[00m [92m
"{\"search_query\": \"Diluksha Perera educational background professional summary key skills\"}"[00m
[95m## Tool Output:[00m [92m
{'searchParameters': {'q': 'Diluksha Perera educational background professional summary key skills', 'type': 'search', 'num': 10, 'engine': 'google'}, 'organic': [{'title': 'Diluksha Perera - Data Engin

In [48]:
from IPython.display import Markdown as md

# Render the crew's raw final output as Markdown; shown because it is the
# cell's last expression.
md(result.raw)

The comprehensive JSON representation of Diluksha Perera's professional persona is provided above.

In [51]:
import json

# Export the persona JSON produced by the persona agent to a file.
#
# Fixes over the previous version:
#   * the text is parsed BEFORE the output file is opened, so a parse failure
#     no longer truncates or creates an empty output file;
#   * a missing agent match is reported instead of raising IndexError;
#   * the file is written with an explicit UTF-8 encoding.
_persona_role = 'User Persona Creation and JSON Formatting Specialist'
_output_path = 'openai_gap1.json'

# First task output attributed to the persona agent (None when absent).
json_content = next(
    (task.raw for task in result.tasks_output if task.agent == _persona_role),
    None,
)

if json_content is None:
    print(f"No task output found for agent: {_persona_role}")
else:
    # Keep only the span from the first "{" to the last "}" so that any text
    # the model wrapped around the JSON object is discarded.
    start_idx = json_content.find('{')
    end_idx = json_content.rfind('}') + 1
    clean_json = json_content[start_idx:end_idx] if start_idx != -1 else ''

    try:
        persona = json.loads(clean_json)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("Raw content:", json_content)
    else:
        # Only touch the file once we hold valid, parsed JSON.
        with open(_output_path, 'w', encoding='utf-8') as f:
            json.dump(persona, f, indent=2)
        print("JSON file saved successfully!")


JSON file saved successfully!
