In [98]:
from langchain_community.document_loaders import WebBaseLoader

# Build a web loader for the LinkedIn post URL that the notebook scrapes.
loader = WebBaseLoader(
    "https://www.linkedin.com/feed/update/urn:li:activity:7275850295551635456/"
)
# NOTE(review): 'verify': False presumably turns off TLS certificate checks on the
# underlying HTTP request — confirm this is intentional before sharing the notebook.
loader.requests_kwargs = dict(verify=False)


In [99]:
# Fetch the page through the loader configured above.
docs = loader.load()

# Only the first returned document is used by the rest of the notebook.
text = docs[0]

# Write the page content to the a.exe file.
# encoding="utf-8" is explicit because the scraped page contains emoji, which a
# platform-default codec such as cp1252 on Windows cannot encode.
# NOTE(review): "a.exe" holds plain text, not an executable — consider renaming it
# (e.g. to a .txt file) once nothing else depends on this path.
with open("a.exe", "w", encoding="utf-8") as f:
    f.write(text.page_content)



In [100]:
import dotenv
import os

# Locate a .env file and load its variables into the process environment.
dotenv.load_dotenv(dotenv.find_dotenv())

# Read the Groq API key from the environment. The exception type stays KeyError,
# but the message now says what to do instead of only echoing the variable name.
try:
    groq_api_key = os.environ['groq_api_key']
except KeyError:
    raise KeyError(
        "Environment variable 'groq_api_key' is not set; "
        "define it in a .env file or in the shell before running this cell."
    ) from None

In [42]:
from langchain_openai import ChatOpenAI

# The OpenAI chat client is pointed at Groq's endpoint through base_url and
# authenticated with the Groq key loaded from the environment.
llama3 = ChatOpenAI(
    model="llama-3.3-70b-versatile",
    base_url="https://api.groq.com/openai/v1",
    api_key=groq_api_key,
)

# Show the configured client as the cell output.
llama3

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000021184504CA0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000021184506590>, root_client=<openai.OpenAI object at 0x000002118433F790>, root_async_client=<openai.AsyncOpenAI object at 0x0000021184504CD0>, model_name='llama-3.3-70b-versatile', model_kwargs={}, openai_api_key=SecretStr('**********'), openai_api_base='https://api.groq.com/openai/v1')

In [92]:
from typing import Optional, List
from langchain_core.pydantic_v1 import BaseModel, Field

class EmailId(BaseModel):
    # One extracted record: a company name and an email address found for it.
    # Documented with # comments rather than a docstring so the generated schema
    # is left exactly as it was.
    # NOTE(review): the description strings below are forwarded to the model in
    # the tool schema; they contain typos ("whoose", "availabe") that are kept
    # verbatim here so the request payload does not change.
    Company_name: str = Field(
        default=...,
        description="Name of the company whoose email are availabe",
    )
    Email: str = Field(
        default=...,
        description="Only all the available Emails ids of the company",
    )
    
class EmailIdResponse(BaseModel):
    # Top-level structured answer: a required list of EmailId records.
    data: List[EmailId] = Field(
        default=...,
        description="List of companies(only whose email are present) and their email addresses",
    )


In [93]:
# Wrap the chat model so its replies are returned as EmailIdResponse objects.
structured_llama3 = llama3.with_structured_output(schema=EmailIdResponse)

In [94]:
# Display the bound runnable (model plus the generated tool schema) as the cell output.
structured_llama3

RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000021184504CA0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000021184506590>, root_client=<openai.OpenAI object at 0x000002118433F790>, root_async_client=<openai.AsyncOpenAI object at 0x0000021184504CD0>, model_name='llama-3.3-70b-versatile', model_kwargs={}, openai_api_key=SecretStr('**********'), openai_api_base='https://api.groq.com/openai/v1'), kwargs={'tools': [{'type': 'function', 'function': {'name': 'EmailIdResponse', 'description': '', 'parameters': {'type': 'object', 'properties': {'data': {'description': 'List of companies(only whose email are present) and their email addresses', 'type': 'array', 'items': {'type': 'object', 'properties': {'Company_name': {'description': 'Name of the company whoose email are availabe', 'type': 'string'}, 'Email': {'description': 'Only all the available Emails ids of the company', 'type': 'string'}}, 'required

In [95]:
# Send only the first 4000 characters of the scraped page to the structured model.
response = structured_llama3.invoke(
    str(text.page_content[:4000])
)

In [96]:
# Print every extracted company/email pair, each followed by a 30-dash divider.
for entry in response.data:
    print(
        f"Company: {entry.Company_name}",
        f"Email: {entry.Email}",
        "-" * 30,
        sep="\n",
    )

Company: DLytica
Email: talent.hr@dlytica.com
------------------------------
Company: XenelSoft Technologies Pvt Ltd
Email: niharika.chaturvedi@xenelsoft.com
------------------------------
Company: XenelSoft Technologies Pvt Ltd
Email: janvi.verma@xenelsoft.com
------------------------------
Company: InfoEdge
Email: riya@naukri.com
------------------------------
Company: Genpact
Email: Mohit.Prasad@genpact.com
------------------------------
Company: TechTorch
Email: hr@techtorch.com
------------------------------
Company: Ooliga
Email: preeti@ooliga.com
------------------------------
Company: Abita
Email: k.prasanna@abits.co.in
------------------------------
Company: Orage Dogital
Email: ankita.sharma@offee.in
------------------------------
Company: InheritX Solutions Pvt Ltd
Email: priya.b@inheritx.com
------------------------------
Company: Kamoro Maxima Integra
Email: recruitment@kamoro.com
------------------------------
Company: ApniBus
Email: tanu.mishra@apnibus.com
--------------

In [None]:
# Total character count of the scraped page, for comparison with the
# 4000-character slice that is sent to the model.
len(str(text.page_content))

13482

In [50]:
# Keep the first 4000 characters of the page text in their own variable.
page_content = text.page_content[:4000]

In [51]:
# Show the truncated page text as the cell output.
page_content

"\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBhavesh Arora on LinkedIn: Exciting Opportunities for Freshers 🚀\n\nCompany: DLytica - Data Analytics… | 19 comments\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n              Agree & Join LinkedIn\n            \n\n      By clicking Continue to join or sign in, you agree to LinkedIn’s User Agreement, Privacy Policy, and Cookie Policy.\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\nLinkedIn\n \n\n\n\n\n\n\n\n\n        Articles\n      \n\n\n\n\n\n\n\n        People\n      \n\n\n\n\n\n\n\n        Learning\n      \n\n\n\n\n\n\n\n        Jobs\n      \n\n\n\n\n\n\n\n        Games\n      \n\n\n\n\n\n\n      Join now\n    \n\n          Sign in\n      \n \n\n \n\n\n\n\n\n\n\n                  Bhavesh Arora’s Post\n\n\n\n\n\n\n\n\n \n\n\n\n              Bhavesh Arora\n            \n \n\n                IIT Jodhpur - Data Analyst | Helping youth by sharing legit jobs opportunities...🇮🇳\n            \n\n\n      

In [108]:
from langchain.prompts import PromptTemplate
class EmailId(BaseModel):
    # Redefines the EmailId schema from the earlier cell with shorter field
    # descriptions; one record pairs a company name with an email address.
    Company_name: str = Field(default=..., description="Name of the company")
    Email: str = Field(default=..., description="Email of the company")
    
class EmailIdResponse(BaseModel):
    # Redefined response wrapper: a required list of EmailId records.
    data: List[EmailId] = Field(
        default=...,
        description="List of companies and their email addresses",
    )

# Updated prompt template that only includes companies with emails.
# The single placeholder {text} is filled with the scraped page text at invoke time.
# NOTE(review): this literal is sent to the model verbatim, so its wording is left
# untouched here; it contains typos ("ignnore", stray spaces before commas) that
# could be cleaned up in a deliberate prompt revision.
template = """
Extract company names and their corresponding email addresses from the given text from a linkedin post, following these rules:

Important Rules:
1. ONLY include companies that have an explicitly mentioned email address in the text
2. Skip any company that doesn't have an associated email address
3. Each entry must have both a company name AND a valid email address
4. Do not infer or generate email addresses - only use ones explicitly present in the text
5. Ignore any content present in the comments section

Guidelines for extraction:
- Only extract real email addresses in the format: username@domain.com present in the main text only , ignnore comment section
- Maintain exact spelling and formatting of company names and emails
- If a company is mentioned multiple times but has no email, exclude it
- If an email is found without a clear company name, skip it
- Strictly ignore the content present in comments section. Only consider the main text
- Never include colleges , universities or personal email addresses. Only include companies and their email addresses

Text to analyze:
{text}

Return the data in a structured format where each entry contains only valid company-email pairs.
Ensure every company in the output has an associated email address.
Ensure that only companies are present in the output, never include colleges or individual personal profiles and their email addresses.
"""

# Bind the template string to its single input variable, "text".
prompt = PromptTemplate(input_variables=["text"], template=template)

# Rebuild the structured-output model against the EmailIdResponse schema and
# pipe the prompt into it.
structured_llama3 = llama3.with_structured_output(EmailIdResponse)
chain = prompt | structured_llama3

# Run the chain on the first 4000 characters of the scraped page.
response = chain.invoke(
    dict(text=str(text.page_content[:4000]))
)

# Print every extracted company/email pair, each followed by a 30-dash divider.
for entry in response.data:
    print(
        f"Company: {entry.Company_name}",
        f"Email: {entry.Email}",
        "-" * 30,
        sep="\n",
    )

# Optional: convert to other formats.

# Flat list of {"company", "email"} records; keeps every pair, repeats included.
companies_list = [
    {"company": item.Company_name, "email": item.Email} for item in response.data
]

# One email per company. When a company appears more than once, only its LAST
# email survives here; use companies_emails below when repeats matter.
companies_dict = {item.Company_name: item.Email for item in response.data}

# Lossless variant: company name -> list of all its emails, in response order.
companies_emails = {}
for item in response.data:
    companies_emails.setdefault(item.Company_name, []).append(item.Email)

Company: Easecruit
Email: contact@easecruit.com
------------------------------
