#Import

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

#Get API

In [2]:
SERPAPI_KEY = '3e33047b5d45eba0a802a51a9deed3080ae17717eb542cd3bad23ff48126a1cb'

#Search Function

In [8]:
def get_serp_results(query):
    search_url = f'https://serpapi.com/search?q={query}&api_key={SERPAPI_KEY}'
    response = requests.get(search_url)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

def extract_email(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.replace("[at]", "@").replace("[dot]", ".")

    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_regex, text)

    return emails if emails else "N/A"

def extract_creation_date(soup):
    # Meta tag - published time
    meta_date = soup.find('meta', {'property': 'article:published_time'})
    if meta_date and meta_date.get('content'):
        return meta_date['content']

    # Meta tag - name="date"
    meta_date = soup.find('meta', {'name': 'date'})
    if meta_date and meta_date.get('content'):
        return meta_date['content']

    # Tag time
    time_tag = soup.find('time')
    if time_tag and time_tag.get('datetime'):
        return time_tag['datetime']
    elif time_tag:
        return time_tag.get_text()

    # Regex pada teks halaman
    # Regex on page text
    text = soup.get_text(separator=' ')
    date_patterns = [
        r'\b\d{4}-\d{2}-\d{2}\b',                # 2024-04-08
        r'\b\d{2}/\d{2}/\d{4}\b',                # 08/04/2024
        r'\b\d{1,2} \w+ \d{4}\b',                # 8 April 2024
    ]
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()

    return "N/A"

In [13]:
def extract_company_info(query):
    results = get_serp_results(query)

    if results and 'organic_results' in results:
        company_info = []

        for result in results['organic_results']:
            company_name = result.get('title', 'N/A')
            company_url = result.get('link', 'N/A')

            try:
                response = requests.get(company_url, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    page_emails = extract_email(soup.get_text())
                    all_links = find_all_links(soup)
                    creation_date = extract_creation_date(soup)
                else:
                    page_emails = ["N/A"]
                    all_links = ["N/A"]
                    creation_date = "N/A"
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {company_url}: {e}")
                page_emails = ["N/A"]
                all_links = ["N/A"]
                creation_date = "N/A"

            company_info.append({
                'company_name': company_name,
                'company_url': company_url,
                'informal_emails': page_emails,
                'links': all_links,
                'creation_date': creation_date
            })

            time.sleep(2)
            # Delay untuk jaga-jaga anti blocking
            # Delay to prevent blocking

        return company_info
    else:
        print("No results found.")
        return []

# Contoh Penggunaan
# Example of use
query = "Saas Agency"
company_data = extract_company_info(query)

# Result
for company in company_data:
    print("\nCompany Name:", company['company_name'])
    print("Company URL:", company['company_url'])
    print("Emails:", company['informal_emails'])
    print("Links:", company['links'][:5])  # Print 5 link pertama
    print("Creation Date:", company['creation_date'])


Company Name: B2B SaaS Marketing Agency | SaaS Growth | Kalungi, Inc.
Company URL: https://www.kalungi.com/
Emails: ['contact@kalungi.com']
Links: ['https://www.kalungi.com', 'https://www.kalungi.com/services/full-service-b2b-saas-marketing-team', 'https://www.kalungi.com/services/audit', 'https://www.kalungi.com/services/saas-cmo', 'https://www.kalungi.com/services/coach']
Creation Date: N/A

Company Name: SaaS Marketing Agency Driving Predictable Growth
Company URL: https://www.simpletiger.com/
Emails: N/A
Links: ['/', '/saas-seo-agency', '/services/seo', '/services/keyword-research', '/services/technical-seo']
Creation Date: 12 months 1200

Company Name: Growfusely: SaaS Content Marketing Agency
Company URL: https://growfusely.com/
Emails: N/A
Links: ['https://growfusely.com', 'index.php', 'https://growfusely.com/contact-us/', 'https://growfusely.com/about-us/', 'https://growfusely.com/about-us/']
Creation Date: N/A

Company Name: Saas, Agency or Job?
Company URL: https://www.reddi