### Website Data Mining (Silver 3)

In [1]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


In [None]:
# Load the cleaned leads snapshot; abort early when the file is absent.
parquet_path = "./Staging/Gold/cleaned_second_592.parquet"
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

RecordOwl_Leads = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(RecordOwl_Leads)} rows from {parquet_path}")
print(RecordOwl_Leads.shape)

# Keep only rows that carry both a contact number and a Facebook page
# (neither null nor an empty string), then project onto those two columns.
contact_col = "PIC NAME 1 Contact Number"
facebook_col = "Facebook Page"

has_contact = RecordOwl_Leads[contact_col].notna() & (RecordOwl_Leads[contact_col] != "")
has_facebook = RecordOwl_Leads[facebook_col].notna() & (RecordOwl_Leads[facebook_col] != "")

filtered_df = RecordOwl_Leads.loc[has_contact & has_facebook, [contact_col, facebook_col]]

# Last expression of the cell: the notebook renders the final ten rows.
filtered_df.tail(10)

In [None]:


# --- Initialize Apify client ---
# The API token is a secret, so it is read from the environment only.
# NOTE(review): a token used to be hard-coded here as the getenv fallback;
# treat that value as leaked and revoke it in the Apify console.
APIFY_TOKEN = os.getenv("APIFY_TOKEN")
if not APIFY_TOKEN:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set; "
        "export it before running this cell."
    )
client = ApifyClient(APIFY_TOKEN)

# --- Contact-info enrichment (async so it can be awaited directly in Jupyter) ---

# Puppeteer page function executed by the Apify actor. On the start URL it
# looks for a link whose href mentions "contact" and enqueues it; on the
# enqueued page it collects visible mailto:/tel: links.
_CONTACT_PAGE_FUNCTION = r"""
    async function pageFunction(context) {
        const { page, log, request } = context;
        const isContact = request.userData?.isContact || false;

        // If not on contact page yet, try to find and navigate to it
        if (!isContact) {
            try {
                // Wait for page to load
                await page.waitForSelector('a', { timeout: 10000 }).catch(() => null);

                // Find contact page link using Puppeteer
                const contactUrl = await page.evaluate(() => {
                    const links = Array.from(document.querySelectorAll('a[href]'));
                    for (const link of links) {
                        const href = link.getAttribute('href');
                        if (!href || !href.toLowerCase().includes('contact')) {
                            continue;
                        }
                        try {
                            // Resolve against the current page so relative
                            // links ("contact.html", "../contact") work too.
                            const resolved = new URL(href, window.location.href);
                            // Skip mailto:, tel:, javascript: and similar.
                            if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
                                return resolved.href;
                            }
                        } catch (e) {
                            // Malformed href - try the next link
                        }
                    }
                    return null;
                });

                if (contactUrl) {
                    await context.enqueueRequest({
                        url: contactUrl,
                        userData: { isContact: true }
                    });
                    log.info(`Enqueued contact page: ${contactUrl}`);
                }
                return null;
            } catch (err) {
                log.error(`Error finding contact page: ${err.message}`);
                return null;
            }
        }

        // We're on the contact page - extract emails and phones
        try {
            // Wait for content to load
            await new Promise(r => setTimeout(r, 3000));

            // Extract emails and phones using Puppeteer
            const contactData = await page.evaluate(() => {
                // Helper: check if element is visible
                function isVisible(el) {
                    return el && el.offsetParent !== null;
                }

                // Extract emails from mailto links (drop ?subject=... etc.)
                const emailLinks = Array.from(document.querySelectorAll('a[href^="mailto:"]'));
                const emails = emailLinks
                    .filter(el => isVisible(el))
                    .map(el => el.getAttribute('href').replace(/^mailto:/i, '').split('?')[0].trim())
                    .filter(email => email.length > 0);

                // Extract phones from tel links
                const phoneLinks = Array.from(document.querySelectorAll('a[href^="tel:"]'));
                const phones = phoneLinks
                    .filter(el => isVisible(el))
                    .map(el => el.getAttribute('href').replace(/[^0-9]/g, ''))
                    .filter(phone => phone.length > 0);

                return {
                    emails: [...new Set(emails)],
                    phones: [...new Set(phones)]
                };
            });

            return {
                contactUrl: request.url,
                emails: contactData.emails,
                phones: contactData.phones
            };
        } catch (err) {
            log.error(`Error extracting contact data: ${err.message}`);
            return {
                contactUrl: request.url,
                emails: [],
                phones: [],
                error: err.message
            };
        }
    }
"""


def _is_blank(value):
    """Return True when a cell holds no usable data.

    None, NaN/NA, empty or whitespace-only strings and empty containers all
    count as blank. Plain truthiness is not enough because NaN is truthy.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    try:
        return len(value) == 0
    except TypeError:
        # Scalars (floats, ints, pd.NA, ...) have no length.
        return bool(pd.isna(value))


def _build_contact_run_input(website):
    """Build the puppeteer-scraper actor input for a single start URL."""
    return {
        "startUrls": [{"url": website}],
        "pageFunction": _CONTACT_PAGE_FUNCTION,
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "ignoreSslErrors": False,
        "ignoreCorsAndCsp": False,
        "maxRequestRetries": 3,
        "maxRequestsPerCrawl": 0,
        "maxConcurrency": 1,
        "pageLoadTimeoutSecs": 90,
        "pageFunctionTimeoutSecs": 180,
        "waitUntil": ["networkidle2"],
        "proxyConfiguration": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],
        },
        "proxyRotation": "RECOMMENDED",
    }


def _fetch_contact_items(apify, website, dataset_wait_secs):
    """Run the actor for *website* and return its dataset items (blocking)."""
    run = apify.actor("apify/puppeteer-scraper").call(
        run_input=_build_contact_run_input(website)
    )
    if not run:
        # No run object came back, so there is no dataset to read.
        return []
    if dataset_wait_secs > 0:
        # Grace period kept from the original code before reading the dataset.
        time.sleep(dataset_wait_secs)
    dataset = apify.dataset(run["defaultDatasetId"])
    return list(dataset.iterate_items())


def _store_cell(frame, row_label, column, value):
    """Write *value* (possibly a list) into one cell of *frame*.

    The column is created or cast to object dtype first so that storing a
    list in a single cell does not fail.
    """
    if column not in frame.columns:
        frame[column] = None
    if frame[column].dtype != object:
        frame[column] = frame[column].astype(object)
    frame.at[row_label, column] = value


async def enrich_with_contact_info(
    df, *, apify_client=None, dataset_wait_secs=3, delay_secs=5
):
    """Scrape contact info for rows with a valid website and no phone yet.

    A row is processed when ``Website`` is non-blank, ``Website_Valid`` equals
    ``"valid"`` and ``Phones`` is blank (None, NaN, empty string or empty
    list). For each such row the Apify ``apify/puppeteer-scraper`` actor is
    run, and the first result containing emails or phones is written to the
    ``Emails``, ``Phones`` and ``Contact_Page`` columns.

    Args:
        df: Input DataFrame. It is not modified.
        apify_client: Client exposing ``actor(...).call(...)`` and
            ``dataset(...).iterate_items()``. Defaults to the module-level
            ``client``.
        dataset_wait_secs: Pause between the actor run finishing and the
            dataset read.
        delay_secs: Pause after each scraped website to avoid rate limiting.

    Returns:
        A copy of ``df`` with the scraped values filled in.

    Errors raised while scraping a single website are printed and that row is
    left unchanged, so one bad site does not abort the whole batch.
    """
    apify = client if apify_client is None else apify_client
    loop = asyncio.get_running_loop()
    updated_df = df.copy()

    for i, row in df.iterrows():
        website = row.get("Website")
        status = row.get("Website_Valid")
        phone = row.get("Phones")

        # Skip invalid or already complete rows.
        if _is_blank(website) or status != "valid" or not _is_blank(phone):
            continue

        print(f"🔍 Scraping contact page for: {website}")

        try:
            print("  📡 Starting Apify puppeteer-scraper...")
            # The Apify client is synchronous; run it in a worker thread so
            # the event loop is not blocked while the actor runs.
            results = await loop.run_in_executor(
                None, _fetch_contact_items, apify, website, dataset_wait_secs
            )
            contact_results = [
                r for r in results if r and (r.get("emails") or r.get("phones"))
            ]

            if contact_results:
                scraped = contact_results[0]
                _store_cell(updated_df, i, "Emails", scraped.get("emails", None))
                _store_cell(updated_df, i, "Phones", scraped.get("phones", None))
                _store_cell(updated_df, i, "Contact_Page", scraped.get("contactUrl", None))
                print(f"  ✅ Found: {scraped.get('phones', [])} / {scraped.get('emails', [])}")
            else:
                print("  ⚠️ No contact data found.")

        except Exception as e:  # best-effort per website; keep the batch going
            print(f"  ❌ Error scraping {website}: {e}")

        # Add delay to avoid rate limiting
        if delay_secs > 0:
            await asyncio.sleep(delay_secs)

    return updated_df


# --- Run the scraper for valid websites ---
result_df = await enrich_with_contact_info(result_df)

# --- Display updated results ---
display(result_df)


NameError: name 'result_df' is not defined