In [None]:
import json
import logging
import os
import re
import time
import warnings
from concurrent.futures import ThreadPoolExecutor

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load environment variables (via python-dotenv; presumably from a local .env file)
load_dotenv()
# Value for LinkedIn's "li_at" session cookie; None when the variable is not set
LI_AT = os.getenv("LI_AT")

# Configure logging: the root logger emits INFO and above
logging.basicConfig(level=logging.INFO)

# Suppress warnings: note this silences every Python warning for the whole process
warnings.filterwarnings("ignore")

def get_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is launched with the automation-controlled Blink feature
    disabled, without a sandbox, without GPU acceleration, and in headless
    mode. The caller is responsible for calling ``quit()`` on the result.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-gpu",
        "--headless",  # run without a visible browser window
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def use_session_cookie(driver):
    """Authenticate the browser session with the ``li_at`` session cookie.

    Opens the LinkedIn home page (a cookie can only be set for the domain
    the browser is currently on), installs the cookie taken from the
    module-level ``LI_AT`` value, and refreshes the page so the cookie is
    sent with the next request.

    Args:
        driver: Selenium WebDriver instance to authenticate.

    Raises:
        RuntimeError: If ``LI_AT`` is unset or empty. Without this check a
            ``None``/empty cookie value is handed to the driver, which
            either fails with an obscure error or leaves the session
            silently unauthenticated.
    """
    if not LI_AT:
        raise RuntimeError(
            "LI_AT is not set; define it in the environment or a .env file "
            "before scraping"
        )

    logging.info("Using session ID to authenticate")
    driver.get("https://www.linkedin.com")  # Open LinkedIn home page
    driver.add_cookie({
        'name': 'li_at',
        'value': LI_AT,
        'domain': '.linkedin.com'
    })
    driver.refresh()

def scroll_down(driver, pause=1, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Sends the END key to ``<body>`` repeatedly, sleeping ``pause`` seconds
    after each key press so dynamically loaded content can be appended, and
    stops as soon as ``document.body.scrollHeight`` is unchanged between two
    consecutive measurements.

    Args:
        driver: Selenium WebDriver showing the page to scroll.
        pause: Seconds to wait after each scroll before re-measuring the
            page height. Defaults to 1, the previously hard-coded value.
        max_scrolls: Optional upper bound on the number of scrolls that may
            grow the page. ``None`` (the default) keeps the original
            unbounded behaviour; pass a number to guard against pages that
            never stop growing.
    """
    logging.info("Scrolling down the page...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        # Re-locate <body> on every pass rather than caching the element.
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height
        scrolls += 1
    logging.warning(
        "Stopped scrolling after %d scrolls; the page may still be growing",
        max_scrolls,
    )

# Substrings that mark an interests entry as a person ("Top Voices").
_TOP_VOICE_KEYWORDS = ("chair", "head", "keynote", "speaker", "researcher", "coach")


def _find_section(sections, anchor_id):
    """Return the first section containing ``<div id=anchor_id>``, or None."""
    for section in sections:
        if section.find('div', {'id': anchor_id}):
            return section
    return None


def _child_text(node, tag, attrs):
    """Return the stripped text of the first matching descendant, or None."""
    match = node.find(tag, attrs)
    return match.get_text(strip=True) if match else None


def _text_before(text, keyword):
    """Return the stripped part of *text* preceding *keyword*, ignoring case."""
    head = re.split(re.escape(keyword), text, maxsplit=1, flags=re.IGNORECASE)[0]
    return head.strip()


def extract_profile_data(driver, url):
    """Load a profile page and scrape its visible sections into a dict.

    Navigates ``driver`` to ``url``, waits up to 10 seconds for ``<body>``
    to be present, scrolls to the bottom so lazily loaded sections are
    rendered, and parses the resulting page source with BeautifulSoup
    (``lxml`` parser).

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the profile page to scrape.

    Returns:
        dict: Always contains ``'interests'`` (a dict with the list-valued
        keys ``'Top Voices'``, ``'Companies'``, ``'Groups'`` and
        ``'Newsletters'``). The keys ``'name'``, ``'headline'``, ``'about'``,
        ``'education'`` (str), ``'experience'`` (list of unique str, in page
        order) and ``'organizations'`` (list of ``{'name', 'role'}`` dicts)
        are present only when the matching element was found on the page.
    """
    logging.info(f"Scraping profile data from {url}")
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    scroll_down(driver)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    profile_data = {}

    # Single-element fields: (result key, tag name, CSS class attribute).
    simple_fields = (
        ('name', 'h1', 'text-heading-xlarge'),
        ('headline', 'div', 'text-body-medium break-words'),
        ('about', 'div', 'display-flex ph5 pv3'),
    )
    for key, tag, css_class in simple_fields:
        value = _child_text(soup, tag, {'class': css_class})
        if value is not None:
            profile_data[key] = value

    # Every profile card shares this class; each is told apart by an anchor
    # <div> whose id names the section.
    sections = soup.find_all('section', {'class': 'artdeco-card pv-profile-card break-words mt2'})

    education = _find_section(sections, 'education')
    if education is not None:
        profile_data['education'] = education.get_text(strip=True)

    experience = _find_section(sections, 'experience')
    if experience is not None:
        entries = [item.get_text(strip=True) for item in experience.find_all('li')]
        # dict.fromkeys drops duplicate entries while keeping page order.
        profile_data['experience'] = list(dict.fromkeys(entries))

    organizations = _find_section(sections, 'organizations')
    if organizations is not None:
        profile_data['organizations'] = [
            {
                'name': _child_text(item, 'span', {'aria-hidden': 'true'}),
                'role': _child_text(item, 'span', {'class': 't-14 t-normal'}),
            }
            for item in organizations.find_all('li', {'class': 'artdeco-list__item'})
        ]

    # Interests are always reported, with empty lists when nothing is found.
    interests = {
        'Top Voices': [],
        'Companies': [],
        'Groups': [],
        'Newsletters': []
    }
    profile_data['interests'] = interests

    interests_section = _find_section(sections, 'interests')
    if interests_section is not None:
        for item in interests_section.find_all('li'):
            text = item.get_text(strip=True)
            lowered = text.lower()
            # Categories are matched case-insensitively, so the marker word
            # is also cut off case-insensitively (e.g. "Followers").
            if any(kw in lowered for kw in _TOP_VOICE_KEYWORDS):
                interests['Top Voices'].append(text.split("\n")[0])
            elif "followers" in lowered:
                interests['Companies'].append(_text_before(text, "followers"))
            elif "members" in lowered:
                interests['Groups'].append(_text_before(text, "members"))
            elif "published" in lowered:
                interests['Newsletters'].append(_text_before(text, "published"))

    return profile_data

def scrape_website(url):
    """Scrape one profile URL in its own browser and return the data.

    Starts a fresh driver, authenticates it with the session cookie,
    extracts the profile data, prints it as indented JSON, and always
    quits the driver afterwards, including when a step raises.
    """
    browser = get_driver()
    try:
        use_session_cookie(browser)
        scraped = extract_profile_data(browser, url)
        rendered = json.dumps(scraped, indent=4)
        print(rendered)  # Output as JSON
    finally:
        browser.quit()
    return scraped

def scrape_multiple_profiles(urls, max_workers=5):
    """Scrape several profile URLs concurrently.

    Each URL is handled by ``scrape_website`` on a worker thread, and every
    call launches its own browser, so ``max_workers`` also bounds how many
    browsers run at once.

    Args:
        urls: Iterable of profile URLs.
        max_workers: Maximum number of concurrent worker threads.
            Defaults to 5, the previously hard-coded value.

    Returns:
        list: One result per URL, in the same order as ``urls``. An
        exception raised by any worker propagates to the caller.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(scrape_website, urls))

if __name__ == "__main__":
    # Profile URLs to scrape when this file is run as a script.
    linkedin_urls = [
        "https://www.linkedin.com/in/rajender-sharma-bb2a7718/",
        # Add more LinkedIn profile URLs here
    ]
    # Scrape every listed profile; scrape_website prints each result as JSON.
    scrape_multiple_profiles(linkedin_urls)

INFO:root:Using session ID to authenticate
