## 1. Importing required modules

In [9]:
import getpass
import json
import os
import sys
from time import sleep

import selenium
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.actions.mouse_button import MouseButton
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [10]:
# Create the "Documents" output folder; exist_ok=True makes re-running this cell harmless.
os.makedirs("Documents", exist_ok = True)

In [11]:
# Show which Selenium version is installed in the kernel's environment.
print(selenium.__version__)

4.21.0


## 2. Initial setup for the project

In [15]:
# Configuring the options for the chromium webdriver

chrome_options = webdriver.ChromeOptions()

# Register the Chrome command-line switches one by one, in a fixed order.
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [16]:
# Getting the webdriver

# Start the Chrome session with the switches collected in chrome_options.
driver = webdriver.Chrome(options = chrome_options)

ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [8]:
# Logging into Linkedin Account using the cookie value


# The li_at cookie is a session credential: read it with getpass so the value
# is not echoed into the notebook's saved output.
cookie_val = getpass.getpass("Please enter the cookie value: ").strip()
if not cookie_val:
    raise ValueError("The li_at cookie value must not be empty.")

# A page on the linkedin.com domain has to be loaded before the cookie can be
# attached to that domain.
driver.get("https://www.linkedin.com/login")
login_cookie = {
    "name" : "li_at",
    "value" : cookie_val,
    "domain" : ".linkedin.com",
    "path" : "/",
    "secure" : True
}

driver.add_cookie(login_cookie)
# Reload so the next request is sent with the cookie attached.
driver.refresh()

NameError: name 'driver' is not defined

In [9]:
# Example profiles to be scraped

# Sample profile URL kept for manual experiments.
manoj_url = "https://www.linkedin.com/in/mkpentapalli/"

## 3. Required functions

### 3.1 Functions for generic elements

In [10]:
# Type of element taken:

# Integer code -> locator strategy name. The position of each name in the
# tuple is its code (0 = CSS_SELECTOR ... 4 = XPATH).
element_type_dict = dict(enumerate((
    "CSS_SELECTOR",
    "CLASS_NAME",
    "ID",
    "TAG_NAME",
    "XPATH",
)))

In [11]:
# getting the child element from the parent element

def get_element(parent_element=None, child_element_name="", element_type=1, wait_time=8):
    """Wait for one element located under ``parent_element`` and return it.

    Args:
        parent_element: Driver or element the search starts from. When it is
            ``None`` the function returns ``None`` immediately, without
            waiting and without printing anything.
        child_element_name: Locator value (selector, class name, id, tag name
            or XPath, depending on ``element_type``).
        element_type: Key of ``element_type_dict`` choosing the ``By``
            strategy; the default ``1`` is ``CLASS_NAME``.
        wait_time: Timeout handed to ``WebDriverWait``.

    Returns:
        The located element, or ``None`` if the lookup failed for any reason
        (timeout, unknown ``element_type``, stale parent, ...). Failures are
        printed instead of raised, so callers only need a ``None`` check.
    """
    # Callers chain lookups and may hand over the None of a failed one; there
    # is nothing to search in, so skip the lookup instead of provoking (and
    # then printing) an unrelated AttributeError.
    if parent_element is None:
        return None

    try:
        by_attr = getattr(By, element_type_dict[element_type])
        locator = (by_attr, child_element_name)
        return WebDriverWait(parent_element, wait_time).until(EC.presence_of_element_located(locator))
    except Exception as e:
        # Deliberately best-effort: report the problem and signal it with None.
        print(f"{type(e).__name__} occurred for the child element name {child_element_name}.")
        print(e)
        return None

In [12]:
# getting the child elements from the parent element

def get_elements(parent_element=None, child_elements_name="", element_type=1, wait_time=8):
    """Wait for all elements located under ``parent_element`` and return them.

    Args:
        parent_element: Driver or element the search starts from. When it is
            ``None`` the function returns ``None`` immediately, without
            waiting and without printing anything.
        child_elements_name: Locator value (selector, class name, id, tag name
            or XPath, depending on ``element_type``).
        element_type: Key of ``element_type_dict`` choosing the ``By``
            strategy; the default ``1`` is ``CLASS_NAME``.
        wait_time: Timeout handed to ``WebDriverWait``.

    Returns:
        The list of located elements, or ``None`` if the lookup failed for
        any reason. Failures are printed instead of raised, so callers only
        need a truthiness check.
    """
    # Callers chain lookups and may hand over the None of a failed one; there
    # is nothing to search in, so skip the lookup instead of provoking (and
    # then printing) an unrelated AttributeError.
    if parent_element is None:
        return None

    try:
        by_attr = getattr(By, element_type_dict[element_type])
        locator = (by_attr, child_elements_name)
        return WebDriverWait(parent_element, wait_time).until(EC.presence_of_all_elements_located(locator))
    except Exception as e:
        # Deliberately best-effort: report the problem and signal it with None.
        print(f"{type(e).__name__} occurred for the child element name {child_elements_name}.")
        print(e)
        return None

In [13]:
# getting elements from a list of parent elements

def get_elements_from_list(parent_list=None, child_element="", element_type=1, wait_time = 3):
    """Collect the text of the matching children of every parent element.

    Args:
        parent_list: Iterable of parent elements; ``None`` or an empty
            iterable yields an empty result.
        child_element: Locator value forwarded to ``get_elements``.
        element_type: Locator strategy code forwarded to ``get_elements``.
        wait_time: Timeout forwarded to ``get_elements``. It used to be
            accepted but ignored, so the lookup always fell back to the
            default timeout of ``get_elements``.

    Returns:
        One string per parent, in order: the texts of its matching children,
        each followed by a newline. A parent without matches contributes an
        empty string.
    """
    element_list = []
    for element in parent_list or []:
        # get_elements returns None when nothing was found; treat that as
        # "no children" instead of failing while iterating None.
        matches = get_elements(element, child_element, element_type, wait_time) or []
        element_list.append("".join(match.text + "\n" for match in matches))

    return element_list

### 3.2 Functions for specific elements:

In [14]:
# Name

def get_name(parent_element = None) -> str:
    """Return the text of the ``h1`` tag under ``parent_element``.

    Yields ``None`` when no parent is given or the heading cannot be found.
    """
    if not parent_element:
        return None

    heading = get_element(parent_element, "h1", 3)
    if heading:
        return heading.text
    return None

In [15]:
# Pronouns

def get_pronouns(parent_element = None) -> str:
    """Return the text of the pronouns element under ``parent_element``.

    Yields ``None`` when no parent is given or the element cannot be found.
    """
    if not parent_element:
        return None

    pronouns_node = get_element(parent_element, ".text-body-small.v-align-middle.break-words.t-black--light", 0)
    if pronouns_node:
        return pronouns_node.text
    return None

In [16]:
# Location

def get_location(parent_element = None) -> str:
    """Return the text of the location element under ``parent_element``.

    Yields ``None`` when no parent is given or the element cannot be found.
    """
    if not parent_element:
        return None

    location_node = get_element(parent_element, ".text-body-small.inline.t-black--light.break-words", 0)
    if location_node:
        return location_node.text
    return None

In [17]:
# Intro

def get_intro(parent_element = None) -> str:
    """Return the text of the headline element under ``parent_element``.

    Yields ``None`` when no parent is given or the element cannot be found.
    """
    if not parent_element:
        return None

    headline_node = get_element(parent_element, ".text-body-medium.break-words", 0)
    if headline_node:
        return headline_node.text
    return None

In [18]:
# Work Preference

def get_work_preference(parent_element = None) -> str:
    """Derive the work-preference badge from the profile photo's alt text.

    The ``img`` inside the ``pv-top-card--photo-resize`` element is looked up
    and its ``alt`` attribute is inspected for a marker.

    Args:
        parent_element: Element containing the profile photo.

    Returns:
        ``"Open to Work"`` if the alt text contains ``#OPEN_TO_WORK``,
        ``"Hiring"`` if it contains ``#HIRING``, otherwise ``None`` (also when
        the parent, the photo, the image or its alt attribute is missing).
    """
    if not parent_element:
        return None

    photo_element = get_element(parent_element, "pv-top-card--photo-resize", 1)
    img_element = get_element(photo_element, "img", 3) if photo_element else None

    # get_attribute yields None for an absent attribute; normalise to "" so
    # the substring checks below cannot raise a TypeError.
    alt_text = (img_element.get_attribute("alt") if img_element else None) or ""

    if "#OPEN_TO_WORK" in alt_text:
        return "Open to Work"
    if "#HIRING" in alt_text:
        return "Hiring"
    return None

In [19]:
# Job Preferences

def get_job_details(parent_element = None) -> dict:
    """Read the job-preference list into ``{question: [answers]}``.

    Each ``li`` under ``parent_element`` contributes one entry: the text of
    its ``.t-14.t-bold.mb2`` element is the key, and the text of its answer
    section, split on the middle dot and stripped, is the list of values.
    Items lacking either element are skipped.

    Args:
        parent_element: Element wrapping the preference ``li`` items.

    Returns:
        The collected dictionary; empty when there is no parent or no items.
    """
    job_pref_dict = dict()
    if not parent_element:
        return job_pref_dict

    middle_dot = "\u00b7"
    # The previous separator literal was "\u00c2\u00b7", i.e. the UTF-8 bytes
    # of the middle dot mis-decoded as Latin-1, which never occurs in
    # correctly decoded text. Both spellings are accepted.
    garbled_middle_dot = "\u00c2\u00b7"

    for li_element in get_elements(parent_element, "li", 3) or []:
        index_elem = get_element(li_element, ".t-14.t-bold.mb2", 0)
        if not index_elem:
            continue
        index = index_elem.text

        value_elem = get_element(li_element, ".po-job-opportunities-view-preference__answer-section.t-14", 0)
        if not value_elem:
            continue

        answer_text = value_elem.text.replace(garbled_middle_dot, middle_dot)
        job_pref_dict[index] = [part.strip() for part in answer_text.split(middle_dot)]

    return job_pref_dict

In [20]:
# Contact details

def get_contact_details(parent_element = None):
    """Collect the entries of the contact-info card into a dictionary.

    Every ``section`` under ``parent_element`` is read and the text of its
    ``h3`` becomes the key:

    * Headings containing "Websites", "IM" or "Phone" map to a nested dict
      built from the section's ``li`` rows (an "IM" heading is stored as
      "Instant Messaging"). Website rows give ``{label: link text}`` from the
      row's ``span`` and ``a``; the other rows give ``{label: value}`` from
      the row's second and first ``span``. Leading "(" and trailing ")" are
      removed from labels.
    * Any other heading maps to the text of the section's ``div`` (``None``
      if absent); a heading containing "Your Profile" is stored under
      "Linkedin_Profile_URL".

    Sections without a heading and rows lacking their value or label element
    are skipped instead of raising.

    Args:
        parent_element: Element wrapping the contact ``section`` elements.

    Returns:
        The collected dictionary; empty when there is no parent or no sections.
    """
    contact_details = dict()
    if not parent_element:
        return contact_details

    for contact_element in get_elements(parent_element, "section", 3) or []:
        index_element = get_element(contact_element, "h3", 3)
        if not index_element:
            continue
        index = index_element.text

        # Plain sections: a single value taken from the section's div.
        if not ("Websites" in index or "IM" in index or "Phone" in index):
            value_element = get_element(contact_element, "div", 3)
            value = value_element.text if value_element else None
            if "Your Profile" in index:
                index = "Linkedin_Profile_URL"
            contact_details[index] = value
            continue

        # Multi-row sections. The rename happens before the "Websites" check,
        # so an "IM" heading is always handled as label/value rows.
        if "IM" in index:
            index = "Instant Messaging"
        section_rows = dict()
        contact_details[index] = section_rows
        is_websites = "Websites" in index

        for row_element in get_elements(contact_element, "li", 3) or []:
            if is_websites:
                link_element = get_element(row_element, "a", 3)
                label_element = get_element(row_element, "span", 3)
                if not (link_element and label_element):
                    continue  # incomplete row: no link or no label
                row_value = link_element.text
                row_label = label_element.text
            else:
                spans = get_elements(row_element, "span", 3) or []
                if len(spans) < 2:
                    continue  # incomplete row: value and label spans are both required
                row_value = spans[0].text
                row_label = spans[1].text

            section_rows[row_label.lstrip('(').rstrip(')')] = row_value

    return contact_details

In [21]:
# About section

def get_about(parent_element = None) -> str:
    """Return the text of the "About" block under ``parent_element``.

    The ``span`` inside the ``.display-flex.ph5.pv3`` element is read.
    Yields ``None`` when no parent is given and ``""`` when the span cannot
    be found.
    """
    if not parent_element:
        return None

    about_container = get_element(parent_element, ".display-flex.ph5.pv3", 0)
    about_span = get_element(about_container, "span", 3)
    if about_span:
        return about_span.text
    return ""

### 3.3 Caller function

In [22]:
def getPersonalDetails(driver = None, profile_url = "") -> str:
    """Scrape the top-level details of one profile and return them as text.

    The profile page is loaded, then the following fields are collected in
    this order: Name, Pronouns, Headline, Location, Work_Preference,
    Job_Preferences, Contact_Details and (only when non-empty) About.

    Args:
        driver: Selenium WebDriver used for navigation and lookups. Required
            despite the ``None`` default. Presumably it must already hold an
            authenticated session - confirm against the login cell.
        profile_url: URL of the profile to scrape.

    Returns:
        A plain-text report (not a dict) with one ``"<key>:\\n\\n<value>\\n\\n\\n"``
        block per collected field. Fields that could not be read appear with
        the value ``None``.
    """
    personal_details = dict()

    driver.get(profile_url)

    # Scraping for profile details:
    top_card = get_element(driver, "scaffold-layout__main")
    top_panel = get_element(top_card, ".mt2.relative", 0) if top_card else None

    personal_details["Name"] = get_name(top_panel)
    personal_details["Pronouns"] = get_pronouns(top_panel)
    personal_details["Headline"] = get_intro(top_panel)
    personal_details["Location"] = get_location(top_panel)
    personal_details["Work_Preference"] = get_work_preference(top_card)

    # If open to work: open the preferences modal and read it.
    # Job_Preferences stays None unless the modal could actually be opened;
    # previously a missing carousel ended in an UnboundLocalError.
    personal_details["Job_Preferences"] = None
    if personal_details["Work_Preference"] == "Open to Work" and top_card:
        carousel = get_element(top_card, "pv-open-to-carousel", 1)
        clickable_carousel = get_element(carousel, "a", 3) if carousel else None
        if clickable_carousel:
            ActionChains(driver).click(clickable_carousel).perform()
            sleep(3)  # give the modal time to open
            main_job_pref_card = get_element(driver, ".artdeco-modal__content.ember-view.relative", 0)
            main_job_pref_list = (
                get_element(main_job_pref_card, ".list-style-none.mv2.ml5", 0)
                if main_job_pref_card else None
            )
            personal_details["Job_Preferences"] = get_job_details(main_job_pref_list)

    # Go back to the profile (drops any open modal) and re-locate the top
    # panel, because element references from before the reload are stale.
    driver.get(profile_url)
    top_panel = get_element(driver, ".mt2.relative", 0)

    # Contact:
    personal_details["Contact_Details"] = None
    if top_panel:
        profile_button_element = get_element(top_panel, "top-card-text-details-contact-info", 2)
        if profile_button_element:
            ActionChains(driver).click(profile_button_element).perform()
            sleep(3)  # give the contact card time to open
            contact_card_element = get_element(driver, ".pv-profile-section__section-info.section-info", 0)
            if contact_card_element:
                personal_details["Contact_Details"] = get_contact_details(contact_card_element)

    # About: a single reload is enough to leave the contact card and get a
    # fresh page (the page used to be loaded twice in a row here).
    driver.get(profile_url)
    top_card = get_element(driver, "scaffold-layout__main", 1)
    about = get_about(top_card)
    if about:
        personal_details["About"] = about

    # TODO: Experience and Education are not scraped yet. The unfinished draft
    # loaded profile_url + "details/experience/" and
    # profile_url + "details/education/" and read the "visually-hidden"
    # elements of each paged list item.

    return "".join(f"{key}:\n\n{value}\n\n\n" for key, value in personal_details.items())

In [23]:
# Example urls and names:
# The two lists are parallel: names[i] labels the profile found at urls[i].

names = ["Manoj"]
urls = ["https://www.linkedin.com/in/mkpentapalli/"]


In [24]:
# Scrape each listed profile and store the JSON-encoded result as
# Documents/<name>.json, where <name> is the entry of `names` at the same position.
for i, profile_url in enumerate(urls):
    personal_details = getPersonalDetails(driver, profile_url)
    json_str = json.dumps(personal_details, indent = 4)
    with open(f"Documents/{names[i]}.json","w") as f:
        f.write(json_str)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/workspace/scrape-linkedIn/linkedin_scraper/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
  File "/tmp/ipykernel_954/3404691917.py", line 2, in <module>
    personal_details = getPersonalDetails(driver,urls[i])
                                          ^^^^^^
NameError: name 'driver' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/workspace/scrape-linkedIn/linkedin_scraper/lib/python3.12/site-packages/pygments/styles/__init__.py", line 45, in get_style_by_name
ModuleNotFoundError: No module named 'pygments.styles.default'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/workspace/scrape-linkedIn/linkedin_scraper/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 2168, in showtraceback
  File "/workspace/scrape-linkedIn/linkedin_scraper/lib/py

In [21]:
# Shut down the browser session once scraping is finished.
driver.quit()