### LinkedIn Web Scraper

This project scrapes LinkedIn, walking through the connections of my account to collect basic information such as name, current role and past work experience. The processed information is stored in a dataframe, which can be exported as a CSV file for further data analysis.

In [126]:
# Importing the necessary modules for web scraping
import requests, time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


Selenium is used to drive the Chrome browser through its web driver. Beautiful Soup is then used to parse the loaded pages and extract the information for processing.

#### Initialise Chrome webdriver and login to personal account

Input personal username and password in the relevant fields for login purposes

In [2]:
# Path to the local ChromeDriver executable
PATH = "C:/Program Files (x86)/chromedriver.exe"
# NOTE(review): newer Selenium releases may expect the driver path to be
# supplied differently — confirm against the installed Selenium version.
browser = webdriver.Chrome(PATH)

# Get to login page
browser.get("https://www.linkedin.com/uas/login")

# Input username and password (fill these in locally; never commit real
# credentials)
username = ""
password = ""

# Locate the login form fields by their element ids and type the credentials.
# find_element(By.ID, ...) is used instead of the find_element_by_id shortcut,
# which is no longer available in current Selenium releases.
usernameID = browser.find_element(By.ID, 'username')
usernameID.send_keys(username)
passwordID = browser.find_element(By.ID, 'password')
passwordID.send_keys(password)

# Submit the form the password field belongs to in order to log in
passwordID.submit()

In [3]:
# Function to ensure that webdriver is loaded till the bottom of the page
# before beautiful soup object is initialised to parse info
def scrollToBottom(pauseTime=5, maxScrolls=3):
    """Scroll the global ``browser`` window to the bottom of the current page.

    The page is scrolled repeatedly, pausing after each scroll so that any
    content added on scroll has time to appear. Scrolling stops early as soon
    as the document height no longer changes between two scrolls.

    Args:
        pauseTime: Seconds to wait after each scroll. Defaults to 5.
        maxScrolls: Maximum number of scroll attempts. Defaults to 3.
    """
    # Height of the document before any scrolling
    last_height = browser.execute_script("return document.body.scrollHeight")

    for _ in range(maxScrolls):
        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")

        # Wait to load page
        time.sleep(pauseTime)

        # An unchanged height means nothing new was appended, so we are done
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [7]:
# Build an instance of beautiful soup based on current browser page
def buildSoup():
    """Return a BeautifulSoup tree (lxml parser) of the page source that the
    global ``browser`` currently holds."""
    return BeautifulSoup(browser.page_source, 'lxml')

In [252]:
# Locate the <section> that contains the "Experience" heading span,
# or return None when the page has no such heading/section
def _findExperienceSection(soup):
    heading = soup.find(lambda tag: tag.name == "span" and tag.text == "Experience")
    if heading is None:
        return None
    return heading.find_parent("section")


# Text of the first <span> nested inside ``tag``; None when either is missing
def _innerSpanText(tag):
    if tag is None:
        return None
    inner = tag.find("span")
    return inner.text if inner is not None else None


# Function to extract information while on a profile page
def getInfo(soup):
    """Extract headline details and work experience from a profile page.

    Side effect: when the experience section contains a "more experience"
    link, that link is clicked in the global ``browser`` and the page the
    browser then shows is re-parsed, so the browser may no longer be on the
    profile page when this function returns.

    Args:
        soup: BeautifulSoup tree of the profile page currently loaded in the
            global ``browser``.

    Returns:
        dict with "Name" and "Title"; additionally "Organisation" and
        "Location" when the right-hand details panel exists; and
        "Job N Title" / "Job N Company" / "Job N Period" for every experience
        entry found (a value is None when its element is missing).
    """
    # Get name of person, title of their role
    infoSection = soup.find("div", {"class": "ph5"})
    leftDiv = infoSection.find("div", {"class": "pv-text-details__left-panel"})
    leftItems = leftDiv.find_all("div")
    infoDict = {
        "Name": leftItems[0].find("h1").text.strip(),
        "Title": leftItems[1].text.strip(),
    }

    # Get organisation the person works in, location of their role
    rightDiv = infoSection.find("ul", {"class": "pv-text-details__right-panel"})
    if rightDiv:
        infoDict["Organisation"] = rightDiv.find("h2").text.strip()
        # The location span sits in the node two siblings after the panel
        infoDict["Location"] = rightDiv.next_sibling.next_sibling.find("span").text.strip()

    # Get work experience section; profiles without one are returned as-is
    expSection = _findExperienceSection(soup)
    if expSection is None:
        return infoDict

    # If there is "more experience" tab, click into tab
    expMore = expSection.find("span", {"class": "pvs-navigation__text"})
    if expMore:
        expMoreText = expMore.text.strip()
        element = browser.find_element(By.LINK_TEXT, expMoreText)
        ActionChains(browser).move_to_element(element).perform()
        moreButton = WebDriverWait(browser, 20).until(
            EC.element_to_be_clickable((By.LINK_TEXT, expMoreText)))
        moreButton.click()

        # ``soup`` was parsed before the click and cannot contain the expanded
        # list, so parse what the browser shows now. Keep the section already
        # found if the new page has no recognisable "Experience" section.
        scrollToBottom()
        refreshedSection = _findExperienceSection(
            BeautifulSoup(browser.page_source, 'lxml'))
        if refreshedSection is not None:
            expSection = refreshedSection

    # Get list of experiences
    expList = expSection.find("ul")
    if expList is None:
        return infoDict

    # Add one numbered group of keys per past job; list items without a
    # title span are skipped
    count = 1
    for job in expList.find_all("li"):
        jobTitle = job.find("span", {"class": "mr1 t-bold"})
        if not jobTitle:
            continue

        jobCompany = _innerSpanText(job.find("span", {"class": "t-14 t-normal"}))
        if jobCompany is not None:
            # Keep only the part before the "·" separator and drop the
            # whitespace the split leaves behind
            jobCompany = jobCompany.split("·")[0].strip()
        jobPeriod = _innerSpanText(
            job.find("span", {"class": "t-14 t-normal t-black--light"}))

        jobString = "Job " + str(count)
        infoDict[jobString + " Title"] = _innerSpanText(jobTitle)
        infoDict[jobString + " Company"] = jobCompany
        infoDict[jobString + " Period"] = jobPeriod
        count += 1

    return infoDict

In [246]:
# Function to store extracted information into a dataframe
def storeDataFrame(infoDF, infoDict):
    """Return a dataframe made of ``infoDF`` plus one row built from ``infoDict``.

    The dict's keys become the columns of the new row. The result is a new
    dataframe whose row index is renumbered from 0; ``infoDF`` itself is not
    modified.
    """
    # Single-row frame for this person
    personRow = pd.DataFrame(infoDict, index=[0])
    # Stack it under the existing rows and renumber the index in one step
    return pd.concat([infoDF, personRow], ignore_index=True)

In [10]:
# Create link to connections list page
def createLink(connectionHref):
    """Turn an href scraped from a LinkedIn page into an absolute URL.

    The href is resolved against ``https://www.linkedin.com/``: relative
    hrefs (with or without a leading slash) are placed under that base
    without producing a double slash, and hrefs that are already absolute
    URLs are returned unchanged.

    Args:
        connectionHref: The ``href`` attribute value to resolve.

    Returns:
        The absolute URL as a string.
    """
    from urllib.parse import urljoin

    return urljoin("https://www.linkedin.com/", connectionHref)

In [11]:
# Function to add connection urls to a queue
def addProfile(profileQueue, connectionLink):
    """Collect profile hrefs from every page of a connections listing.

    Opens ``connectionLink`` in the global ``browser``, then for each result
    page appends every profile href not already queued to ``profileQueue``
    and clicks the pagination "next" button. Collection ends when no
    clickable "next" button appears within 20 seconds.

    Args:
        profileQueue: List of profile hrefs; extended in place.
        connectionLink: URL of the first page of the connections listing.

    Returns:
        The same ``profileQueue`` list, now including the newly found hrefs.
    """
    from selenium.common.exceptions import TimeoutException

    # Access the connectionLink from initial link
    browser.get(connectionLink)

    # Set mirror of the queue so the duplicate check is O(1) per href
    queued = set(profileQueue)

    # Loop runs as long as there are extra pages of connections to access
    while True:
        scrollToBottom()
        connectionSoup = buildSoup()

        connectionList = connectionSoup.find_all("span", {"class": "entity-result__title-text t-16"})

        for connection in connectionList:
            # assumes the second child node of the title span is the profile
            # anchor — TODO confirm against the current page markup
            profileHref = connection.contents[1]["href"]
            if profileHref not in queued:
                queued.add(profileHref)
                profileQueue.append(profileHref)

        try:
            nextButton = WebDriverWait(browser, 20).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "artdeco-pagination__button--next")))
        except TimeoutException:
            # No clickable "next" button within the timeout: last page reached
            print(f"---Final number of profiles in queue: {len(profileQueue)}---")
            return profileQueue
        print(f"---Current number of profiles in queue: {len(profileQueue)}---")
        nextButton.click()

The functions above are used to extract the information.

In [249]:
# Initialise an empty dataframe to store information; storeDataFrame returns
# a copy with one more row each time it is called
infoDF = pd.DataFrame()

In [48]:
# Initialise a queue (plain list) of profile hrefs still to be processed;
# addProfile appends to it
profileQueue = []

Input relevant starting page to access connections

In [49]:
# Profile page used as the starting point
initialLink = "https://www.linkedin.com/in/ching-hong-fung/"

# Load the starting profile in the webdriver-controlled browser
browser.get(initialLink)

# Scroll to bottom of page so the whole DOM is available before parsing
scrollToBottom()

In [50]:
# Quick check: parse the page currently loaded in the browser and print what
# getInfo extracts from it.
# NOTE: despite its name, infoList holds the dict returned by getInfo.
soup = buildSoup()
infoList = getInfo(soup)
print(infoList)

['Ching Hong (Jacky) Fung', 'Software Developer Intern at BGC Partners | First-Class MEng Engineering Graduate from University of Oxford', 'BGC Partners', 'London, England, United Kingdom']


In [51]:
# Create a beautiful soup instance for current profile page
soup = buildSoup()

# Extract information from soup (getInfo returns a dict, despite the
# variable being called infoList)
infoList = getInfo(soup)

# Store information into data frame as a new row
infoDF = storeDataFrame(infoDF, infoList)

In [52]:
# Preview the first rows of the dataframe collected so far
infoDF.head()

Unnamed: 0,Name,Title,Organisation,Location
0,Ching Hong (Jacky) Fung,Software Developer Intern at BGC Partners | Fi...,BGC Partners,"London, England, United Kingdom"


In [53]:
# Reset the queue to an empty list before collecting connection profiles
profileQueue = []

#### Access connections to get information

Use Selenium to page through all connections and store their profile links in a list for individual profile extraction later.

In [54]:
# Get link to list of connections: find the first span with the
# "link-without-visited-state" class in the parsed profile page, take the
# href of its parent element and turn it into a full URL with createLink.
# NOTE(review): presumably the parent is the <a> pointing at the connections
# list — confirm against the page markup.
connectionSpan = soup.find("span", {"class": "link-without-visited-state"})
connectionTag = connectionSpan.parent
connectionHref = connectionTag["href"]
connectionLink = createLink(connectionHref)

In [55]:
# Get all connection urls: addProfile opens connectionLink, walks through the
# result pages and returns the queue with the profile hrefs it found
profileQueue = addProfile(profileQueue, connectionLink)

---Current number of profiles in queue: 10---
---Current number of profiles in queue: 20---
---Current number of profiles in queue: 30---
---Current number of profiles in queue: 40---
---Current number of profiles in queue: 50---
---Current number of profiles in queue: 60---
---Current number of profiles in queue: 70---
---Current number of profiles in queue: 80---
---Current number of profiles in queue: 90---
---Current number of profiles in queue: 100---
---Current number of profiles in queue: 110---
---Current number of profiles in queue: 120---
---Current number of profiles in queue: 130---
---Current number of profiles in queue: 140---
---Current number of profiles in queue: 150---
---Current number of profiles in queue: 160---
---Current number of profiles in queue: 170---
---Current number of profiles in queue: 180---
---Current number of profiles in queue: 190---
---Current number of profiles in queue: 200---
---Current number of profiles in queue: 210---
---Current number of p

In [None]:
# Access each profile to extract information
for profileUrl in profileQueue:
    # Load the profile and scroll so the whole page is available for parsing
    browser.get(profileUrl)
    scrollToBottom()

    # Create a beautiful soup instance for current profile page
    soup = buildSoup()

    # Extract information from soup
    infoDict = getInfo(soup)
    print(f"Processing {infoDict['Name']}'s profile")

    # Store the information of the profile just scraped as a new row
    infoDF = storeDataFrame(infoDF, infoDict)

Export data into csv file.

In [339]:
# Write the collected rows to linkedin-info.csv; index=False leaves the
# dataframe's row index out of the file
infoDF.to_csv('linkedin-info.csv', index=False)