### LinkedIn Web Scraper

This project scrapes LinkedIn, visiting the connections of my account to collect basic information such as name, current role and past work experience. The processed information is stored in a DataFrame, which can be exported as a CSV file for further data analysis.

In [342]:
# Importing the necessary modules for web scraping
import os
import time
from getpass import getpass
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Selenium drives the Chrome browser through its web driver. Beautiful Soup is then used to parse the loaded pages and extract the information for processing.

#### Initialise Chrome webdriver and login to personal account

Supply your personal LinkedIn username and password for the login step below.

In [343]:
# Credentials are read from the LINKEDIN_USERNAME / LINKEDIN_PASSWORD environment
# variables and prompted for otherwise, so they never need to be typed into (and
# accidentally saved with) the notebook source.
username = os.environ.get("LINKEDIN_USERNAME") or input("LinkedIn username: ")
password = os.environ.get("LINKEDIN_PASSWORD") or getpass("LinkedIn password: ")

# Access webdriver application.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable; the default is the original hard-coded location.
PATH = os.environ.get("CHROMEDRIVER_PATH", "C:/Program Files (x86)/chromedriver.exe")
# NOTE(review): passing the driver path positionally is only accepted by older
# Selenium releases -- confirm against the installed version.
browser = webdriver.Chrome(PATH)

# Get to login page
browser.get("https://www.linkedin.com/uas/login")

# Find html elements for username and password.
# find_element(By.ID, ...) is used because the find_element_by_* helpers were
# removed from newer Selenium releases.
usernameID = browser.find_element(By.ID, 'username')
usernameID.send_keys(username)
passwordID = browser.find_element(By.ID, 'password')
passwordID.send_keys(password)

# Submit username and password for login
passwordID.submit()

In [74]:
# Function to ensure that webdriver is loaded till the bottom of the page
# before beautiful soup object is initialised to parse info
def scrollToBottom(pauseTime=5, maxScrolls=3):
    """Scroll the page shown in the module-level ``browser`` down to its bottom.

    The page is scrolled repeatedly; scrolling stops early as soon as a scroll
    no longer changes the height of the document body.

    Args:
        pauseTime: Seconds to wait after each scroll so the page can load.
        maxScrolls: Maximum number of scroll attempts.

    Returns:
        None.
    """
    # Height of the document before the first scroll
    last_height = browser.execute_script("return document.body.scrollHeight")

    for _ in range(maxScrolls):
        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")

        # Wait to load page
        time.sleep(pauseTime)

        # An unchanged height means nothing new was loaded, so stop scrolling
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [75]:
# Parse the page currently shown in the browser with beautiful soup
def buildSoup():
    """Return a BeautifulSoup object (lxml parser) built from ``browser.page_source``."""
    return BeautifulSoup(browser.page_source, 'lxml')

In [334]:
# Helper: text of the first <span> nested inside `tag`, or None when either is missing
def _innerSpanText(tag):
    if tag is None:
        return None
    inner = tag.find("span")
    return None if inner is None else inner.text


# Helper: split a job period string into a (start, end) pair
def _splitJobPeriod(jobPeriod):
    jobDates = jobPeriod.split("-")
    if len(jobDates) == 2:
        # Anything after "·" in the end part is dropped
        return jobDates[0].strip(), jobDates[1].split("·")[0].strip()
    # No single "-" separator: the same value is used for start and end
    onlyDate = jobPeriod.split("·")[0].strip()
    return onlyDate, onlyDate


# Function to extract information while on a profile page
def getInfo(soup):
    """Extract basic details and work experience from a parsed profile page.

    Args:
        soup: BeautifulSoup object built from a profile page.

    Returns:
        A flat list ``[name, title, organisation, location]`` followed by four
        entries ``jobTitle, jobCompany, jobStart, jobEnd`` for every job found
        in the Experience section. Only the four basic entries are returned
        when no Experience section (or no list inside it) is found. A job
        without a company or period element gets empty strings for the
        missing fields; job title and company are stripped of whitespace.

    Raises:
        AttributeError, IndexError: If the header part of the page (name,
            title, organisation, location) does not have the expected layout.
    """
    # Get name of person, title of their role
    infoSection = soup.find("div", {"class": "ph5"})
    leftDiv = infoSection.find("div", {"class": "pv-text-details__left-panel"})
    leftParts = leftDiv.find_all("div")
    name = leftParts[0].find("h1").text.strip()
    title = leftParts[1].text.strip()

    # Get organisation the person works in, location of their role
    rightDiv = infoSection.find("ul", {"class": "pv-text-details__right-panel"})
    organisation = rightDiv.find("h2").text.strip()
    # The location span is looked up in the second sibling after the right panel
    location = rightDiv.next_sibling.next_sibling.find("span").text.strip()

    basicInfo = [name, title, organisation, location]

    # Get work experience of the person
    exp = soup.find(lambda tag: tag.name == "span" and "Experience" in tag.text)
    if exp is None:
        return basicInfo
    expSection = exp.find_parent("section")
    expList = expSection.find("ul") if expSection is not None else None
    if expList is None:
        return basicInfo

    # Flat list storing four values per job
    jobs = []
    for job in expList.find_all("li"):
        jobTitle = _innerSpanText(job.find("span", {"class": "t-bold mr1 hoverable-link-text"}))
        if jobTitle is None:
            # List items without a job title element are not job entries
            continue
        # Missing company / period elements yield empty strings instead of an error
        jobCompany = _innerSpanText(job.find("span", {"class": "t-14 t-normal"})) or ""
        jobPeriod = _innerSpanText(job.find("span", {"class": "t-14 t-normal t-black--light"})) or ""
        jobStart, jobEnd = _splitJobPeriod(jobPeriod)

        jobs.extend([jobTitle.strip(), jobCompany.split("·")[0].strip(), jobStart, jobEnd])

    return basicInfo + jobs

In [248]:
# Function to store extracted information into a dataframe
def storeDataFrame(infoDF, infoList):
    """Add one person's information as a new first row of ``infoDF``.

    Args:
        infoDF: DataFrame holding the rows collected so far (may be empty).
        infoList: Flat list ``[name, title, organisation, location]`` followed
            by groups of four values (job title, company, start date, end date).

    Returns:
        A new DataFrame with the new row placed before the existing rows and
        the index reset. Job columns are numbered from 1, e.g. ``Job Title 1``,
        ``Company 1``, ``Start Date 1``, ``End Date 1``.

    Raises:
        IndexError: If ``infoList`` has fewer than four entries.
    """
    # Initialise dictionary to store basic information
    infoDict = {
        'Name': infoList[0],
        'Title': infoList[1],
        'Organisation': infoList[2],
        'Location': infoList[3],
    }

    # Add each job into the dictionary: every run of four values is one job
    jobKeys = ["Job Title", "Company", "Start Date", "End Date"]
    for offset, value in enumerate(infoList[4:]):
        jobIndex, field = divmod(offset, 4)
        infoDict[f"{jobKeys[field]} {jobIndex + 1}"] = value

    # Add extra row for this person in front of the existing rows
    new_row = pd.DataFrame(infoDict, index = [0])
    infoDF = pd.concat([new_row, infoDF]).reset_index(drop = True)
    return infoDF

In [194]:
# Create link to connections list page
def createLink(connectionHref):
    """Turn an href taken from a LinkedIn page into an absolute url.

    Args:
        connectionHref: The href value; may be relative (with or without a
            leading "/") or already an absolute url.

    Returns:
        The absolute url. Relative hrefs are resolved against
        ``https://www.linkedin.com/``; a leading "/" no longer produces a
        double slash and absolute urls are returned unchanged.
    """
    return urljoin("https://www.linkedin.com/", connectionHref)

In [195]:
# Function to add connection urls to a queue
def addProfile(profileQueue, connectionLink, waitTimeout=50):
    """Collect the profile hrefs from every page of a connections list.

    Opens ``connectionLink`` in the module-level ``browser`` and walks through
    the result pages with the pagination "next" button, appending every href
    that is not yet in ``profileQueue``.

    Args:
        profileQueue: List of profile hrefs collected so far; extended in place.
        connectionLink: Url of the first page of the connections list.
        waitTimeout: Seconds to wait for the "next" button to appear.

    Returns:
        The same ``profileQueue`` list, with the newly found hrefs appended.
    """
    # Access the connectionLink from initial link
    browser.get(connectionLink)

    # Set used for fast duplicate checks; the list keeps the discovery order
    seen = set(profileQueue)

    # Loop that runs as long as there are extra pages of connections to access
    while True:
        scrollToBottom()
        connectionSoup = buildSoup()

        connectionList = connectionSoup.find_all("span", {"class": "entity-result__title-text t-16"})

        for connection in connectionList:
            # Skip results that carry no link instead of failing on them
            connectionTag = connection.find("a", href=True)
            if connectionTag is None:
                continue
            profileHref = connectionTag["href"]
            if profileHref not in seen:
                seen.add(profileHref)
                profileQueue.append(profileHref)

        # Wait for the button to be present rather than clickable: a disabled
        # button never counts as clickable, so the disabled check below could
        # not be reached on the last page.
        try:
            nextButton = WebDriverWait(browser, waitTimeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "artdeco-pagination__button--next"))
            )
        except TimeoutException:
            # No "next" button appeared: treat the current page as the last one
            break

        buttonClasses = nextButton.get_attribute('class') or ''
        if 'artdeco-button--disabled' in buttonClasses or not nextButton.is_enabled():
            break
        nextButton.click()

    return profileQueue

The functions above are used to extract the information.

In [None]:
# Queue (a plain list) of profile urls that are waiting to be processed
profileQueue = list()

# Empty dataframe that receives the extracted information
infoDF = pd.DataFrame()

Enter the starting page (the link to your personal page) from which the connections are accessed.

In [324]:
# Access link to personal page (fill in the url before running this cell)
initialLink = ""

# Fail early with a clear message instead of handing an empty url to the webdriver
if not initialLink:
    raise ValueError("initialLink is empty: set it to the url of the starting profile page")

# Access the initial link using webdriver
browser.get(initialLink)

# Scroll to bottom of page to access entire dom
scrollToBottom()

In [325]:
# Create a beautiful soup instance for current profile page.
# `soup` is kept as a global because a later cell reads the connections link from it.
soup = buildSoup()

# Extract information from soup: a flat list of basic details followed by job entries
infoList = getInfo(soup)

# Store information into data frame.
# Note: storeDataFrame adds a row on every call, so re-running this cell
# adds the same profile again.
infoDF = storeDataFrame(infoDF, infoList)

#### Access connections to get information

Use Selenium to look through all connections and store their profile links in a list, so that each profile can be processed individually later.

In [83]:
# Get link to list of connections
connectionSpan = soup.find("span", {"class": "link-without-visited-state"})
if connectionSpan is None:
    raise ValueError("Could not find the connections link on the current profile page")

# The href is read from the element that directly wraps the span
connectionTag = connectionSpan.parent
connectionHref = connectionTag["href"]

# Name fixed from the misspelt `connectinLink`: the next cell reads `connectionLink`
connectionLink = createLink(connectionHref)

In [None]:
# Get all connection urls: addProfile opens connectionLink, pages through the
# connections list and appends every href not yet in profileQueue, then
# returns that same list.
profileQueue = addProfile(profileQueue, connectionLink)

In [320]:
# Report how many profile urls were collected
print(f'Total number of connections: {len(profileQueue)}')

Total number of connecitions: 254


In [None]:
# Access each profile to extract information
for profileUrl in profileQueue:
    browser.get(profileUrl)
    scrollToBottom()

    # Create a beautiful soup instance for current profile page
    soup = buildSoup()

    # Extract information from soup; a page with an unexpected layout is
    # reported and skipped so that one profile cannot abort the whole run
    try:
        infoList = getInfo(soup)
    except (AttributeError, IndexError, TypeError) as err:
        print(f"Skipping {profileUrl}: could not parse profile ({err!r})")
        continue
    print(f"Processing {infoList[0]}'s profile")

    # Store information into data frame
    infoDF = storeDataFrame(infoDF, infoList)

Export the data to a CSV file.

In [339]:
# Write the collected rows to 'linkedin-info.csv' (relative path), leaving out the index column
infoDF.to_csv('linkedin-info.csv', index=False)