### LinkedIn Web Scraper

This project scraps the LinkedIn site looking at connections of my account to store basic information including name, current role, past work experience, etc. The processed information is stored in a dataframe which could be exported as csv files for further data analysis.

In [1]:
# Importing the necessary modules for web scraping
import requests, time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


Selenium is used for navigating the Chrome web driver. Beautiful soup is then used to extract the information for processing.

#### Initialise Chrome webdriver and login to personal account

Input personal username and password in the relevant fields for login purposes

In [2]:
# Access webdriver application
PATH = "C:/Program Files (x86)/chromedriver.exe"
browser = webdriver.Chrome(PATH)

# Get to login page
browser.get("https://www.linkedin.com/uas/login")

# Input username and password
username="jacky.fung.20xx@gmail.com"
password="555913"

# Find html elements for username and password
usernameID = browser.find_element_by_id('username')
usernameID.send_keys(username)
passwordID = browser.find_element_by_id('password')
passwordID.send_keys(password)

# Submit username and password for login
passwordID.submit()

In [3]:
# Function to ensure that webdriver is loaded till the bottom of the page 
# before beautiful soup object is initialised to parse info
def scrollToBottom():
    # Pause time
    SCROLL_PAUSE_TIME = 5

    # Get scroll height
    last_height = browser.execute_script("return document.body.scrollHeight")

    for i in range(3):
        # Sroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [4]:
# Build an instance of beautiful soup based on current browser page
def buildSoup():
    src = browser.page_source
    soup = BeautifulSoup(src, 'lxml')
    return soup

In [74]:
# Function to extract information while on a profile page
def getInfo(soup):
    
    # Get name of person, title of their role
    infoSection = soup.find("div", {"class": "ph5"})
    leftDiv = infoSection.find("div", {"class": "pv-text-details__left-panel"})
    name = leftDiv.find_all("div")[0].find("h1").text.strip()
    title = leftDiv.find_all("div")[1].text.strip()
    
    # Get organisation the person works in, location of their role
    rightDiv = infoSection.find("ul", {"class": "pv-text-details__right-panel"})
    if rightDiv is not None:
        organisation = rightDiv.find("h2").text.strip()
        location = rightDiv.next_sibling.next_sibling.find("span").text.strip()
    
    # Get work experience of the person
    exp = soup.find(lambda tag: tag.name == "span" and tag.text == "Experience")
    if exp is None:
        return [name, title, organisation, location]
    expSection = exp.find_parent("section")
    expSection = expSection.find("ul")
    jobList = expSection.find_all("li")
    # Initialise a list to store information about their part jobs
    jobs = []
    for job in jobList:
        jobTitle = job.find("span", {"class": "t-bold mr1 hoverable-link-text"})
        jobCompany = job.find("span", {"class": "t-14 t-normal"})
        jobPeriod = job.find("span", {"class": "t-14 t-normal t-black--light"})
        if jobTitle is not None:
            jobTitle = jobTitle.find("span").text
            jobCompany = jobCompany.find("span").text
            jobCompany = jobCompany.split("·")[0]
            jobPeriod = jobPeriod.find("span").text
            jobDates = jobPeriod.split("-")
            if len(jobDates) == 2:
                jobStart = jobDates[0].strip()
                jobEnd = jobDates[1].split("·")[0].strip()
            else:
                jobStart = jobPeriod.split("·")[0].strip()
                jobEnd = jobPeriod.split("·")[0].strip()

            jobs.extend([jobTitle, jobCompany, jobStart, jobEnd])
    
    if rightDiv is not None:
        return [name, title, organisation, location] + jobs
    else:
        return [name, title] + jobs

In [75]:
# Function to store extracted information into a dataframe
def storeDataFrame(infoDF, infoList):
    if len(infoList) == 2:
        name = infoList[0]
        title = infoList[1]
        infoDict = {'Name': name, 'Title': title, 'Organisation': '', 'Location': ''}
    elif len(infoList) == 4:
        name = infoList[0]
        title = infoList[1]
        organisation = infoList[2]
        location = infoList[3]
        infoDict = {'Name': name, 'Title': title, 'Organisation': organisation, 'Location': location}
    
    # Add each job into the dictionary
    count = 1
    for idx in range(4, len(infoList)):
        if idx % 4 == 0:
            key = "Job Title " + str(count)
        elif idx % 4 == 1:
            key = "Comany " + str(count)
        elif idx % 4 == 2:
            key = "Start Date " + str(count)
        elif idx % 4 == 3:
            key = "End Date " + str(count)
            count += 1
        infoDict[key] = infoList[idx]
        
    # Add extra row for each person into dictionary 
    new_row = pd.DataFrame(infoDict, index = [0])
    infoDF = pd.concat([infoDF, new_row]).reset_index(drop = True)
    return infoDF

In [7]:
# Create link to connections list page
def createLink(connectionHref):
    connectionLink = "https://www.linkedin.com/" + connectionHref
    return connectionLink

In [54]:
# Function to add connection urls to a queue
def addProfile(profileQueue, connectionLink):
    
    # Access the connectionLink from initial link
    browser.get(connectionLink)
    
    # Count number of connections
    countConnections = 0
    
    # while loop that runs as long as there are extra pages of connections to access
    while True:
        scrollToBottom()
        connectionSoup = buildSoup()
        
        connectionList = connectionSoup.find_all("span", {"class": "entity-result__title-text t-16"})

        for connection in connectionList:
            connectionTag = connection.contents[1]
            profileHref = connectionTag["href"]
            if profileHref not in profileQueue:
                profileQueue.append(profileHref)
        
        countConnections += len(connectionList)
        
        try:
            nextButton = WebDriverWait(browser, 20).until(
                        EC.element_to_be_clickable((By.CLASS_NAME, "artdeco-pagination__button--next")))
        except:
            print(f"---Final number of profiles in queue: {countConnections}---")
            return profileQueue;
        print(f"---Current number of profiles in queue: {countConnections}---")
        nextButton.click()
            
    return profileQueue

Functions above used to extract information

In [66]:
# Initialise a dataframe to store information
infoDF = pd.DataFrame()

In [55]:
# Initialise a queue for processing profiles
profileQueue = []

Input relevant starting page to access connections

In [49]:
# Access link to personal page
initialLink = "https://www.linkedin.com/in/ching-hong-fung/"

# Access the initial link using webdriver
browser.get(initialLink)

# Scroll to bottom of page to access entire dom
scrollToBottom()

In [50]:
# Create a beautiful soup instance for current profile page
soup = buildSoup()

# Extract information from soup
infoList = getInfo(soup)

# Store information into data frame
infoDF = storeDataFrame(infoDF, infoList)

In [51]:
infoDF.head()

Unnamed: 0,Name,Title,Organisation,Location,Job Title 1,Comany 1,Start Date 1,End Date 1,Job Title 2,Comany 2,...,Start Date 3,End Date 3,Job Title 4,Comany 4,Start Date 4,End Date 4,Job Title 5,Comany 5,Start Date 5,End Date 5
0,Ching Hong (Jacky) Fung,MEng finalist at University of Oxford,University of Oxford,"Oxford, England, United Kingdom",Student Researcher,"University of Oxford, Department of Engineerin...",Jun 2021,Sep 2021,Private Tutor,英寰教育,...,Aug 2019,Sep 2019,Technology Intern,China State Construction International Holding...,Jun 2019,Aug 2019,Research Student - CREST Award,British Science Association,Apr 2017,Apr 2018


#### Access connections to get information

Use selenium functions to look through all connections and storing them into a list for individual profile extraction later.

In [52]:
# Get link to list of connections
connectionSpan = soup.find("span", {"class": "link-without-visited-state"})
connectionTag = connectionSpan.parent
connectionHref = connectionTag["href"]
connectionLink = createLink(connectionHref)

In [56]:
# Get all connection urls
profileQueue = addProfile(profileQueue, connectionLink)

---Current number of profiles in queue: 10---
---Current number of profiles in queue: 20---
---Current number of profiles in queue: 30---
---Current number of profiles in queue: 40---
---Current number of profiles in queue: 50---
---Current number of profiles in queue: 60---
---Current number of profiles in queue: 70---
---Current number of profiles in queue: 80---
---Current number of profiles in queue: 90---
---Current number of profiles in queue: 100---
---Current number of profiles in queue: 110---
---Current number of profiles in queue: 120---
---Current number of profiles in queue: 130---
---Current number of profiles in queue: 140---
---Current number of profiles in queue: 150---
---Current number of profiles in queue: 160---
---Current number of profiles in queue: 170---
---Current number of profiles in queue: 180---
---Current number of profiles in queue: 190---
---Current number of profiles in queue: 200---
---Current number of profiles in queue: 210---
---Current number of p

In [68]:
# Access each profile to extract information
for idx, profileUrl in enumerate(profileQueue):
    browser.get(profileUrl)
    scrollToBottom()
    
    # Create a beautiful soup instance for current profile page
    soup = buildSoup()

    # Extract information from soup
    infoList = getInfo(soup)
    print(f"Processing {infoList[0]}'s profile")

    # Store information into data frame
    infoDF = storeDataFrame(infoDF, infoList)

Processing Yi Tu's profile
Processing Nicholas Hwong's profile
Processing Qi Chen's profile
Processing Peter Zhang's profile
Processing Shamil Amirov's profile
Processing Yingtong (Ashley) Chen's profile
Processing Jiaxi Geng's profile
Processing Jonathan Ho's profile
Processing Yichen (Mac) Zhou's profile
Processing (Tony) Mincong Zhang's profile
Processing Josiah Price's profile
Processing Yuan(Bryan) Lu's profile
Processing Claudio Bardhoshi's profile
Processing Aman Nath's profile
Processing Joseph Hunt's profile
Processing Monty Beresford's profile
Processing Taylor Yu's profile
Processing Clinton Ng's profile
Processing Jiayun Cao's profile
Processing Len Ma's profile
Processing Michael (Jiangtian) Yu's profile
Processing Qianhui (Doris) Zhao's profile
Processing Philip Gong's profile
Processing Kelly Ragyeom Kim's profile
Processing Hibban Rahman's profile
Processing Avi Kumar's profile
Processing Brian Tam's profile
Processing Matthew Chiu's profile
Processing Shuntian Liu's pr

AttributeError: 'NoneType' object has no attribute 'find'

In [76]:
infoDF

Unnamed: 0,Name,Title,Organisation,Location,Job Title 1,Comany 1,Start Date 1,End Date 1,Job Title 2,Comany 2,...,Start Date 3,End Date 3,Job Title 4,Comany 4,Start Date 4,End Date 4,Job Title 5,Comany 5,Start Date 5,End Date 5
0,Yi Tu,Student at University of Oxford,University of Oxford,Greater Oxford Area,Quantitative Research Intern,Jump Trading LLC,Aug 2021,Nov 2021,Global Markets Summer Analyst,Goldman Sachs,...,Apr 2021,Apr 2021,Women in Trading and Technology (WiTT),Jane Street,Sep 2020,Sep 2020,Quantitative Trading Summer Intern,Optiver,Jul 2020,Aug 2020
1,Nicholas Hwong,Future Trainee Solicitor at Ashurst,Ashurst,"London, England, United Kingdom",Ashurst,7 mos,"London, England, United Kingdom","London, England, United Kingdom",Vacation Scheme Offer Holder,Linklaters,...,Jan 2021,Jul 2021,First Year Insight Scheme,Bryan Cave Leighton Paisner LLP,Dec 2020,Dec 2020,Mini Pupil,Alexandra Chambers,Aug 2020,Aug 2020
2,Qi Chen,Summer Researcher at University of Oxford,英国牛津大学,"Oxford, England, United Kingdom",Summer Researcher,英国牛津大学,Jun 2021,Sep 2021,Data Mining Intern,NetEase Games,...,Jul 2019,Oct 2019,,,,,,,,
3,Peter Zhang,Final Year Engineering Student at Oxford,University of Oxford,"Oxford, England, United Kingdom",Macro Trading Intern,BlueCrest Capital Management,Sep 2021,Present,Macro Trading Intern,Rokos Capital Management,...,Jul 2020,Aug 2020,,,,,,,,
4,Shamil Amirov,Summer Analyst at Morgan Stanley,UCL,"London, England, United Kingdom",Summer Analyst,Morgan Stanley,Jun 2021,Aug 2021,Summer Intern,EY,...,Jun 2019,Jul 2019,Activity Leader,Our World English Schools Limited,Jul 2017,Jul 2019,Data Analyst Internship,Troika Property,Jun 2018,Sep 2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,Luca Iovino,"Risk Consultant, Mazars",Mazars,"London, England, United Kingdom",Risk Consultant,Mazars,Nov 2020,Present,Intern,LGT Vestra LLP,...,Sep 2017,Sep 2017,,,,,,,,
100,Andy Chan,Investment Banking Analyst at Barclays Investm...,Barclays Investment Bank,Hong Kong SAR,Investment Banking Analyst,Barclays Investment Bank,Jul 2021,Present,Summer Analyst,Tree Line Investment Management,...,Nov 2018,Jul 2021,Investment Banking Summer Analyst,Barclays Investment Bank,Jul 2020,Aug 2020,President,LSESU Asia Careers Society,Mar 2019,Mar 2020
101,Jeremy Tai Hong Chang,Graduate Mechanical Engineer at Transport for ...,Transport for London,United Kingdom,Graduate Mechanical Engineer,Transport for London,Sep 2021,Present,Simulation Engineer,Johnson Electric,...,Jul 2018,Aug 2018,,,,,,,,
102,Louis Wright,"Materials Science, Oxford / OxBikes",OxBikes,"London, England, United Kingdom",Founder,OxBikes,Aug 2021,Present,Oxford University Rugby Club Marketing,Oxford University Rugby Football Club,...,Oct 2021,Dec 2021,Business Analyst Intern,White Space Strategy Ltd,Aug 2021,Sep 2021,Professional Services & Consulting internship ...,Bright Network,Jul 2021,Aug 2021


In [77]:
rightDiv

Export data into csv file.

In [339]:
infoDF.to_csv('linkedin-info.csv', index=False)