# LinkedIn Scraper

In [7]:
from linkedin_scraper import Person, Job, JobSearch, actions
from selenium import webdriver
import time
import json
from selenium.webdriver.common.by import By
from dotenv import load_dotenv
import os
from pprint import pprint

In [2]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongouri"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [3]:
driver = webdriver.Chrome()

load_dotenv()
email = os.getenv("LINKEDIN_EMAIL")
password = os.getenv("LINKEDIN_PASSWORD")
actions.login(driver, email, password) 
person = Person("https://www.linkedin.com/in/andrewagirgis/", driver=driver, scrape=False)

In [None]:
# from selenium.webdriver.common.by import By
# from selenium.common.exceptions import NoSuchElementException

# def extract_experience_items(driver):
#     experience_data = []
#     try:
#         experience_section = driver.find_element(By.ID, "experience")
#         container = experience_section.find_element(By.XPATH, "../../..")
#         items = container.find_elements(By.CSS_SELECTOR, "li.artdeco-list__item")

#         for item in items:
#             try:
#                 title_elem = item.find_element(By.CSS_SELECTOR, "div.t-bold span[aria-hidden='true']")
#                 title = title_elem.text.strip()
#             except NoSuchElementException:
#                 title = None

#             try:
#                 # Get only the first company-like span that isn't duration or location
#                 spans = item.find_elements(By.CSS_SELECTOR, "span.t-14.t-normal span[aria-hidden='true']")
#                 company = spans[0].text.strip() if spans else None
#             except NoSuchElementException:
#                 company = None

#             try:
#                 dates_elem = item.find_element(By.XPATH, ".//span[contains(@class, 't-black--light')][1]/span[@aria-hidden='true']")
#                 dates = dates_elem.text.strip()
#             except NoSuchElementException:
#                 dates = None

#             try:
#                 location_elem = item.find_element(By.XPATH, ".//span[contains(@class, 't-black--light')][2]/span[@aria-hidden='true']")
#                 location = location_elem.text.strip()
#             except NoSuchElementException:
#                 location = None

#             try:
#                 description_elem = item.find_element(By.CSS_SELECTOR, "div.inline-show-more-text--is-collapsed span[aria-hidden='true']")
#                 description = description_elem.text.strip().replace("\n", " ")
#             except NoSuchElementException:
#                 description = None

#             experience_data.append({
#                 "Title": title,
#                 "Company": company,
#                 "Date Range": dates,
#                 "Location": location,
#                 "Description": description
#             })

#     except Exception as e:
#         print(f"Error extracting experience section: {e}")

#     return experience_data


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
from dotenv import load_dotenv
from linkedin_scraper import actions

def scrape_linkedin_profile(driver, profile_url):
    driver.get(profile_url)
    time.sleep(5)

    profile_data = {}

    # Name
    try:
        name_elem = driver.find_element(By.CSS_SELECTOR, "main h1.t-24")
        profile_data['Name'] = name_elem.text
    except:
        profile_data['Name'] = None

    # Headline
    try:
        headline_elem = driver.find_element(By.CSS_SELECTOR, "div.text-body-medium.break-words")
        profile_data['Headline'] = headline_elem.text
    except:
        profile_data['Headline'] = None

    # Location
    try:
        location_elem = driver.find_element(By.CSS_SELECTOR, "main span.text-body-small")
        profile_data['Location'] = location_elem.text
    except:
        profile_data['Location'] = None

    # About Section
    try:
        about_section = driver.find_element(By.ID, "about")
        about_text = about_section.find_element(By.XPATH, "..").text.split("\n")
        about_text = [line.strip() for line in about_text if line.strip().lower() != "about"]
        about_text = "\n".join(dict.fromkeys(about_text))  # remove duplicates, preserve order
        profile_data['About'] = about_text
    except:
        profile_data['About'] = None

    # Scroll down to load experience and education
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)

    # Experience
    try:
        experience_section = driver.find_element(By.ID, "experience")
        full_text = experience_section.find_element(By.XPATH, "..").text

        # Clean repeated headers like "Experience", "Top skills", etc.
        lines = full_text.split("\n")
        cleaned_lines = []
        seen = set()

        for line in lines:
            if line.strip() and line not in seen and not line.lower() in {"experience", "top skills", "show all experiences"}:
                cleaned_lines.append(line.strip())
                seen.add(line)

        profile_data['Experience'] = cleaned_lines
    except Exception as e:
        print(f"⚠️ Failed to scrape experience: {e}")
        profile_data['Experience'] = None

    # Education
    try:
        education_section = driver.find_element(By.ID, "education")
        raw_text = education_section.find_element(By.XPATH, "..").text.split("\n")

        cleaned_education = []
        seen = set()
        for line in raw_text:
            line = line.strip()
            if line and line.lower() not in {"education", "show all 3 educations"} and line not in seen:
                cleaned_education.append(line)
                seen.add(line)

        profile_data['Education'] = cleaned_education
    except Exception as e:
        print(f"⚠️ Failed to scrape education: {e}")
        profile_data['Education'] = None

    return profile_data

# ---- Run the Scraper ----

profile_url = "https://www.linkedin.com/in/andrewagirgis/"
profile_info = scrape_linkedin_profile(driver, profile_url)

# Nicely formatted output
for key, value in profile_info.items():
    print(f"{key}:\n")
    if isinstance(value, list):
        for item in value:
            print(f"- {item}")
    else:
        print(value)
    print("-" * 50)


Name:

Andrew Girgis
--------------------------------------------------
Headline:

Master's student in Economics and Computational Data Analytics | Aspiring Data Scientist and Entrepreneur
--------------------------------------------------
Location:

(He/Him)
--------------------------------------------------
About:

An aspiring data scientist with a strong foundation in econometrics, data visualization, and machine learning.

I'm currently pursuing a Master of Arts in Economics and a Graduate Diploma in Computational Data Analytics at the University of Waterloo, where I’m deepening my expertise in leveraging data to solve complex problems and present the data in a meaningful way. With a Bachelor of Arts Honours in Economics and a minor in Applied Computing from Brock University, I've built a robust analytical and technical skill set. My proficiency in Python, R, SQL, and key libraries like pandas, NumPy, and TensorFlow allows me to translate data into actionable insights and impactful

In [5]:
input("Press Enter")
job = Job("https://www.linkedin.com/jobs/view/4209878123/", driver=driver, close_on_complete=False)

In [6]:
print(type(job))

<class 'linkedin_scraper.jobs.Job'>


In [None]:
for attr in dir(job):
    if not attr.startswith("_"):
        print(f"{attr}: {getattr(job, attr)}")



In [8]:
pprint(dir(job))           # list every attribute / method
print(getattr(job, "__dict__", {}))  # show instance vars


['TOP_CARD',
 'WAIT_FOR_ELEMENT_TIMEOUT',
 '__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__find_element_by_class_name__',
 '__find_element_by_xpath__',
 '__find_enabled_element_by_xpath__',
 '__find_first_available_element__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__replace__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'applicant_count',
 'benefits',
 'company',
 'company_linkedin_url',
 'driver',
 'focus',
 'is_signed_in',
 'job_description',
 'job_title',
 'linkedin_url',
 'location',
 'mouse_click',
 'posted_date',
 'scrape',
 'scrape_logged_in',
 'scroll_class_name_element_to_page_percent'

In [10]:
doc = job.to_dict()     # <-- one call does the flattening
print(doc)

{'linkedin_url': 'https://www.linkedin.com/jobs/view/4209878123/', 'job_title': 'Economic Analyst', 'company': 'TD', 'company_linkedin_url': 'https://www.linkedin.com/company/td/life', 'location': 'Toronto, ON · 1 week ago · Over 100 people clicked apply', 'posted_date': '1 week ago', 'applicant_count': 0, 'job_description': "About the job\nWork Location:\n\nToronto, Ontario, Canada\n\nHours:\n\n37.5\n\nLine Of Business:\n\nFinance\n\nPay Details:\n\n$65,600 - $98,400 CAD\n\nThis role is eligible for a discretionary variable compensation award that considers business and individual performance.\n\nTD is committed to providing fair and equitable compensation opportunities to all colleagues. Growth opportunities and skill development are defining features of the colleague experience at TD. Our compensation policies and practices have been designed to allow colleagues to progress through the salary range over time as they progress in their role. The base pay actually offered may vary base

In [None]:
search_query = "Machine Learning Engineer"
location = "Canada"
driver.get(f"https://www.linkedin.com/jobs/search/?keywords={search_query}&location={location}")

time.sleep(5)  # Let the page load

# Then extract job card links manually
jobs = driver.find_elements(By.CLASS_NAME, "job-card-container__link")
job_urls = [job.get_attribute("href") for job in jobs]
job_urls

In [None]:
search_query = "data scientist"
location = "Canada"
num_pages = 5  # how many pages you want to scrape
job_urls = []

for page in range(num_pages):
    start = page * 25  # LinkedIn paginates by 25
    url = f"https://www.linkedin.com/jobs/search/?keywords={search_query}&location={location}&start={start}"
    driver.get(url)
    time.sleep(5)  # wait for page to load

    jobs = driver.find_elements(By.CLASS_NAME, "job-card-container__link")
    urls = [job.get_attribute("href") for job in jobs if job.get_attribute("href")]
    job_urls.extend(urls)

    print(f"Page {page+1}: Collected {len(urls)} job URLs")

print(f"\nTotal collected: {len(job_urls)} job listings")