In [48]:
import csv
import os.path
import random
import re
import time
from pathlib import Path

import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

In [66]:
# Account credentials are read from ../credentials.yml (resolved against the
# current working directory); both values sit under the top-level 'user' key.
creds = yaml.safe_load(Path(r'../credentials.yml').read_text())

USER_LOGIN, USER_PASSWORD = (
    creds['user'][field] for field in ('USER_LOGIN', 'USER_PASSWORD')
)

# Seconds to wait after each scroll step before the page height is re-measured.
SCROLL_PAUSE_TIME = 0.5

In [65]:
def get_time(base: int) -> int:
    '''
    Return base plus a random shift.

    Roughly one call in twenty the shift is base multiplied by a random
    integer from 1 to 10; otherwise the shift is just that random integer.
    '''
    # Draw order matters for reproducibility with a seeded generator:
    # first the 1-in-20 roll, then the 1..10 amount.
    long_pause = random.randint(1, 20) == 1
    amount = random.randint(1, 10)
    return base + (base * amount if long_pause else amount)

In [13]:
def session_init() -> None:
    '''
    Start a Chrome session and store it in the global ``driver``.

    The 'eager' page load strategy is handed to Chrome explicitly through its
    options object, so it is guaranteed to reach the browser instead of being
    set on a capabilities dict that was never passed to webdriver.Chrome().
    '''
    global driver
    options = webdriver.ChromeOptions()
    # NOTE(review): with 'eager', driver.get() is expected to return before
    # sub-resources finish loading — confirm callers wait for the elements they read.
    options.set_capability('pageLoadStrategy', 'eager')
    driver = webdriver.Chrome(options=options)

In [18]:
def log_in() -> None:
    '''
    Log in with USER_LOGIN / USER_PASSWORD unless the session looks signed in.

    The current page is probed for an element with class
    'global-nav__me-photo'; if it is present the function returns right away.
    Otherwise the login page is opened and the form is filled in and
    submitted, with a randomized pause (get_time) between the steps.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the login form
            fields or the submit button cannot be found.
    '''
    try:
        driver.find_element(By.CLASS_NAME, 'global-nav__me-photo')
    except NoSuchElementException:
        # Marker element is absent: fall through to the login form.
        # Only this exception is treated as "not signed in"; anything else
        # (dead session, Ctrl+C) is allowed to propagate.
        pass
    else:
        return None

    driver.get("https://linkedin.com/uas/login")
    time.sleep(get_time(5))
    username = driver.find_element(By.ID, "username")
    username.send_keys(USER_LOGIN)
    time.sleep(get_time(5))
    pword = driver.find_element(By.ID, "password")
    pword.send_keys(USER_PASSWORD)
    time.sleep(get_time(5))
    driver.find_element(By.XPATH, "//button[@type='submit']").click()
    time.sleep(get_time(5))

In [63]:
# Search phrases are every "<title> <prof>" combination from the
# 'link_parsing' section of ../configuration.yml, with spaces encoded as
# '%20' so they can be placed directly into the search URL.
confs = yaml.safe_load(Path(r'../configuration.yml').read_text())
keywords = [
    ' '.join((title, prof)).strip().replace(' ', '%20')
    for title in confs['link_parsing']['titles']
    for prof in confs['link_parsing']['profs']
]

In [7]:
def csv_write(data: list, path: str, header: list=['account_link', 'search_keywords']) -> None:
    '''
    Append one row to a csv file, writing the header first when the file is new.

    The header is written when the file does not exist yet or exists but is
    still empty, so a zero-byte file never ends up without column names.

    Args:
        data: values of the row to append.
        path: location of the csv file (opened in append mode, UTF8).
        header: column names written before the first row.
    '''
    needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0
    # A single append-mode open covers both the header and the data row.
    with open(path, 'a', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(data)

In [67]:
def scroll_page() -> None:
    '''
    Jump to the bottom of the page repeatedly until its height stops growing,
    to make sure the "Next" button is there.
    '''
    measure_height = "return document.body.scrollHeight"
    previous = driver.execute_script(measure_height)
    grew = True
    while grew:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before re-measuring so newly loaded content can change the height.
        time.sleep(SCROLL_PAUSE_TIME)
        current = driver.execute_script(measure_height)
        grew = current != previous
        previous = current

In [70]:
def parse_links(page_num: int=1, path: str=Path('..') / 'data' / 'raw' / 'data_frame.csv', keywords: list=keywords) -> None:
    '''
    Search for keywords, navigate through result pages and save profile links to the path file.

    For every keyword the people-search page is opened and up to page_num
    result pages are visited. Each result link whose href contains
    'linkedin.com/in' is appended to the csv file through csv_write, together
    with the keyword (with '%20' turned back into spaces).

    Args:
        page_num: maximum number of result pages to visit per keyword.
        path: csv file the rows are appended to. The default is built from
            path components so it resolves to ../data/raw/data_frame.csv on
            every platform, not only where backslash is a separator.
        keywords: search phrases with spaces already encoded as '%20'.
    '''
    for keyword in tqdm(keywords, desc='Keywords: '):
        # '&origin=' must separate the keyword from the next parameter;
        # without it '=GLOBAL_SEARCH_HEADER' becomes part of the keywords value.
        driver.get(
            'https://www.linkedin.com/search/results/people/?keywords='
            + keyword
            + '&origin=GLOBAL_SEARCH_HEADER&sid=QDs'
        )
        for _ in tqdm(range(page_num), desc='Pages: '):
            search_result_links = driver.find_elements(By.CSS_SELECTOR, "div.entity-result__item a.app-aware-link")
            for link in search_result_links:
                href = link.get_attribute("href")
                # Anchors without an href yield None; skip them and non-profile links.
                if not href or 'linkedin.com/in' not in href:
                    continue
                # Drop the '?miniProfileUrn...' suffix only when it is present:
                # rfind() returns -1 otherwise, and slicing with -1 would cut
                # the last character off the link.
                cut = href.rfind('?miniProfileUrn')
                profile_url = href if cut == -1 else href[:cut]
                csv_write([profile_url, keyword.replace('%20', ' ')], path)
            scroll_page()
            try:
                next_button = WebDriverWait(driver, timeout=30).until(
                    lambda d: d.find_element(By.CLASS_NAME, 'artdeco-pagination__button--next')
                )
                next_button.click()
            except WebDriverException:
                # No usable "Next" button (wait timed out or the click failed):
                # stop paging for this keyword. Other errors propagate.
                break
            time.sleep(get_time(5))

In [46]:
# Start the browser first: log_in() works on the global driver that session_init() creates.
session_init()
log_in()

In [71]:
# Collect profile links for every keyword, visiting one result page per keyword.
parse_links(page_num=1)

Keywords:   0%|                                                                                 | 0/36 [00:00<?, ?it/s]
Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.17s/it][A
Keywords:   3%|██                                                                       | 1/36 [00:17<09:59, 17.13s/it]
Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.13s/it][A
Keywords:   6%|████                                                                     | 2/36 [00:29<08:01, 14.17s/it]
Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████

Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.23s/it][A
Keywords:  64%|██████████████████████████████████████████████                          | 23/36 [05:54<03:09, 14.54s/it]
Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.60s/it][A
Keywords:  67%|████████████████████████████████████████████████                        | 24/36 [06:08<02:55, 14.61s/it]
Pages:   0%|                                                                                     | 0/1 [00:00<?, ?it/s][A
Pages: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.16s/it][A
Keywords:  69%|███████

In [72]:
# Close the Chrome browser and end the WebDriver session.
driver.quit()

In [73]:
# NOTE(review): looks like a leftover scratch cell — it pauses a full run until
# something is typed and the value is not used; confirm it can be removed.
input()

asd


'asd'