# [Smart-lab](https://smart-lab.ru/news/) parse

In [5]:
from datetime import date, timedelta
import os
import pickle
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from tqdm import tqdm

In [2]:
# Scraper's options
options = Options()

# Fully load the page to avoid some problems
options.page_load_strategy = 'normal'

# To avoid scraper detection and other problems
options.add_argument("start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")

# Run without a visible browser window. The headless switch is passed exactly
# once, in its explicit "new" form, instead of both `--headless` and
# `--headless=new`.
options.add_argument("--headless=new")

# Some other features
options.add_argument("--disable-notifications")
options.add_argument("--mute-audio")
options.add_argument('--disable-dev-shm-usage')

In [152]:
# First we want to collect the links to all news items
def _collect_links_on_current_page(driver, lst_links):
    """Append the `href` of every news title on the driver's current page to `lst_links`."""
    titles = driver.find_elements(
        By.XPATH, '//div[@class="topic allbloglist"]//h3//div[@class="inside"]//a'
    )
    for title in titles:
        lst_links.append(title.get_attribute('href'))


def parse_links_smart_lab(webdriver_options, lst_dates, lst_links):
    """Collect news links from the smart-lab.ru per-date listing pages.

    For every date the listing page is opened and the links of all news titles
    are appended to `lst_links`. If a second pagination link exists it is
    clicked and the links of that page are appended as well.

    Args:
        webdriver_options: options object passed to `webdriver.Chrome`.
        lst_dates: sequence of date strings interpolated into
            `https://smart-lab.ru/news/date/{date}`.
        lst_links: list that is extended in place with the collected URLs.

    Returns:
        None. Results are accumulated in `lst_links`.
    """
    driver = webdriver.Chrome(options=webdriver_options)

    # try/finally guarantees the browser process is shut down even when a page
    # raises or the run is interrupted.
    try:
        for i, day in enumerate(tqdm(lst_dates)):
            driver.get(f'https://smart-lab.ru/news/date/{day}')
            _collect_links_on_current_page(driver, lst_links)

            # Author's assumption: a date never has more than 2 pages.
            try:
                next_page_btn = driver.find_element(By.XPATH, '//*[@id="pagination"]/a[2]')
            except NoSuchElementException:
                next_page_btn = None  # this date has a single page

            if next_page_btn is not None:
                try:
                    next_page_btn.click()
                    time.sleep(1)  # give the second page time to render
                    _collect_links_on_current_page(driver, lst_links)
                except WebDriverException as exc:
                    # Best effort, as before, but the failure is now reported
                    # instead of being silently swallowed.
                    tqdm.write(f'{day}: second page skipped ({type(exc).__name__})')

            # Periodic progress report: number of links collected so far
            # (every 364 dates, i.e. roughly once per scraped year).
            if i % 364 == 0 and i != 0:
                tqdm.write(str(len(lst_links)))
    finally:
        # quit() ends the session and closes every window, so a separate
        # close() call is not needed.
        driver.quit()

In [6]:
# Inclusive range of dates whose news pages are scraped
START_DATE = date(2019, 1, 1)
END_DATE = date(2023, 12, 31)

# Number of days between the two bounds (the end date itself is not counted here)
dates_span = abs((START_DATE - END_DATE).days)
# `+ 1` so that END_DATE is part of the list; without it the last day was dropped
dates_lst = [(START_DATE + timedelta(days=offset)).strftime('%Y-%m-%d') for offset in range(dates_span + 1)]
lst_links = []

In [1]:
# Collect the news links for every date in the range (`lst_links` is filled in place)
parse_links_smart_lab(webdriver_options=options, lst_dates=dates_lst, lst_links=lst_links)

In [161]:
# Sanity check: the collected links must not contain duplicates
assert len(set(lst_links)) == len(lst_links)

In [2]:
# Persist the links to a file so they do not have to be scraped again
with open(r'smart_lab_links.txt', 'w') as file:
    for link in lst_links:
        file.write("%s\n" % link)

### Output format (columns)
1. id (pandas.DF) - default column

2. website (where the news was retrieved from)

3. section of the website (where the news was retrieved from)

4. url (of the news)

5. header (of the news)

6. body (of the news)

7. date (of the news)

8. tags/key_words (of the news, if there are any)

### This dataset

1. default

2. Smart_Lab (same for each url)

3. "Новости компаний и новости по акциям" (same for each url)

4. url (unique for each url)

5. header (unique for each url)

6. body (unique for each url)

7. date (unique for each url)

8. tags/key_words (unique for each url)

In [6]:
def parse_news_on_list_of_links(webdriver_options, urls_lst,
                                body_lst, header_lst, date_lst,
                                key_words_lst, new_urls_lst):
    """Scrape body, header, date and tags for every news URL.

    Each URL is opened in a Chrome driver. The body text has external links
    removed; pages whose body is empty after that are skipped. Pages on which
    the body, header or date element cannot be found are reported and skipped.
    A page without a tags list yields an empty tag list. For every kept page
    one item is appended to each output list, so the output lists stay aligned
    index by index.

    Args:
        webdriver_options: options object passed to `webdriver.Chrome`.
        urls_lst: iterable of news page URLs.
        body_lst: list extended in place with the cleaned body texts.
        header_lst: list extended in place with the header texts.
        date_lst: list extended in place with the date texts as shown on the page.
        key_words_lst: list extended in place with one list of tags per page.
        new_urls_lst: list extended in place with the URLs that were kept.

    Returns:
        None. Results are accumulated in the passed lists.
    """
    # Compiled once, outside the loop; matches http(s) links embedded in the text.
    external_link_re = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )

    driver = webdriver.Chrome(options=webdriver_options)

    # try/finally guarantees the browser process is shut down even when a page
    # raises or the run is interrupted.
    try:
        for url in tqdm(urls_lst):
            driver.get(url)

            try:
                body = driver.find_element(By.XPATH, '//div[@id="content"]//div//div[@class="content"]').text
                body_without_external_links = external_link_re.sub('', body)
                if body_without_external_links.strip() == '':
                    continue  # nothing left but links/whitespace

                header = driver.find_element(By.XPATH, '//div[@id="content"]//div//h1//span').text
                published = driver.find_element(By.XPATH, '//li[@class="date"]').text
            except NoSuchElementException:
                # One malformed or removed page must not abort the whole run.
                tqdm.write(f'{url}: skipped, required element not found')
                continue

            # Tags are optional: find_elements returns an empty list instead of raising.
            tag_lists = driver.find_elements(By.XPATH, '//ul[@class="tags"]')
            if tag_lists:
                tag_items = tag_lists[0].find_elements(By.XPATH, './/li')
                # The first <li> is skipped, as in the original code; presumably
                # it is a label rather than a tag (verify against the page markup).
                key_words = [item.text.replace(',', '') for item in tag_items[1:]]
            else:
                key_words = []

            # Append to all lists together so they stay aligned.
            body_lst.append(body_without_external_links)
            header_lst.append(header)
            date_lst.append(published)
            key_words_lst.append(key_words)
            new_urls_lst.append(url)
    finally:
        # quit() ends the session and closes every window, so a separate
        # close() call is not needed.
        driver.quit()

In [7]:
# Accumulators that `parse_news_on_list_of_links` fills in place
new_urls_lst = []
body_lst = []
header_lst = []
date_lst = []
key_words_lst = []

# Reading links from the file; blank lines are dropped so that no empty URL
# is ever handed to the browser
with open(r'smart_lab_links.txt', 'r') as file:
    urls_lst = [url for url in file.read().splitlines() if url.strip()]

In [3]:
# Scrape every collected link; results accumulate in the lists passed in
parse_news_on_list_of_links(
    webdriver_options=options,
    urls_lst=urls_lst,
    body_lst=body_lst,
    header_lst=header_lst,
    date_lst=date_lst,
    key_words_lst=key_words_lst,
    new_urls_lst=new_urls_lst,
)

In [4]:
# Compare the number of input links with the size of every result list
print(len(urls_lst), '\n')

for result_lst in (body_lst, header_lst, date_lst, key_words_lst, new_urls_lst):
    print(len(result_lst))

In [10]:
# Constant columns: every scraped row shares the same website and section
website_lst = ['Smart_Lab'] * len(body_lst)
section_lst = ['Новости компаний и новости по акциям'] * len(body_lst)

In [11]:
# Assemble the pandas DataFrame, one column per collected list
df = pd.DataFrame(dict(
    website=website_lst,
    section=section_lst,
    url=new_urls_lst,
    header=header_lst,
    body=body_lst,
    date=date_lst,
    key_words=key_words_lst,
))

In [13]:
# Save the parsed data as a Parquet file; index=False leaves the DataFrame index out of the file
df.to_parquet('smart_labs_parced_data.parquet', index=False)