In [2]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
import selenium
from selenium.webdriver import Firefox, Chrome, Remote
from selenium.webdriver.common.proxy import *
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from time import sleep

In [83]:
def find_abstract(name, page):
    '''
    Return the abstract of an article, falling back to the publisher's site.

    The text of the first ``<p>`` element of ``page`` is returned when it is
    non-empty. Otherwise the first link whose ``href`` contains
    ``https://doi`` is opened in the notebook-level ``browser`` and the loaded
    page is handed to ``get_abstract``.

    Parameters
    ----------
    name : str
        Journal key passed through to ``get_abstract``.
    page : BeautifulSoup
        Parsed article page.

    Returns
    -------
    str
        The abstract, or ``''`` when neither page yields one.
    '''
    key = 'https://doi'

    # ``find`` returns None when the page has no <p> at all; treat that like an
    # empty abstract instead of failing with AttributeError.
    first_paragraph = page.find('p')
    abstract = first_paragraph.text if first_paragraph is not None else ''
    if abstract:
        return abstract

    # Look for the first DOI link; anchors without an href are skipped.
    adress_r = None
    for link in page.findAll('a'):
        href = link.get('href') or ''
        if key in href:
            adress_r = href
            break
    if adress_r is None:
        # Nothing to follow, so there is no second source to try.
        return ''

    try:
        # NOTE: uses the global ``browser`` created in a later notebook cell.
        browser.get(adress_r)
        ppp = BeautifulSoup(browser.page_source, "html.parser")
        return get_abstract(name, ppp)
    except Exception:
        # Best effort: navigation errors or an unknown journal key yield an
        # empty abstract, while KeyboardInterrupt is still allowed to propagate.
        return ''
        
def get_abstract(name, page):
    '''
    Extract the abstract from a publisher page for the journal ``name``.

    Parameters
    ----------
    name : str
        One of the keys of the dispatch table below (e.g. ``'JETP'``).
    page : BeautifulSoup
        Parsed publisher page.

    Returns
    -------
    str
        The abstract text, or an empty string when the expected element is
        missing from the page.

    Raises
    ------
    KeyError
        If ``name`` is not a supported journal key.
    '''
    def nature(page):
        # Takes the text of the sixth <p> element.
        try:
            return page.findAll('p')[5].text
        except Exception:
            return ''

    def jetp(page):
        # Takes the text of the fourth <p> element.
        try:
            return page.findAll('p')[3].text
        except Exception:
            return ""

    def physrev(page):
        try:
            # Use the page that was passed in. The previous version read the
            # notebook global ``p`` here, which ignored the argument.
            mass_txt = page.find('p').text.split('&lt;math')
            # Drop everything up to the closing 'math&gt;' marker of each
            # chunk, keeping only the plain text around the formulas.
            return ''.join(chunk.split('math&gt;')[-1] for chunk in mass_txt)
        except Exception:
            return ""

    func = {
        'NATURE_PHYSICS' : nature,
        'JETP' : jetp,
        'JETP_LETTERS' : jetp,
        'PHYSICAL_REVIEW_B': physrev,
        'PHYSICAL_REVIEW_LETTERS': physrev,
        'PHYSICAL_REVIEW_C': physrev
    }
    return func[name](page)

def year(page):
    '''
    Return the publication year found on an article page.

    Scans ``<font color="#00008f">`` elements and returns the text of the first
    one that starts with ``'201'``.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed article page.

    Returns
    -------
    str
        The matching element text, or the placeholder ``'1000'`` when no
        element matches or the lookup fails.
    '''
    try:
        for font in page.findAll('font', attrs={'color': ['#00008f']}):
            # NOTE(review): only years 2010-2019 match this prefix; confirm
            # whether other decades should be accepted as well.
            if font.text[:3] == '201':
                return font.text
    except Exception:
        pass
    # Previously the function fell off the end and returned None when nothing
    # matched, contradicting the documented '1000' placeholder.
    return '1000'
        
def get_articles_adress(page):
    '''
    Return the absolute URLs of all articles linked from a listing page.

    Every ``<a>`` whose ``href`` contains ``/item.asp?id=`` is kept, in
    document order, and prefixed with ``https://elibrary.ru``.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed listing page.

    Returns
    -------
    list of str
        Article URLs; empty when the page has no matching links.
    '''
    id_str = '/item.asp?id='
    articles_adress = []
    for link in page.findAll('a'):
        # Anchors without an href are skipped explicitly instead of relying on
        # a bare ``except`` (which also swallowed KeyboardInterrupt).
        href = link.get('href')
        if href and id_str in href:
            articles_adress.append('https://elibrary.ru' + href)
    return articles_adress

def get_authors_title(page):
    '''
    Collect the author and title strings stored in ``<input>`` values.

    An input is kept when its ``value`` is non-empty and its first character is
    neither a digit nor a lowercase letter. The caller unpacks the result as
    ``authors, title``.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed article page.

    Returns
    -------
    list of str
        The kept values in document order. When fewer than two values are
        found, ``'Incognito'`` is inserted at the front as an author
        placeholder (so an empty page yields ``['Incognito']``).
    '''
    mas = []
    for field in page.findAll('input'):
        # Missing or empty values are skipped explicitly; the previous bare
        # ``except`` also hid KeyboardInterrupt.
        value = field.get('value')
        if not value:
            continue
        first = value[0]
        if not first.isdigit() and first == first.upper():
            mas.append(value)
    if len(mas) < 2:
        mas.insert(0, 'Incognito')
    return mas

def get_quot(page, adress):
    '''
    Return the citation count shown on an article page.

    The count is the text of the first link whose ``href`` is
    ``cit_items.asp?id=<id>``, where ``<id>`` is the part of ``adress`` after
    its last ``=``.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed article page.
    adress : str
        URL of the article.

    Returns
    -------
    str
        The link text, or ``'0'`` when the page has no such link.
    '''
    try:
        article_id = adress.split('=')[-1]
        links = page.findAll('a', attrs={'href': ['cit_items.asp?id=' + article_id]})
        return links[0].text
    except (AttributeError, IndexError):
        # No citation link (or no usable address): report zero citations.
        # Only the expected failures are caught so interrupts still propagate.
        return '0'

def find_data(browser, name, adress):
    '''
    Scrape the article page currently loaded in ``browser``.

    Parameters
    ----------
    browser : selenium WebDriver
        Driver whose current page is the article to scrape.
    name : str
        Journal key, forwarded to ``find_abstract``.
    adress : str
        URL of the article, used to locate its citation link.

    Returns
    -------
    list
        ``[year, authors, title, citation count, abstract]``.

    Notes
    -----
    Sleeps for random intervals around the scraping and calls
    ``browser.back()`` once before returning.
    '''
    soup = BeautifulSoup(browser.page_source, "html.parser")
    sleep(np.random.rand() * 2)
    data = [year(soup)]
    # Authors and title. ``get_authors_title`` can return fewer or more than
    # two values, which used to abort the scrape with a ValueError on
    # unpacking; pad with '' and keep the first two instead.
    # NOTE(review): with more than two values the first two are assumed to be
    # authors and title - confirm against real pages.
    fields = list(get_authors_title(soup))
    authors, title = (fields + ['', ''])[:2]
    data.append(authors)
    data.append(title)
    # Citation count
    data.append(get_quot(soup, adress))
    # Abstract
    data.append(find_abstract(name, soup))
    sleep(1 + 3 * np.random.random())
    browser.back()
    return data

def run_to_actual_page(page_num, browser, xp_main, delay=7):
    '''
    Click the element at ``xp_main`` ``page_num`` times, pausing before each click.

    Parameters
    ----------
    page_num : int
        Number of clicks to perform; ``0`` leaves the browser untouched.
    browser : selenium WebDriver
        Driver showing the page that contains the element.
    xp_main : str
        XPath of the element to click.
    delay : float, optional
        Seconds to wait before every click (default 7, the previous
        hard-coded value).
    '''
    for _ in range(page_num):
        sleep(delay)
        # The generic ``find_element`` exists in Selenium 3 and 4, whereas
        # ``find_element_by_xpath`` was removed in Selenium 4.3.
        browser.find_element('xpath', xp_main).click()

def data_frame(name):
    '''
    Load previously scraped articles from the ``@``-separated file ``name``.

    Parameters
    ----------
    name : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The stored articles, or an empty frame with the standard columns when
        the file does not exist yet or is empty.

    Notes
    -----
    Other read errors (for example a malformed file) are raised instead of
    being turned into an empty frame, so that a later save cannot silently
    overwrite existing data.
    '''
    columns = ['Year', 'Authors', 'Title', 'Qoutes', 'Abstract', 'Link']
    try:
        frame = pd.read_csv(name, sep="@")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(columns=columns)
    # Files saved with the row index come back with 'Unnamed: N' columns;
    # drop them so they do not pile up on every save/load cycle.
    unnamed = [col for col in frame.columns if str(col).startswith('Unnamed:')]
    return frame.drop(columns=unnamed)
    
def parse_journal(name, browser, page_num):
    '''
    Scrape the article listing of one journal and save it to the file ``name``.

    The browser must already show the journal's listing page. The function
    first clicks the pager link ``page_num`` times, then for every following
    listing page visits a random sample of up to 15 articles, appends their
    data to the stored table and rewrites the ``@``-separated file after each
    article.

    Parameters
    ----------
    name : str
        Journal key; also used as the output file path.
    browser : selenium WebDriver
        Driver showing the journal listing.
    page_num : int
        Number of pager clicks to perform before scraping, and the start of
        the page counter.
    '''
    print(name)
    print()
    columns = ['Year', 'Authors', 'Title', 'Qoutes', 'Abstract', 'Link']
    articles = data_frame(name)
    xp_main = '//*[@id="pages"]/table/tbody/tr/td[13]/a'
    run_to_actual_page(page_num, browser, xp_main)
    pp = BeautifulSoup(browser.page_source, "html.parser")
    # The red <font> element is read as a total count; 20 per listing page.
    runs = int(pp.find('font',attrs={'color':['#ff0000']}).text) // 20
    print('runs:', runs)
    sleep(5)
    for loop in range(page_num, runs):
        page = BeautifulSoup(browser.page_source, "html.parser")
        new_id = get_articles_adress(page)
        if loop % 2:
            sleep(5 + 10 * np.random.random())
        # Sampling without replacement fails when more items are requested
        # than exist, so cap the sample at the number of links on the page.
        sample_size = min(15, len(new_id))
        if sample_size:
            picks = np.random.choice(len(new_id), sample_size, replace=False)
        else:
            picks = []
        for i in picks:
            if i % 3 == 0:
                sleep(3 * np.random.random() + 1.5 * np.random.random())
            browser.get(new_id[i])
            d = find_data(browser, name, new_id[i])
            d.append(new_id[i])
            print("___{0}___{1}___".format(loop, i), end='\r')
            sleep(2 + 5 * np.random.random())
            # NOTE(review): find_data already calls browser.back() once, so
            # this is a second step back in history - confirm it is intended.
            browser.back()
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on old and new versions.
            articles = pd.concat(
                [articles, pd.DataFrame([d], columns=columns)],
                ignore_index=True,
            )
            # Save after every article so an interrupted run keeps its progress.
            articles.to_csv(name, sep="@")
        # Go to the next listing page. find_element works on Selenium 3 and 4;
        # find_element_by_xpath was removed in Selenium 4.3.
        browser.find_element('xpath', xp_main).click()
        sleep(5)
        

In [84]:
# Start a Chrome WebDriver session using the chromedriver binary in the working
# directory. This notebook-level ``browser`` is also read as a global by
# find_abstract.
browser = Chrome(executable_path="./chromedriver")
# Journal key -> eLIBRARY listing URL. The insertion order matters: a later
# cell picks a journal by its position in ``list(journals.items())``.
journals = {
    'NATURE_PHYSICS': 'https://elibrary.ru/title_items.asp?id=25368',
    'JETP': 'https://elibrary.ru/title_items.asp?id=7467',
    'JETP_LETTERS': 'https://elibrary.ru/title_items.asp?id=7468',
    'PHYSICAL_REVIEW_B': 'https://elibrary.ru/title_items.asp?id=21814',
    'PHYSICAL_REVIEW_C': 'https://elibrary.ru/title_items.asp?id=21815',
    'PHYSICAL_REVIEW_LETTERS': 'https://elibrary.ru/title_items.asp?id=21820',
}

    

In [90]:
# Turn the mapping into a list of (journal key, listing URL) pairs so that a
# journal can be selected by position.
j = list(journals.items())
# Show the fourth pair (PHYSICAL_REVIEW_B in the recorded output).
j[3]

('PHYSICAL_REVIEW_B', 'https://elibrary.ru/title_items.asp?id=21814')

In [91]:
# Open the listing page of the selected journal, then scrape it. The last
# argument is page_num: parse_journal advances the pager that many times
# before it starts collecting articles.
browser.get(j[3][1])
# Optional manual pause before scraping (disabled):
#r = input()
parse_journal(j[3][0], browser, 1)

PHYSICAL_REVIEW_B

runs: 3094
___1___0____

KeyboardInterrupt: 

In [79]:
# Scratch cell for experimenting with get_abstract on a single publisher page.
# NOTE: its execution count (In [79]) is lower than that of the cell defining
# the functions (In [83]), so the notebook was not run top to bottom.
# Rebinds the notebook-level ``browser`` to a new Chrome session.
browser = Chrome(executable_path="./chromedriver")
browser.get('https://journals.aps.org/prc/abstract/10.1103/PhysRevC.92.064002')
# Parsed page kept in the global ``p`` for manual calls such as the one below.
p = BeautifulSoup(browser.page_source, 'html.parser')
# get_abstract('JETP_LETTERS', p)