In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions

from selenium.common.exceptions import NoSuchElementException , ElementNotInteractableException ,StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from newspaper import Article
import os

import time
import re
import csv
from tqdm.auto import tqdm
import datetime

In [2]:
# Location of the ChromeDriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override it without editing the notebook; the default is the
# path that was previously hard-coded here.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    'M:/chromedriver-win64/chromedriver-win64/chromedriver.exe',
)

# `driver` is the single browser session used by every scraper class in the notebook.
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)

In [4]:
# CSV file where the scraped records are saved; opening it in 'w' mode
# truncates any previous content and leaves only the header row.
file = 'papers.csv'
# newline='' hands line-ending control to the csv module; without it, Windows
# text mode turns the writer's '\r\n' into '\r\r\n' and readers see blank rows.
with open(file, 'w', encoding='utf-8', newline='') as f:
    header = ['Title', 'Author', 'Abstract', 'Submission_date', 'URL']
    writer = csv.writer(f, delimiter=';')
    writer.writerow(header)

# Scraping arXiv

In [184]:
class ArxivScrapper():
    """Scrape arXiv search results for a keyword and append them to a CSV file.

    All browser interaction goes through the module-level Selenium ``driver``.
    """

    def __init__(self, keyword):
        """Store the search keyword and the scraper settings.

        Args:
            keyword: Text typed into the arXiv search bar by ``make_search``.
        """
        self.research_key = keyword
        # Lower-case full month names; the position (+1) gives the month number.
        self.month_list = ['january', 'february', 'march', 'april',
                           'may', 'june', 'july', 'august', 'september',
                           'october', 'november', 'december']

        self.url = 'https://arxiv.org/'
        self.file = 'papers.csv'

    def make_search(self):
        """Open ``self.url`` and submit the keyword through the 'query' input."""
        driver.get(self.url)
        search_bar = driver.find_element(By.NAME, 'query')
        search_bar.send_keys(self.research_key)
        search_bar.send_keys(Keys.ENTER)

    def scrape_an_offer(self, paper):
        """Extract one result card.

        Args:
            paper: WebElement of a single search result.

        Returns:
            ``[title, authors, abstract, submission_date, url]`` where
            ``submission_date`` is a ``datetime.date``.
        """
        title = paper.find_element(By.CLASS_NAME, 'title').text
        authors = paper.find_element(By.CLASS_NAME, 'authors').text

        # Best effort: click the first 'is-size-7' element of the card -
        # presumably the link that expands the abstract (TODO confirm against
        # the page markup). A failed click is deliberately ignored and the
        # abstract is read in whatever state it is.
        try:
            paper.find_element(By.CLASS_NAME, 'is-size-7').click()
        except Exception:
            pass

        abstract = paper.find_element(By.CLASS_NAME, 'abstract-full').text

        # Only the part before the first ';' is passed to clean_date.
        raw_date = paper.find_element(By.CSS_SELECTOR, 'p.is-size-7').text.split(';')[0]
        submission_date = self.clean_date(raw_date)

        url_to_paper = paper.find_element(By.CSS_SELECTOR, 'p.list-title a').get_attribute('href')
        return [title, authors, abstract, submission_date, url_to_paper]

    def scrape_all_offers(self, max_pages=12, page_delay=5):
        """Run the search and save every result of the first result pages.

        Args:
            max_pages: Maximum number of result pages to scrape. The default
                of 12 matches the previously hard-coded limit.
            page_delay: Seconds to wait after clicking "next" so the new page
                can load.

        Stops early when no 'pagination-next' element is found.
        """
        self.make_search()

        for page_number in range(max_pages):
            if page_number > 0:
                try:
                    driver.find_element(By.CLASS_NAME, 'pagination-next').click()
                except NoSuchElementException:
                    break
                time.sleep(page_delay)

            paper_cards = driver.find_elements(By.CLASS_NAME, 'arxiv-result')
            for paper in tqdm(paper_cards):
                self.save_offer(self.scrape_an_offer(paper))

    def clean_date(self, date, source='arxiv'):
        """Convert text such as ``'Submitted 12 March, 2024'`` to a date.

        Args:
            date: Text made of day, full month name and year, optionally
                prefixed with ``'Submitted '`` and containing commas.
            source: Unused; kept for backward compatibility.

        Returns:
            The corresponding ``datetime.date``.

        Raises:
            ValueError: If the text does not split into exactly three tokens,
                the month name is unknown, or day/year are not integers.
        """
        # Raw string: the former '\,' was an invalid escape sequence.
        date = re.sub(r'Submitted |,', '', date)
        day, month, year = date.split()

        month_idx = self.month_list.index(month.lower()) + 1
        return datetime.date(int(year), month_idx, int(day))

    def save_offer(self, infos):
        """Append one record to ``self.file`` as a ';'-delimited CSV row.

        Args:
            infos: Sequence of field values for the row.
        """
        # newline='' keeps Windows from turning the csv '\r\n' into '\r\r\n'.
        with open(self.file, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(infos)
    

In [185]:
# Search arXiv for 'computer vision' and append each scraped result to the
# scraper's CSV file (requires the `driver` session created earlier).
scraper = ArxivScrapper('computer vision')
scraper.scrape_all_offers()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

# Scraping Medium

In [168]:
class MediumScrapper():
    """Scrape the news, guide and paper lists of stories listed at ``self.url``.

    All browser interaction goes through the module-level Selenium ``driver``.
    Each list is appended to its own ';'-delimited CSV file.
    """

    def __init__(self):
        """Set the month lookup table, the start URL and the output files."""
        # Abbreviated month names; the position (+1) gives the month number.
        self.month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        self.url = 'https://medium.com/nlplanet'
        self.paper_file = 'papers.csv'
        self.guide_file = 'guides.csv'
        self.news_file = 'news.csv'
        # Every saved record is attributed to this fixed author.
        self.author = 'Fabio Chiusano'

    def make_search(self):
        """Open the listing page."""
        driver.get(self.url)

    def _read_publish_date(self):
        """Return the publish date shown on the currently open page as a date."""
        try:
            raw_date = driver.find_element(
                By.XPATH, '//span[@data-testid = "storyPublishDate"]').text
        except NoSuchElementException:
            # Fallback selector: the last line of this element's text is used.
            raw_date = driver.find_element(
                By.CSS_SELECTOR, 'span.be div.ab.ae').text.split('\n')[-1]
        return self.clean_date_medium(raw_date)

    def _scrape_item(self, item):
        """Build ``[title, author, details, date, url]`` for one list entry."""
        link = item.find_element(By.CSS_SELECTOR, 'a.af')
        title = link.text
        details = item.text
        url_to_source = link.get_attribute('href')
        return [title, self.author, details, self._read_publish_date(), url_to_source]

    def scrape_one_news(self, new):
        """Return ``[title, author, details, date, url]`` for a news entry."""
        return self._scrape_item(new)

    def scrape_one_guide(self, guide):
        """Return ``[title, author, details, date, url]`` for a guide entry.

        The date is now normalised with ``clean_date_medium`` like the news
        entries; previously the raw page text was stored.
        """
        return self._scrape_item(guide)

    def scrape_one_article(self, paper):
        """Return ``[title, author, details, date, url]`` for a paper entry.

        The date is now normalised with ``clean_date_medium`` like the news
        entries; previously the raw page text was stored.
        """
        return self._scrape_item(paper)

    def scrape_all_offers(self, n_articles=5):
        """Open the first stories of the listing and save their three lists.

        Args:
            n_articles: Number of ``article`` elements of the listing to
                visit, by position. Defaults to the previous fixed value 5.

        For each story the ``ul`` elements of the page are expected to be,
        in order, the news, guides and papers lists; pages that do not
        contain exactly three are reported and skipped.
        """
        self.make_search()
        listing_url = driver.current_url
        ignored = (StaleElementReferenceException,)

        for article_number in tqdm(range(n_articles)):
            articles = WebDriverWait(driver, 10, ignored_exceptions=ignored).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, 'article')))
            if article_number >= len(articles):
                # Fewer stories on the listing than requested: nothing left to visit.
                break
            articles[article_number].click()

            # Explicit check instead of `assert`, which is removed under `python -O`.
            if driver.current_url == listing_url:
                # The click did not navigate, so there is no page to go back from.
                print(f'Broken for page {article_number}')
                continue

            sections = WebDriverWait(driver, 10, ignored_exceptions=ignored).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, 'ul')))
            if len(sections) != 3:
                print(f'Unexpected layout for page {article_number}: '
                      f'found {len(sections)} lists instead of 3')
                driver.back()
                continue
            news, guides, papers = sections

            # Scrape each list with its own extractor into its own file.
            for section, scrape_one, target_file in (
                (news, self.scrape_one_news, self.news_file),
                (guides, self.scrape_one_guide, self.guide_file),
                (papers, self.scrape_one_article, self.paper_file),
            ):
                for item in section.find_elements(By.TAG_NAME, 'li'):
                    self.save_offer(target_file, scrape_one(item))

            driver.back()

    def clean_date_medium(self, date):
        """Convert a date label to a ``datetime.date``.

        Supported forms:
            * text containing ``'hour'``: today;
            * ``'<N> ... ago'``: today minus N days (the leading number is
              taken as a day count, as before);
            * ``'<Mon> <day>'`` or ``'<Mon> <day>, <year>'``: that date, using
              the current year when no numeric year follows the day.

        Args:
            date: Label text read from the page.

        Returns:
            The corresponding ``datetime.date``.

        Raises:
            ValueError: If the month abbreviation is unknown or a number
                cannot be parsed.
            IndexError: If an absolute date has fewer than two tokens.
        """
        today = datetime.date.today()

        if 'hour' in date:
            return today

        if 'ago' in date:
            # timedelta handles month/year boundaries; `today.day - N` did not.
            return today - datetime.timedelta(days=int(date.split(' ')[0]))

        tokens = date.replace(',', ' ').split()
        month = self.month_list.index(tokens[0]) + 1
        day = int(tokens[1])
        has_year = len(tokens) > 2 and tokens[2].isdigit()
        year = int(tokens[2]) if has_year else today.year
        return datetime.date(year, month, day)

    def save_offer(self, file, infos):
        """Append one record to ``file`` as a ';'-delimited CSV row.

        Args:
            file: Path of the CSV file to append to.
            infos: Sequence of field values for the row.
        """
        # newline='' keeps Windows from turning the csv '\r\n' into '\r\r\n'.
        with open(file, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(infos)
    

In [186]:
# Visit the first stories of the Medium listing and append their news, guide
# and paper entries to the corresponding CSV files (requires `driver`).
scraper = MediumScrapper()
scraper.scrape_all_offers()

  0%|          | 0/5 [00:00<?, ?it/s]

Broken for page 1


# Scraping Papers with Code

In [116]:
class PWCScrapper():
    """Scrape the papers linked from ``self.url`` and append them to a CSV file.

    All browser interaction goes through the module-level Selenium ``driver``.
    """

    def __init__(self):
        """Set the month lookup table, the start URL and the output file."""
        # Lower-case abbreviated month names; the position (+1) gives the month number.
        self.month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

        self.url = 'https://paperswithcode.com/'
        self.file = 'papers.csv'

    def make_search(self):
        """Open the start page."""
        driver.get(self.url)

    def scrape_an_offer(self, paper):
        """Extract the paper shown on the page currently open in ``driver``.

        Args:
            paper: Unused; every value is read from the current page. Kept
                for backward compatibility.

        Returns:
            ``[title, authors, abstract, publish_date, url]`` where
            ``publish_date`` is a ``datetime.date``.
        """
        paper_title = driver.find_element(By.TAG_NAME, 'h1').text
        authors_span = driver.find_elements(By.CLASS_NAME, 'author-span')

        # The first 'author-span' is read as the date, the next three as authors.
        # With no span at all, the empty text sends clean_date to its fallback
        # date instead of raising IndexError.
        raw_date = authors_span[0].text if authors_span else ''
        publish_date = self.clean_date(raw_date)

        authors = ' ,'.join(author.text for author in authors_span[1:4])
        abstract = driver.find_element(By.CSS_SELECTOR, 'div.paper-abstract p').text
        url_to_paper = driver.find_element(
            By.CSS_SELECTOR, 'div.paper-abstract a').get_attribute('href')

        return [paper_title, authors, abstract, publish_date, url_to_paper]

    def scrape_all_offers(self):
        """Visit every paper linked from the start page and save its details."""
        self.make_search()
        links = driver.find_elements(By.CSS_SELECTOR, 'div.item-content h1 a')

        # Read the URLs before leaving the page: element references collected
        # here can become stale once the browser navigates, so clicking them
        # one after another (with back() in between) is not reliable.
        paper_urls = [link.get_attribute('href') for link in links]

        for paper_url in tqdm(paper_urls):
            if not paper_url:
                continue
            driver.get(paper_url)
            self.save_offer(self.scrape_an_offer(paper_url))
            # Pause between page loads.
            time.sleep(2)

    def clean_date(self, date):
        """Convert text such as ``'12 Mar 2024'`` to a ``datetime.date``.

        Args:
            date: Text made of day, abbreviated month name and year,
                optionally prefixed with ``'Submitted '`` and containing commas.

        Returns:
            The parsed date, or ``datetime.date(2024, 1, 1)`` as a placeholder
            when the text cannot be parsed.
        """
        try:
            # Raw string: the former '\,' was an invalid escape sequence.
            date = re.sub(r'Submitted |,', '', date)
            day, month, year = date.split()

            month_idx = self.month_list.index(month.lower()) + 1
            date = datetime.date(int(year), month_idx, int(day))
        except ValueError:
            date = datetime.date(2024, 1, 1)
        return date

    def save_offer(self, infos):
        """Append one record to ``self.file`` as a ';'-delimited CSV row.

        Args:
            infos: Sequence of field values for the row.
        """
        # newline='' keeps Windows from turning the csv '\r\n' into '\r\r\n'.
        with open(self.file, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(infos)
    

In [117]:
# Scrape the papers linked from the Papers with Code start page and append
# them to the scraper's CSV file (requires the `driver` session).
scraper = PWCScrapper()
scraper.scrape_all_offers()

  0%|          | 0/10 [00:00<?, ?it/s]