In [1]:
# Per-site scraping configuration: each entry maps a short site label to the
# listing URL and the XPath expressions for article titles, dates and subtitles.
# Despite the name this is a plain in-memory dict, not a file.
json_file = {
    # towardsdatascience.com
    "TDS": {
        "url": "https://towardsdatascience.com/latest",
        "xpath_title": "//*[contains(@class,'graf') and contains(@class,'graf--h3') and contains(@class,'graf-after--figure') and contains(@class,'graf--title')]",
        "xpath_date": "//time[@datetime]",
        "xpath_subtitle": "//*[@class='graf graf--h4 graf-after--h3 graf--trailing graf--subtitle']",
    },
    # datasciencecentral.com
    "DSC": {
        "url": "https://www.datasciencecentral.com/articles/",
        "xpath_title": "//*[contains(@class,'blog-entry-title entry-title')]/a[@rel='bookmark']",
        "xpath_date": "//*[contains(@class,'entry-date published')]",
        "xpath_subtitle": "//*[@class='excerpt-wrap entry-summary']",
    },
    # levelup.gitconnected.com
    "LevelUp": {
        "url": "https://levelup.gitconnected.com/",
        "xpath_title": "//*[@class='u-letterSpacingTight u-lineHeightTighter u-breakWord u-textOverflowEllipsis u-lineClamp3 u-fontSize24']",
        "xpath_date": "//time[@datetime]",
        "xpath_subtitle": "//*[@class='u-contentSansThin u-lineHeightBaseSans u-fontSize24 u-xs-fontSize18 u-textColorNormal u-baseColor--textNormal']",
    },
}

In [2]:
# Spot-check: display the date XPath stored under the "LevelUp" entry.
json_file['LevelUp']["xpath_date"]

'//time[@datetime]'

In [3]:
import logging
import os
import time
from datetime import datetime, timedelta

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# Chrome launch configuration shared by every scraping run in this notebook.
chr_options = Options()
# NOTE(review): "detach" is documented by ChromeDriver as keeping the browser
# alive after the driver process exits — confirm it is still wanted, since
# scrape_articles() explicitly quits the driver when it finishes.
chr_options.add_experimental_option("detach", True)
# Drop Chrome's "enable-logging" switch (presumably to silence console noise).
chr_options.add_experimental_option('excludeSwitches', ['enable-logging'])

# The chromedriver location is machine-specific. Allow it to be overridden with
# the CHROMEDRIVER_PATH environment variable so the notebook can run on other
# machines; the original hard-coded path remains the fallback.
service = Service(
    os.environ.get(
        "CHROMEDRIVER_PATH",
        'C:/Users/Chesta/Desktop/data_scraping/chromedriver.exe',
    )
)

In [5]:
def setup_driver(chrome_service, chrome_options):
    """Start a Chrome WebDriver session and maximize its window.

    Args:
        chrome_service: Service object passed to ``webdriver.Chrome`` as ``service``.
        chrome_options: Options object passed to ``webdriver.Chrome`` as ``options``.

    Returns:
        The newly created driver, with its window maximized.
    """
    browser = webdriver.Chrome(options=chrome_options, service=chrome_service)
    browser.maximize_window()
    return browser

In [6]:
def parse_towards_ds_date(date_element):
    """Parse the ISO-8601 ``datetime`` attribute of an element into a naive datetime.

    A trailing ``Z`` (UTC designator) is removed only when it is actually
    present, so timestamps without it are no longer truncated by one character.
    If the value carries an explicit UTC offset, the result is converted to UTC
    and returned without tzinfo so it can be compared with other naive datetimes.

    Args:
        date_element: Object exposing ``get_attribute(name)``; its ``datetime``
            attribute is read.

    Returns:
        A timezone-naive ``datetime``.

    Raises:
        ValueError: If the attribute is missing/empty or is not valid ISO format.
    """
    raw_value = date_element.get_attribute('datetime')
    if not raw_value:
        raise ValueError("element has no 'datetime' attribute to parse")

    date_string = raw_value.strip()
    if date_string.endswith(('Z', 'z')):
        # fromisoformat() on older Python versions rejects the 'Z' suffix.
        date_string = date_string[:-1]

    parsed = datetime.fromisoformat(date_string)
    offset = parsed.utcoffset()
    if offset is not None:
        # Normalize offset-aware values to naive UTC.
        parsed = parsed.replace(tzinfo=None) - offset
    return parsed

def parse_data_science_central_date(date_element):
    """Parse an element's ``content`` attribute, formatted ``YYYY-MM-DD``, into a datetime.

    Args:
        date_element: Object exposing ``get_attribute(name)``; its ``content``
            attribute is read.

    Returns:
        A ``datetime`` at midnight of the parsed day.
    """
    return datetime.strptime(date_element.get_attribute('content'), "%Y-%m-%d")


In [7]:
def scrape_articles(driver, url, xpath_title, xpath_subtitle,xpath_date, parse_date, days_back=30, scroll_pause_time=30, max_idle_scrolls=None):
    """Scroll an article listing page and collect title/date/subtitle records.

    The page is loaded once, then repeatedly re-read and scrolled with PAGE_DOWN
    until an article older than ``days_back`` days is encountered. Titles,
    subtitles and dates are located independently and paired by position.

    Args:
        driver: Selenium WebDriver to use. It is always quit before returning.
        url: Listing page to open.
        xpath_title: XPath matching the title elements.
        xpath_subtitle: XPath matching the subtitle elements.
        xpath_date: XPath matching the date elements.
        parse_date: Callable that takes a date element and returns a datetime
            comparable with the naive ``datetime.now()``.
        days_back: Articles older than this many days end the scrape.
        scroll_pause_time: Seconds to sleep after each PAGE_DOWN.
        max_idle_scrolls: Optional. Stop after this many consecutive passes that
            added no new article, which protects against feeds that end before
            the cutoff. ``None`` (default) keeps scrolling indefinitely.

    Returns:
        A list of dicts with keys ``"title"``, ``"date"`` and ``"SubTitle"``
        (``None`` when the subtitle text is empty). If an error occurs it is
        logged and whatever was collected so far is returned.
    """
    data = []
    seen_titles = set()  # O(1) duplicate check instead of rescanning ``data``
    # NOTE(review): the cutoff is based on local naive time; confirm that the
    # parse_date callables return datetimes on the same basis.
    end_date = datetime.now() - timedelta(days=days_back)
    idle_scrolls = 0
    warned_mismatch = False

    try:
        driver.get(url)
        html = driver.find_element(By.TAG_NAME, "html")
        reached_cutoff = False
        while not reached_cutoff:
            titles = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, xpath_title)))
            subtitles = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, xpath_subtitle)))
            date_elements = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, xpath_date)))

            # zip() pairs purely by position; differing counts mean some rows may
            # be misaligned or dropped, so make that visible once.
            if not warned_mismatch and not (len(titles) == len(subtitles) == len(date_elements)):
                logging.warning(
                    "Element counts differ (titles=%d, subtitles=%d, dates=%d); rows may be misaligned",
                    len(titles), len(subtitles), len(date_elements),
                )
                warned_mismatch = True

            added_this_pass = 0
            for title, subtitle, date_element in zip(titles, subtitles, date_elements):
                article_date = parse_date(date_element)  # the whole element is handed to the parser

                if article_date < end_date:
                    # Reached the end of the requested date range: normal exit,
                    # not an error.
                    reached_cutoff = True
                    break

                # Read each element's text once; every access is a WebDriver call.
                title_text = title.text
                if title_text in seen_titles:
                    continue
                seen_titles.add(title_text)

                subtitle_text = subtitle.text
                print(f"Title: {title_text}, Date: {article_date}")
                data.append({"title": title_text, "date": article_date,'SubTitle': subtitle_text if subtitle_text else None})
                added_this_pass += 1

            if reached_cutoff:
                break

            idle_scrolls = 0 if added_this_pass else idle_scrolls + 1
            if max_idle_scrolls is not None and idle_scrolls >= max_idle_scrolls:
                logging.warning("No new articles after %d consecutive scrolls; stopping", idle_scrolls)
                break

            html.send_keys(Keys.PAGE_DOWN)
            time.sleep(scroll_pause_time)
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        logging.error(f"An error occurred: {e}")
    finally:
        driver.quit()
    return data


In [18]:
# Towards Data Science: listing URL and XPath selectors.
towards_ds_url = "https://towardsdatascience.com/latest"
towards_ds_xpath_title = (
    "//*[contains(@class,'graf')"
    " and contains(@class,'graf--h3')"
    " and contains(@class,'graf--title')]"
)
towards_ds_xpath_date = "//time[@datetime]"
towards_ds_xpath_subtitle = "//*[@class='graf graf--h4 graf-after--h3 graf--trailing graf--subtitle']"

# Data Science Central: listing URL and XPath selectors.
data_science_central_url = "https://www.datasciencecentral.com/articles/"
data_science_central_xpath_title = "//*[contains(@class,'blog-entry-title entry-title')]/a[@rel='bookmark']"
data_science_central_xpath_date = "//*[contains(@class,'entry-date published')]"
data_science_central_xpath_subtitle = "//*[@class='excerpt-wrap entry-summary']"


In [32]:
# Scrape Towards Data Science in a fresh browser session; scrape_articles()
# quits the driver it is given, so each run needs its own.
driver = setup_driver(chrome_service=service, chrome_options=chr_options)
towards_ds_data = scrape_articles(
    driver=driver,
    url=towards_ds_url,
    xpath_title=towards_ds_xpath_title,
    xpath_subtitle=towards_ds_xpath_subtitle,
    xpath_date=towards_ds_xpath_date,
    parse_date=parse_towards_ds_date,
)

Title: Solving Reasoning Problems with LLMs in 2023, Date: 2024-01-06 01:13:04.869000
Title: Demystifying Graph Neural Networks, Date: 2024-01-05 19:54:23.724000
Title: Low Quality Image Detection, Date: 2024-01-05 19:19:10.163000
Title: Learning Discrete Data with Harmoniums: Part I, The Essentials, Date: 2024-01-05 19:17:08.024000
Title: Boosting Algorithms in Machine Learning, Part I: AdaBoost, Date: 2024-01-05 19:06:55.589000
Title: Data Science Better Practices, Part 2 — Work Together, Date: 2024-01-05 18:57:40.209000
Title: How to Learn AI on Your Own (a self-study guide), Date: 2024-01-05 18:09:48.574000
Title: LLMs for Everyone: Running the LLaMA-13B model and LangChain in Google Colab, Date: 2024-01-05 16:06:29.233000
Title: AutoGluon-TimeSeries: Every Time Series Forecasting Model In One Library, Date: 2024-01-05 15:53:59.642000
Title: I Tried Data Analysis ChatGPT Plugin — Every Analyst’s Dream or a Nightmare in Disguise?, Date: 2024-01-05 15:48:39.329000
Title: Keep Track o

In [36]:
# Scrape Data Science Central in its own browser session, using the
# date-only parser for that site.
driver = setup_driver(chrome_service=service, chrome_options=chr_options)
data_science_central_data = scrape_articles(
    driver=driver,
    url=data_science_central_url,
    xpath_title=data_science_central_xpath_title,
    xpath_subtitle=data_science_central_xpath_subtitle,
    xpath_date=data_science_central_xpath_date,
    parse_date=parse_data_science_central_date,
)

Title: Mitigating Ethical Risks in Generative AI: Strategies for a Safe and Secure AI Application, Date: 2024-01-03 00:00:00
Title: DSC Weekly 2 January 2024, Date: 2024-01-02 00:00:00
Title: Mastering IoT Data Management for Business Success, Date: 2024-01-01 00:00:00
Title: Generative AI business model disruption: The NYT lawsuit posturing , Date: 2024-01-01 00:00:00
Title: GenAI: Synthesizing DNA Sequences with LLM Techniques, Date: 2024-01-01 00:00:00
Title: GenAI: Beware the Productivity Trap; It’s About Economics – Part 1, Date: 2023-12-31 00:00:00
Title: The Best Kept Secret About LLMs, Date: 2023-12-24 00:00:00
Title: Data Monetization? Cue the Chief Data Monetization Officer, Date: 2023-12-23 00:00:00
Title: Creating a More Fair, Just, and Prosperous Brave New World with AI Summary, Date: 2023-12-22 00:00:00
Title: Eight Techniques for Powering ChatGPT Content, Date: 2023-12-22 00:00:00
Title: The challenges cloud migration and modernization solve for enterprises, Date: 2023-1

In [33]:
# Name the columns explicitly (same keys and order as the scraped records) so
# the frame still has a 'date' column when the scrape returned no rows;
# otherwise the later date conversion fails with a KeyError.
towards_df = pd.DataFrame(towards_ds_data, columns=["title", "date", "SubTitle"])

In [34]:
# Store the scraped timestamps as datetime64, then print the frame's schema.
towards_df["date"] = pd.to_datetime(towards_df["date"])
towards_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     221 non-null    object        
 1   date      221 non-null    datetime64[ns]
 2   SubTitle  221 non-null    object        
dtypes: datetime64[ns](1), object(2)
memory usage: 5.3+ KB


In [35]:
# Persist the Towards Data Science results; the row index is written as the
# first (unnamed) CSV column, which is pandas' default.
towards_df.to_csv("towards_datascience2.csv", index=True)

In [37]:
# Name the columns explicitly (same keys and order as the scraped records) so
# the frame still has a 'date' column when the scrape returned no rows;
# otherwise the later date conversion fails with a KeyError.
datascience_df = pd.DataFrame(data_science_central_data, columns=["title", "date", "SubTitle"])

In [38]:
# Store the scraped dates as datetime64, then print the frame's schema.
datascience_df["date"] = pd.to_datetime(datascience_df["date"])
datascience_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     26 non-null     object        
 1   date      26 non-null     datetime64[ns]
 2   SubTitle  26 non-null     object        
dtypes: datetime64[ns](1), object(2)
memory usage: 752.0+ bytes


In [39]:
# Persist the Data Science Central results; the row index is written as the
# first (unnamed) CSV column, which is pandas' default.
datascience_df.to_csv("datascience_df2.csv", index=True)

In [8]:
# Level Up Coding: listing URL and XPath selectors.
levelup_url = "https://levelup.gitconnected.com/latest"
levelup_xpath_title = (
    "//*[contains(@class, 'graf')"
    " and contains(@class, 'graf--h3')"
    " and contains(@class, 'graf-after--figure')"
    " and contains(@class, 'graf--trailing')"
    " and contains(@class, 'graf--title')]"
)
levelup_xpath_date = "//time[@datetime]"
levelup_xpath_subtitle = (
    "//*[contains(@class, 'graf')"
    " and contains(@class, 'graf--h4')"
    " and contains(@class, 'graf-after--h3')"
    " and contains(@class, 'graf--trailing')"
    " and contains(@class, 'graf--subtitle')]"
)

In [9]:
# Scrape Level Up Coding in its own browser session. Its dates are read from
# <time datetime="..."> elements, so the Towards Data Science parser is reused.
driver = setup_driver(chrome_service=service, chrome_options=chr_options)
level_up_coding = scrape_articles(
    driver=driver,
    url=levelup_url,
    xpath_title=levelup_xpath_title,
    xpath_subtitle=levelup_xpath_subtitle,
    xpath_date=levelup_xpath_date,
    parse_date=parse_towards_ds_date,
)

Title: Learning Go: Part Eight — Concurrency and Channels, Date: 2024-01-06 16:03:09.738000
Title: Navigating the Challenges of Automation Testing: A Detox Review, Date: 2024-01-06 12:27:48.452000
Title: Number Triangles (Can You Solve This in ONE Line? 1), Date: 2024-01-06 07:13:46.239000
Title: Build More Effective ChatGPT Prompts with Poptimizer, Date: 2024-01-05 23:48:06.421000
Title: Microsoft’s new Copilot app: Here are the best ways to use the ChatGPT alternative, Date: 2024-01-05 19:06:27.300000
Title: Rust Is Getting Extremely Popular. Here Are Some Important Projects Using It. (Even The Windows Kernel!), Date: 2024-01-05 16:36:12.589000
Title: 7 Advanced Programming Concepts That Confound the Novice, Date: 2024-01-05 14:33:07.021000
Title: Building Stable Diffusion from Scratch Using Python, Date: 2024-01-05 12:23:08.600000
Title: Mastering Database Division: A Comprehensive Guide to Dynamic Django Expressions and Advanced ORM Techniques, Date: 2024-01-05 11:38:26.062000
Titl

ERROR:root:An error occurred: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=120.0.6099.131)
Stacktrace:
	GetHandleVerifier [0x00007FF77F134D02+56194]
	(No symbol) [0x00007FF77F0A04B2]
	(No symbol) [0x00007FF77EF476AA]
	(No symbol) [0x00007FF77EF2FD6A]
	(No symbol) [0x00007FF77EF2FC20]
	(No symbol) [0x00007FF77EF499A1]
	(No symbol) [0x00007FF77EFD2047]
	(No symbol) [0x00007FF77EFB5C23]
	(No symbol) [0x00007FF77EF84A45]
	(No symbol) [0x00007FF77EF85AD4]
	GetHandleVerifier [0x00007FF77F4AD5BB+3695675]
	GetHandleVerifier [0x00007FF77F506197+4059159]
	GetHandleVerifier [0x00007FF77F4FDF63+4025827]
	GetHandleVerifier [0x00007FF77F1CF029+687785]
	(No symbol) [0x00007FF77F0AB508]
	(No symbol) [0x00007FF77F0A7564]
	(No symbol) [0x00007FF77F0A76E9]
	(No symbol) [0x00007FF77F098094]
	BaseThreadInitThunk [0x00007FF8ACAD257D+29]
	RtlUserThreadStart [0x00007FF8AD90AA58+40]

