In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import json
import os


In [None]:
# Scrapes all blog links from the "page_num":th page of "https://futurice.com/blog?page=" and returns the urls found as a list of strings (or an empty list)
def scrape_one_base_page_for_urls(page_num):
    """Collect every link target found on one page of the Futurice blog index.

    Fetches "https://futurice.com/blog?page=<page_num>" and gathers the
    ``href`` value of each ``<a>`` element located inside ``<body><main>``.

    Args:
        page_num: Page number appended to the index url (converted with ``str``).

    Returns:
        list of str: The non-empty ``href`` values in document order, or an
        empty list when the server answers with a non-2xx status code.
    """
    base_url = "https://futurice.com/blog?page="

    r = requests.get(base_url + str(page_num))

    # Any non-2xx answer counts as a failed read. The paging loops in this
    # notebook treat an empty result as "nothing more to scrape", so this
    # must be a real empty list (the previous `List()` raised NameError).
    if r.status_code // 100 != 2:
        print("ERROR WHILE READING WEBPAGE")
        return []

    # Parse the text
    soup = BeautifulSoup(r.text, 'html.parser')

    # Look each href up once; anchors without one (or with an empty one) are skipped.
    hrefs = (anchor.get("href") for anchor in soup.body.main.find_all("a"))
    return [href for href in hrefs if href]


# Scrapes all urls from all the pages (base page "futurice.com/blog")
def scrape_all_main_pages_subpages(verbose = False):
    """Walk the blog index page by page and gather every url found.

    Pages are requested starting from 1 through
    ``scrape_one_base_page_for_urls``. Paging stops after the first page
    that contributes no urls to the running collection.

    Args:
        verbose: When true, print "<page>: <total urls so far>" after every
            page and a closing message once paging has stopped.

    Returns:
        list of str: All urls collected, in the order they were scraped.
    """
    collected = []
    page = 1

    while True:
        total_before = len(collected)
        collected.extend(scrape_one_base_page_for_urls(page))

        # The progress line is printed for the terminating page as well
        if verbose:
            print(f"{page}: {len(collected)}")

        # A page that added nothing marks the end of the index
        if len(collected) == total_before:
            break

        page += 1

    if verbose:
        print("Finnished")
    return collected


# Helper function that writes the urls to file
def write_to_file(urls, file_path = "./urls"):
    """Write the urls to a text file, one per line, below a "url" header line.

    Args:
        urls: Iterable of url strings.
        file_path: Destination path; the file is opened in "w" mode, so an
            existing file is overwritten.
    """
    with open(file_path, "w", encoding='utf-8') as out:
        out.write("url\n")
        out.writelines(entry + "\n" for entry in urls)


def fetch_all_blog_urls_to_local():
    """Scrape every blog index page and save the urls via ``write_to_file``.

    The urls come from ``scrape_all_main_pages_subpages`` and are written
    to ``write_to_file``'s default path.
    """
    write_to_file(scrape_all_main_pages_subpages())


# Reads all the urls from Futurice's main blog page (futurice.com/blog) and checks those against the "./incompatible_blogs.csv"
# and "../data/blogs_with_analytics.csv" files to return three values:
#       new_blogs                           set of strings, containing the paths to newly added blog posts that this database hasn't seen
#       removed_old_blogs                   set of strings, containing the paths to blog posts that have been added to this database, but that have been removed from Futurice's page
#       removed_old_incompatible_blogs      set of strings, containing the paths to blog posts that were tried to add to the database, but were incompatible, and that have now been removed from Futurice's webpage
#
# All the paths to blog posts are of the form "blog/[blogs_own_unique_path]". This is the slower implementation, which checks all the pages.
def get_blog_url_statuses():
    """Diff the blogs currently listed on futurice.com/blog against the local records.

    Reads the saved blogs from "../data/blogs_with_analytics.csv" (tab
    separated, ``url`` column) and the rejected ones from
    "./incompatible_blogs.csv" (``urls`` column), then scrapes every index
    page through ``scrape_all_main_pages_subpages``.

    Returns:
        tuple: Three sets of str, in this order:
            new_blogs: scraped paths found in neither csv file.
            removed_old_blogs: saved paths that were not scraped any more.
            removed_old_incompatible_blogs: incompatible paths that were
                not scraped any more.
    """
    # Blog urls that are currently in the database.
    # `infer_datetime_format` is no longer passed: pandas deprecated the
    # keyword in 2.0, where format inference became the default behaviour.
    df = pd.read_csv("../data/blogs_with_analytics.csv", sep="\t", parse_dates=["time"], index_col=["index"])
    saved_urls = set(df["url"].values)

    # Blog urls that have been tried, but were incompatible and therefore not added to database.
    # Everything up to and including "futurice.com/" is stripped so the values
    # are comparable with the other two collections. The pattern is a raw
    # string (plain "\S" is an invalid escape) and the dot is matched literally.
    incompatible_blogs = pd.read_csv("./incompatible_blogs.csv", index_col=0)
    incompatible_urls = {re.sub(r"\S*futurice\.com/", "", word) for word in incompatible_blogs["urls"]}

    # Blog urls currently in the futurice webpages; the first character of
    # every scraped href is dropped (presumably the leading "/" — verify
    # against the hrefs the site actually serves).
    urls = {word[1:] for word in scrape_all_main_pages_subpages()}

    new_blogs = urls - saved_urls - incompatible_urls
    removed_old_blogs = saved_urls - urls
    # Compare the incompatible *urls*; iterating the DataFrame itself would
    # only yield its column names.
    removed_old_incompatible_blogs = incompatible_urls - urls

    return new_blogs, removed_old_blogs, removed_old_incompatible_blogs


# Returns the most recent newly added blogposts.
# Faster implementation than "get_blog_url_statuses", but not as robust. 
def get_most_recent_newly_added_blogs():
    """Return the paths scraped before the first already-saved blog is met.

    Index pages are fetched one at a time, starting from page 1, with
    ``scrape_one_base_page_for_urls``. Scraping stops as soon as a page
    comes back empty or a url (minus its first character) is found in the
    ``url`` column of "../data/blogs_with_analytics.csv".

    Returns:
        list of str: The scraped urls, first character removed, in scrape
        order, up to but excluding the first one already saved.
    """
    # Blog urls that are currently in the database. A set gives constant-time
    # membership checks instead of scanning a NumPy array for every url.
    # `infer_datetime_format` is no longer passed: pandas deprecated the
    # keyword in 2.0, where format inference became the default behaviour.
    df = pd.read_csv("../data/blogs_with_analytics.csv", sep="\t", parse_dates=["time"], index_col=["index"])
    saved_urls = set(df["url"].values)

    ret_urls = []  # urls of new blogs (that are to be returned from the function call)
    index = 1      # number of the index page being scraped

    while True:
        # Urls fetched from the index:th blog base page
        page_urls = scrape_one_base_page_for_urls(index)

        # An empty page means there is nothing further to read
        if len(page_urls) == 0:
            return ret_urls

        for url in page_urls:
            path = url[1:]
            # First known blog reached: everything gathered so far is the answer
            if path in saved_urls:
                return ret_urls
            ret_urls.append(path)

        index += 1

Everything after this is debugging

In [None]:
# Debug call: the returned list of not-yet-saved blog paths is shown as the cell output
get_most_recent_newly_added_blogs()

In [None]:
# Debug call: the function returns (new_blogs, removed_old_blogs, removed_old_incompatible_blogs),
# so `old` receives the removed saved blogs and `removed` the removed incompatible blogs.
new, old, removed = get_blog_url_statuses()
# Show the newly found blog paths as the cell output
new