In [1]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
import json

In [2]:
def get_soup(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    else:
        raise Exception(f"Failed to retrieve content. Status code: {response.status_code}")

In [10]:
def get_article_hrefs(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    driver.get(url)
    time.sleep(5)

    all_hrefs = set()

    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        content_area = soup.find('div', class_='content-area')
        if not content_area:
            break
        
        articles = content_area.find_all('article')
        for article in articles:
            article_header = article.find('header',{'class':'entry-header'})
            article_h2 = article_header.find('h2',{'class':'entry-title'})
            a = article_h2.find('a')
            href = a.get('href')
            if href:
                all_hrefs.add(href)

        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1.2);")
            time.sleep(2)
            older_posts_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//div[@class="content-area"]//main[@class="site-main"]//div[@id="infinite-handle"]//button[text()="Older posts"]'))
            )
            print(older_posts_button)
            older_posts_button.click()
            print('OlderPost Btn Clicked')
            time.sleep(5)

        except TimeoutException:
            print('TimeoutException')
            break

    driver.quit()

    return list(all_hrefs)

In [63]:
def extract_content_from_div(parent_div):
    writing = []
    links = []
    extracted_data = {}

    for child in parent_div.children:
        tag_name = child.name

        if tag_name == 'p':
            text = child.get_text(strip=True)
            if text:
                writing.append(text)
            else:
                images = child.find_all('img')
                for img in images:
                    img_src = img.get('src')
                    if img_src:
                        links.append(img_src)
        elif tag_name == 'div' and child.get('id') != "jp-post-flair":
            print('we in div')
            images = child.find_all('img')
            for img in images:
                img_src = img.get('src')
                if img_src:
                    links.append(img_src)
        elif tag_name == 'figure':
            print('we in figure')
            images = child.find_all('img')
            for img in images:
                img_src = img.get('src')
                if img_src:
                    links.append(img_src)

            iframes = child.find_all('iframe')
            for iframe in iframes:
                iframe_src = iframe.get('src')
                if iframe_src:
                    links.append(iframe_src)

        elif tag_name in ['ol', 'ul']:
            list_items = child.find_all('li')
            for item in list_items:
                item_text = item.get_text(strip=True)
                if item_text:
                    writing.append(item_text)

    extracted_data['Text'] = writing
    extracted_data['mediaLinks'] = links
    
    return extracted_data

In [None]:
dates =['2018','2019','2020','2021','2022','2023','2024']
main_url = "https://eastcroydoncool.co.uk/"
soup = get_soup(main_url)

In [None]:
data = {}
brand_div = soup.find('div',{'class':'site-branding-text'})
brand_p = brand_div.find('p',{'class':'site-title'})
data["site_url"] = brand_p.find('a')['href']
data["site_title"] = brand_p.find('a').text
data['site_description'] = brand_div.find('p',{'class':'site-description'}).text
data["AllContent"] = []

In [11]:
url = "https://eastcroydoncool.co.uk/2018"
yearly_links = get_article_hrefs(url)

<selenium.webdriver.remote.webelement.WebElement (session="8a1566caf937d5391cb5729b5d460c7f", element="f.50F9FC746066EE26CEC9E63A96287ADB.d.B21FEBCD02A2BC9A0D70F44EA4598C12.e.39")>
OlderPost Btn Clicked
TimeoutException


In [12]:
print(len(yearly_links))
print(yearly_links)

15
['https://eastcroydoncool.co.uk/2018/04/05/east-croydon-cool-talks-theatre/', 'https://eastcroydoncool.co.uk/2018/07/04/east-croydon-cool-talkship-hop/', 'https://eastcroydoncool.co.uk/2018/10/23/east-croydon-cool-talks-croydon-literary-festival/', 'https://eastcroydoncool.co.uk/2018/10/10/east-croydon-cool-talks-counselling/', 'https://eastcroydoncool.co.uk/2018/06/10/east-croydon-cool-talks-photography/', 'https://eastcroydoncool.co.uk/2018/08/24/east-croydon-cool-talks-risefestival/', 'https://eastcroydoncool.co.uk/2018/04/16/east-croydon-cool-talks-yoga/', 'https://eastcroydoncool.co.uk/2018/05/29/east-croydon-cool-talks-personal-training/', 'https://eastcroydoncool.co.uk/2018/06/10/croydoniscool-tweet-chat/', 'https://eastcroydoncool.co.uk/2018/05/11/east-croydon-cool-talks-graffiti/', 'https://eastcroydoncool.co.uk/2018/06/25/east-croydon-cool-talks-homelessness/', 'https://eastcroydoncool.co.uk/2018/08/06/east-croydon-cool-talks-jewellery-design/', 'https://eastcroydoncool.co

In [24]:
article = yearly_links[13]
print(article)
blog = {}

https://eastcroydoncool.co.uk/2018/10/06/east-croydon-cool-talks-illustration/


In [65]:
article_soup = get_soup('https://eastcroydoncool.co.uk/2019/12/18/east-croydon-cool-talks-interior-design/')
article_sec = article_soup.find('article')

In [66]:
header = article_sec.find('header',{'class':'entry-header'})
blog['Title'] = header.find('h1').text.strip()
blog['postTimeDate'] = header.find('time').text.strip()
art_div = article_sec.find('div',{'class':'entry-content'})
data = extract_content_from_div(art_div)
with open('data/test.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

In [9]:
blog['title'] = soup.find('h1').text.strip()
content_div = soup.find('div',{'class':'entry-content'})
text_ps = content_div.find_all('p')
txt = []
for text_p in text_ps:
    txt.append(text_p.text)
blog['content'] = ' '.join(txt)
print(blog['title'])
print(blog['content'])

Welcome
Croydon and Cool? Say wuuut?! It’s fair to say Croydon isn’t generally considered the epitome of cool. But like all underdogs – there’s way more to it than it’s given credit for.  This big (Brutalist) slab of concrete (surrounded by acres of countryside) has seen a lot of changes in recent years. In East Croydon, there are more entertainment options thanks to the arrival of Boxpark and the refurbishment of Fairfield Halls. There are increased education opportunities courtesy of a new London South Bank University campus and a partnership between Croydon College and the University of Roehampton. There are more enterprise possibilities due to a start-up culture fostered by the likes of Sussex Innovation Centre and Start Up Croydon. The restaurant and bar scene (celebrated at the annual Croydon Food Festival) is attracting foodies from across London and the long-awaited “Westfield” project is now underway again. All this in addition to what already made Croydon a great place to be:

In [10]:
data['blogs'].append(blog)

In [17]:
with open('data/complete.json','w') as f:
    json.dump(data, f, indent=4)