In [None]:
# Install necessary libraries
!pip install selenium
!pip install requests
!pip install newspaper3k
!pip install beautifulsoup4

# Import required libraries
import os
import re
import requests
from newspaper import Article
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import time

# Headless Chrome: the scraper drives a browser without opening a visible window.
options = Options()
options.add_argument("--headless=new")

# Launch the browser, maximize its window, and open the site's landing page.
driver = webdriver.Chrome(options=options)
driver.maximize_window()
driver.get("https://dataconomy.com/")

# Per-category accumulators, each keyed by the category label read from a blog card:
#   blogs_URL       -> list of article URLs
#   blogs_title     -> list of article titles, appended together with blogs_URL
#   blogs_title_set -> set of titles already recorded (used for de-duplication)
blogs_URL, blogs_title, blogs_title_set = {}, {}, {}

def go_to_category(driver, category_URL):
    """Load a category listing page in the browser.

    Args:
        driver: Object exposing ``get(url)``; the script passes its Chrome
            WebDriver instance.
        category_URL: Address handed unchanged to ``driver.get``.
    """
    driver.get(category_URL)

# Category slugs to crawl; each one is appended to
# "https://dataconomy.com/category/" to form the listing URL.
category_sub_URLs = [
    "news",
    "topics/data-science/artificial-intelligence",
    "topics/data-science/big-data",
    "topics/data-science/machine-learning",
    "tech-trends/blockchain-tech-trends",
    "cybersecurity",
    "topics/fintech",
    "gaming",
    "topics/internet-of-things",
    "startups",
    "industry/energy-environment",
    "industry/finance",
    "industry/healthcare",
    "industry/industrial-goods-and-services",
    "industry/marketing-sales",
    "industry/retail-and-consumer-industry",
    "industry/technology-and-it-industry",
    "industry/transportation-and-logistics",
]

# Helper: record a single blog card in the per-category dictionaries
def _register_blog(blog, blogs_URL, blogs_title, blogs_title_set):
    """Record one blog card under its category unless its title is already known.

    Args:
        blog: Element for one blog card; must support ``find_element``.
        blogs_URL: Dict mapping category label -> list of article URLs (mutated).
        blogs_title: Dict mapping category label -> list of titles (mutated).
        blogs_title_set: Dict mapping category label -> set of seen titles (mutated).

    Raises:
        NoSuchElementException: If the card has no category, title, or link element.
    """
    blog_category = blog.find_element(By.CLASS_NAME, 'jeg_post_category').text
    blog_title_tag = blog.find_element(By.CLASS_NAME, 'jeg_post_title')
    blog_title = blog_title_tag.text
    # Resolve the link BEFORE touching the dictionaries: if the lookup fails,
    # the title and URL lists must not end up with different lengths.
    blog_link = blog_title_tag.find_element(By.TAG_NAME, 'a').get_attribute("href")

    titles = blogs_title.setdefault(blog_category, [])
    links = blogs_URL.setdefault(blog_category, [])
    seen_titles = blogs_title_set.setdefault(blog_category, set())

    if blog_title in seen_titles:
        return

    seen_titles.add(blog_title)
    titles.append(blog_title)
    links.append(blog_link)


# Function to scrape blogs from the specified subcategories
def scrape_blogs(driver, category_sub_URLs, blogs_URL, blogs_title, blogs_title_set):
    """Collect blog titles and URLs from every listed category, page by page.

    For each category the four hero cards (``jeg_hero_item_1`` .. ``_4``) are
    read first, then every ``jeg_pl_md_1`` card on the current page, following
    the ``page_nav next`` link until no such link is found.

    Args:
        driver: Selenium WebDriver used for navigation and element lookup.
        category_sub_URLs: Iterable of category slugs appended to
            ``https://dataconomy.com/category/``.
        blogs_URL: Dict filled in place: category label -> list of URLs.
        blogs_title: Dict filled in place: category label -> list of titles.
        blogs_title_set: Dict filled in place: category label -> set of titles,
            used to skip titles that were already recorded.

    Returns:
        None. Results are written into the three dictionaries.
    """
    for category_sub_URL in category_sub_URLs:
        go_to_category(driver, f"https://dataconomy.com/category/{category_sub_URL}/")

        # Scrape the first four blogs on the page. A missing hero slot (or a
        # hero card without the expected children) is skipped instead of
        # aborting the whole crawl.
        for i in range(1, 5):
            try:
                blog = driver.find_element(By.CLASS_NAME, f'jeg_hero_item_{i}')
                _register_blog(blog, blogs_URL, blogs_title, blogs_title_set)
            except NoSuchElementException:
                continue

        page_no = 1

        # Continue scraping remaining blogs using pagination
        while True:
            for blog in driver.find_elements(By.CLASS_NAME, 'jeg_pl_md_1'):
                try:
                    _register_blog(blog, blogs_URL, blogs_title, blogs_title_set)
                except NoSuchElementException:
                    # A malformed card must not be mistaken for "no next page",
                    # which would silently end this category early.
                    continue

            print(f"Page {page_no} over...")
            page_no += 1

            # Only a missing "next" link means the category is exhausted.
            try:
                next_page_button = driver.find_element(By.XPATH, "//a[@class='page_nav next']")
            except NoSuchElementException:
                print(f"All Blogs from https://dataconomy.com/category/{category_sub_URL}/ Registered!\n")
                break

            next_page_URL = next_page_button.get_attribute("href")
            if next_page_URL and not next_page_URL.endswith("#"):
                # Navigating by URL blocks until the next page has loaded, so the
                # following find_elements call does not re-read the old page.
                go_to_category(driver, next_page_URL)
            else:
                # No usable href: fall back to scrolling down and clicking the link.
                webdriver.ActionChains(driver).send_keys(Keys.END).perform()
                webdriver.ActionChains(driver).move_to_element(next_page_button).click(next_page_button).perform()

# Run the crawl; the three dictionaries are filled in place
scrape_blogs(
    driver=driver,
    category_sub_URLs=category_sub_URLs,
    blogs_URL=blogs_URL,
    blogs_title=blogs_title,
    blogs_title_set=blogs_title_set,
)

# List every collected URL with a running 1-based index and its category
index = 1
for URL_category, urls in blogs_URL.items():
    for offset, url in enumerate(urls):
        print(index + offset, URL_category, url)
    index += len(urls)

# Function to save blog information to a text file
def save_to_text_file(title, link, author_name, author_URL, date, text, summary, folder_name):
    """Write one article's details to ``<folder_name>/<sanitized title>.txt``.

    Args:
        title: Article title; also used (after sanitizing) as the file name.
        link: Article URL.
        author_name: Author's display name.
        author_URL: URL listing the author's other posts.
        date: Publish date; formatted with ``str()`` via an f-string.
        text: Full article text.
        summary: Article summary.
        folder_name: Target directory, created if missing. An empty string
            means the current working directory.

    Returns:
        None. Prints the path of the file that was written.
    """
    # Remove characters that are invalid in file names. The script's per-article
    # loop rebuilds this same name to detect already-saved files, so the pattern
    # has to stay identical in both places.
    safe_title = re.sub(r'[\/:*?"<>|]', '', title)
    filename = os.path.join(folder_name, safe_title + '.txt')

    # Create the directory if it doesn't exist. os.makedirs('') raises
    # FileNotFoundError, so an empty folder name is treated as "current directory".
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    sections = [
        f'Title: {title}\n\n',
        f'Article Link: {link}\n\n',
        f'Author: {author_name}\n\n',
        f'Publish Date: {date}\n\n',
        f'Article:\n{text}\n\n',
        f'Summary: {summary}\n\n',
        f'Other Blogs by {author_name}: {author_URL}\n',
    ]

    # Write everything in one call instead of seven small writes
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(''.join(sections))

    print(f'Information saved to {filename}')

# Iterate through each category and blog URL to scrape and save blog information.
# A single failing article (network error, HTTP error status, missing author or
# publish date) is reported or filled with a placeholder instead of stopping the run.
for URL_category, url_list in blogs_URL.items():
    for url in url_list:
        time.sleep(5)  # Add a delay to avoid making too many requests in a short time

        # Make a request to the URL; the timeout keeps one stalled connection from
        # hanging the whole run, and error status pages are not saved as articles.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f'Could not fetch {url}: {error}. Skipping...')
            continue
        html = response.text

        # Create an Article object from the HTML that was already downloaded
        article = Article(url)
        article.set_html(html)
        article.parse()

        # Extract article title
        title = article.title

        filename = os.path.join(URL_category, re.sub(r'[\/:*?"<>|]', '', title) + '.txt')
        # Check if the file already exists
        if os.path.exists(filename):
            print(f'File {filename} already exists. Skipping...')
            continue

        # Run the NLP step (needed for the summary) only for articles that will be saved
        article.nlp()

        # Use BeautifulSoup to parse the HTML and find author information;
        # either lookup may come back empty, so guard before dereferencing.
        soup = BeautifulSoup(html, 'html.parser')
        author_block = soup.find('div', class_='jeg_meta_author')
        author = author_block.find('a') if author_block is not None else None

        # Extract information from the article
        link = article.url
        author_name = author.text if author is not None else 'Unknown'
        author_URL = author.get('href') if author is not None else 'Unknown'
        # publish_date is empty when no date could be extracted from the page
        date = article.publish_date.date() if article.publish_date else 'Unknown'
        text = article.text
        summary = article.summary

        # Save information to a text file in the respective folder
        save_to_text_file(title, link, author_name, author_URL, date, text, summary, URL_category)

In [None]:
# !pip install selenium
# !pip install requests
# !pip install newspaper3k
# !pip install beautifulsoup4

# import os
# import re
# import requests
# from newspaper import Article
# from bs4 import BeautifulSoup
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from selenium.common.exceptions import NoSuchElementException
# from selenium.webdriver.common.action_chains import ActionChains
# import time

# options = Options()
# options.add_argument("--headless=new")
# driver = webdriver.Chrome(options=options)
# driver.maximize_window()
# driver.get("https://dataconomy.com/")

# blogs_URL = {}
# blogs_title_set = {}
# blogs_title = {}

# def go_to_category(driver, category_URL):
    
#     driver.get(category_URL)

# category_sub_URLs = ["news", "topics/data-science/artificial-intelligence", "topics/data-science/big-data", "topics/data-science/machine-learning", "tech-trends/blockchain-tech-trends", "cybersecurity", "topics/fintech", "gaming", "topics/internet-of-things", "startups", "industry/energy-environment", "industry/finance", "industry/healthcare", "industry/industrial-goods-and-services", "industry/marketing-sales", "industry/retail-and-consumer-industry", "industry/technology-and-it-industry", "industry/transportation-and-logistics"]

# def scrape_blogs(driver, category_sub_URLs, blogs_URL, blogs_title, blogs_title_set):
    
#     # first four blogs
#     for category_sub_URL in category_sub_URLs:
#         go_to_category(driver, f"https://dataconomy.com/category/{category_sub_URL}/")

#         for i in range(1,5):

#             blog = driver.find_element(By.CLASS_NAME, f'jeg_hero_item_{i}')

#             blog_category = blog.find_element(By.CLASS_NAME, 'jeg_post_category').text
#             blog_title_tag = blog.find_element(By.CLASS_NAME, 'jeg_post_title')

#             if blog_category not in blogs_title:
#                 blogs_title[blog_category] = []
#                 blogs_URL[blog_category] = []
#                 blogs_title_set[blog_category] = set()

#             if blog_title_tag.text not in blogs_title_set[blog_category]:

#                 blogs_title_set[blog_category].add(blog_title_tag.text)
#                 blogs_title[blog_category].append(blog_title_tag.text)
#                 blogs_URL[blog_category].append(blog_title_tag.find_element(By.TAG_NAME, 'a').get_attribute("href"))

#         page_no = 1

#         while True:
#             try:
#                 remaining_blogs = driver.find_elements(By.CLASS_NAME, 'jeg_pl_md_1')

#                 for blog in remaining_blogs:

#                     blog_category = blog.find_element(By.CLASS_NAME, 'jeg_post_category').text
#                     blog_title_tag = blog.find_element(By.CLASS_NAME, 'jeg_post_title')

#                     if blog_category not in blogs_title:
#                         blogs_title[blog_category] = []
#                         blogs_URL[blog_category] = []
#                         blogs_title_set[blog_category] = set()

#                     if blog_title_tag.text not in blogs_title_set[blog_category]:

#                         blogs_title_set[blog_category].add(blog_title_tag.text)
#                         blogs_title[blog_category].append(blog_title_tag.text)
#                         blogs_URL[blog_category].append(blog_title_tag.find_element(By.TAG_NAME, 'a').get_attribute("href"))

#                 print(f"Page {page_no} over...")
#                 page_no += 1

#                 webdriver.ActionChains(driver).send_keys(Keys.END).perform()

#                 next_page_button = driver.find_element(By.XPATH, "//a[@class='page_nav next']")

#                 webdriver.ActionChains(driver).move_to_element(next_page_button).click(next_page_button).perform()

#             except NoSuchElementException:
#                 print(f"All Blogs from https://dataconomy.com/category/{category_sub_URL}/ Registered!\n")
#                 break

# scrape_blogs(driver, category_sub_URLs, blogs_URL, blogs_title, blogs_title_set)

# index = 1
# for URL_category, urls in blogs_URL.items():
#     for url in urls:
#         print(index, URL_category, url)
#         index += 1

# def save_to_text_file(title, link, author_name, author_URL, date, text, summary, folder_name):
    
#     # Remove invalid characters from the title to create a valid filename for the text file
#     filename = os.path.join(folder_name, re.sub(r'[\/:*?"<>|]', '', title) + '.txt')

#     # Create the directory if it doesn't exist
#     os.makedirs(folder_name, exist_ok=True)

#     # Open the file in write mode
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the information to the file
#         file.write(f'Title: {title}\n\n')
#         file.write(f'Article Link: {link}\n\n')
#         file.write(f'Author: {author_name}\n\n')
#         file.write(f'Publish Date: {date}\n\n')
#         file.write(f'Article:\n{text}\n\n')
#         file.write(f'Summary: {summary}\n\n')
#         file.write(f'Other Blogs by {author_name}: {author_URL}\n')

#     print(f'Information saved to {filename}')

# for URL_category, url_list in blogs_URL.items():
#     for url in url_list:

#         time.sleep(5)

#         # Make a request to the URL
#         response = requests.get(url)
#         html = response.text

#         # Use BeautifulSoup to parse the HTML
#         soup = BeautifulSoup(html, 'html.parser')

#         # Find author information
#         author = soup.find('div', class_='jeg_meta_author').find('a')

#         # Create an Article object
#         article = Article(url)
#         article.set_html(html)
#         article.parse()
#         article.nlp()

#         # Extract information from the article
#         title = article.title

#         filename = os.path.join(URL_category, re.sub(r'[\/:*?"<>|]', '', title) + '.txt')
#         # Check if the file already exists
#         if os.path.exists(filename):
#             print(f'File {filename} already exists. Skipping...')
#             continue
        
#         # Extract information from the article
#         link = article.url
#         author_name = author.text
#         author_URL = author.get('href')
#         date = article.publish_date.date()
#         text = article.text
#         summary = article.summary

#         # Save information to a text file in the respective folder
#         save_to_text_file(title, link, author_name, author_URL, date, text, summary, URL_category)