# In [None]:
from datetime import datetime, timedelta
from random import randint # Import the randint function
from time import sleep # Import the sleep function
from urllib.parse import urljoin

import pandas as pd
import urllib3
from bs4 import BeautifulSoup as BS

# Root of the site; listing pages live at BASE_URL/<category>/<YYYY>/<MM>/<DD>.
BASE_URL = "https://ekantipur.com"

# Headers sent with every request. They are passed to the PoolManager
# constructor: assigning an ``addheaders`` attribute on a PoolManager (a
# urllib2-era idiom) has no effect on the requests it sends.
REQUEST_HEADERS = {'User-agent': 'Mozilla/61.0'}

# Inclusive bounds, in seconds, of the random pause taken before each request.
MIN_DELAY_SECONDS = 2
MAX_DELAY_SECONDS = 10

# Destination of the final CSV inside the mounted Google Drive.
# Change 'My Drive' to your Drive folder name if needed.
OUTPUT_CSV = '/content/drive/My Drive/kantipurdataset.csv'

# List of categories to scrape
categories = ["business", "opinion", "sports", "national", "entertainment", "feature", "world","blog","koseli","diaspora","Education"]  # Add more categories as needed


def build_listing_url(category, day):
    """Return the listing-page URL for *category* on *day*.

    *day* is any object with a ``strftime`` method (``date`` or ``datetime``);
    it is rendered as ``YYYY/MM/DD``.
    """
    formatted_date = day.strftime("%Y/%m/%d")
    return f"{BASE_URL}/{category}/{formatted_date}"


def resolve_article_url(href):
    """Return an absolute article URL for an ``href`` found on a listing page.

    Site-relative links are resolved against ``BASE_URL``; links that are
    already absolute are returned unchanged instead of being glued onto the
    site root.
    """
    return urljoin(BASE_URL, href)


def join_paragraphs(texts):
    """Strip each string in *texts* and join them with single spaces."""
    return " ".join(text.strip() for text in texts).strip()


def fetch_soup(http, url):
    """Pause, GET *url* through *http*, and return the parsed page.

    A random pause of MIN_DELAY_SECONDS..MAX_DELAY_SECONDS precedes every
    request so the site is not hammered. Returns ``None`` (after printing a
    short notice) when the request fails or the server answers with an error
    status, so one bad page does not abort the whole run.
    """
    sleep(randint(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS))
    try:
        response = http.request('GET', url)
    except urllib3.exceptions.HTTPError as err:
        print(f"Skipping {url}: {err}")
        return None
    if response.status >= 400:
        print(f"Skipping {url}: HTTP {response.status}")
        return None
    return BS(response.data, 'html5lib')


def parse_listing_row(row):
    """Extract title, link and description from one ``.normal`` listing row.

    Returns a ``(title_text, title_link, description)`` tuple, or ``None`` when
    the row has no ``<h2>`` heading containing a link. ``description`` is
    ``None`` when the row has no ``<p>`` element.
    """
    heading = row.find("h2")
    anchor = heading.a if heading else None
    href = anchor.get("href") if anchor else None
    if not href:
        return None

    summary = row.find("p")
    description = summary.text if summary else None
    return heading.text, resolve_article_url(href), description


def parse_article(article_soup):
    """Extract date, author and body text from a parsed article page.

    Returns a ``(published, author_name, author_url, content)`` tuple. Each of
    the first three is ``None`` when its element is missing; ``content`` is an
    empty string when the page has no ``div.description``.
    """
    # The publication date/time is the text of the div with class "time-card".
    time_card = article_soup.select_one("div.time-card")
    published = time_card.text.strip() if time_card else None

    # The author block may be absent, or present without a link.
    author_name = None
    author_url = None
    author_block = article_soup.select_one(".author")
    if author_block:
        author_name = author_block.text
        author_link = author_block.a
        author_url = author_link.get("href") if author_link else None

    # The article body is the text of every <p> inside div.description.
    body = article_soup.select_one("div.description")
    paragraphs = body.find_all("p") if body else []
    content = join_paragraphs(paragraph.text for paragraph in paragraphs)

    return published, author_name, author_url, content


def scrape_news(categories, dates):
    """Scrape every article listed under each category/date combination.

    Returns a list of dictionaries, one per article, with the keys
    ``Title``, ``URL``, ``Date``, ``Author``, ``Category``, ``Author URL``,
    ``Description`` and ``Content``. Listing pages, rows and articles that
    cannot be fetched or parsed are skipped.
    """
    # One pool for the whole run so connections to the site are reused.
    http = urllib3.PoolManager(headers=REQUEST_HEADERS)
    news_items = []

    for category in categories:
        for day in dates:
            listing_soup = fetch_soup(http, build_listing_url(category, day))
            if listing_soup is None:
                continue

            # Each div with the 'normal' class is one article teaser.
            for row in listing_soup.select(".normal"):
                parsed_row = parse_listing_row(row)
                if parsed_row is None:
                    continue
                title_text, title_link, description = parsed_row

                article_soup = fetch_soup(http, title_link)
                if article_soup is None:
                    continue
                published, author_name, author_url, content = parse_article(article_soup)

                news_items.append({
                    "Title": title_text,
                    "URL": title_link,
                    "Date": published,
                    "Author": author_name,
                    "Category": category,
                    "Author URL": author_url,
                    "Description": description,
                    "Content": content
                })

    return news_items


if __name__ == "__main__":
    # Mount Google Drive (if not already mounted). Done first so a mount
    # failure surfaces before the long scrape starts.
    from google.colab import drive
    drive.mount('/content/drive')

    # Today plus the preceding day(s); range(2) covers today and yesterday.
    today = datetime.now()
    dates = [today - timedelta(days=i) for i in range(2)]  # Adjust the number of days as needed

    # Names below stay at module level so they remain available after this
    # cell has run (e.g. for inspection in later notebook cells).
    all_news_data = scrape_news(categories, dates)

    # Create a Pandas DataFrame from the collected data
    df = pd.DataFrame(all_news_data)

    # Save the DataFrame to a CSV file in your Google Drive
    df.to_csv(OUTPUT_CSV, index=False)
    print("CSV file saved to Google Drive!")


