In [None]:

import csv
import re
from random import randint
from time import sleep
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Request headers passed to the requests.get() calls that fetch listing and
# article pages. The User-Agent value makes each request look like it comes
# from a desktop web browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
import requests
# Site root that links found on listing pages are resolved against
_SITE_ROOT = "https://ujyaaloonline.com"
# Captures the category slug of a listing URL (everything after /category/ up to '?')
_CATEGORY_PATTERN = re.compile(r'/category/([^?]+)')
# Matches the href of links that point at an author page
_AUTHOR_HREF_PATTERN = re.compile(r'/author/')


def _parse_listing_entry(container, idx):
    """Read the listing-level fields (title, link, description, author, date) of one article container.

    Args:
        container: BeautifulSoup tag of a single article container.
        idx: Position of the container on the page, used only in messages.

    Returns:
        A dict with the keys ``Title``, ``URL``, ``Description``,
        ``Author_url``, ``Author_name`` and ``Date``, or ``None`` when the
        container has no usable title link. Optional fields that are missing
        are reported with a message and stored as empty strings.
    """
    # The heading itself may be absent, so only look for the link when it exists
    heading = container.find('h3', class_='mb-15 fw-6 fz-20')
    title_tag = heading.find('a') if heading is not None else None
    title_href = title_tag.get('href') if title_tag is not None else None
    if not title_href:
        print(f"Title not found in container {idx}")
        return None

    title = title_tag.text.strip()
    # urljoin copes with relative and absolute hrefs; plain concatenation
    # would produce a broken URL for anything but a "/path" href
    title_url = urljoin(_SITE_ROOT, title_href)

    # The description is taken from the alt text of the container's image
    description_tag = container.find('div', class_='grid-post')
    image_tag = description_tag.find('img') if description_tag is not None else None
    alt_text = image_tag.get('alt') if image_tag is not None else None
    if alt_text is None:
        print(f"Description not found in container {idx}")
        description = ""
    else:
        description = alt_text.strip()

    author_tag = container.find('a', href=_AUTHOR_HREF_PATTERN)
    if author_tag is not None:
        author_url = urljoin(_SITE_ROOT, author_tag['href'])
        author_name = author_tag.text.strip()
    else:
        print(f"Author information not found in container {idx}")
        author_url = ""
        author_name = ""

    date_tag = container.find('span', class_='date')
    if date_tag is not None:
        date = date_tag.text.strip()
    else:
        print(f"Date not found in container {idx}")
        date = ""

    return {
        'Title': title,
        'URL': title_url,
        'Description': description,
        'Author_url': author_url,
        'Author_name': author_name,
        'Date': date,
    }


def _fetch_article_content(article_url, timeout):
    """Download an article page and return the text of its justified paragraphs ("" on failure)."""
    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # One unreachable article must not abort the whole listing page
        print(f"Failed to retrieve the title page: {article_url} ({exc})")
        return ""
    if response.status_code != 200:
        print(f"Failed to retrieve the title page: {article_url}")
        return ""

    article_soup = BeautifulSoup(response.content, 'html.parser')
    # Only <p style="text-align: justify;"> elements are treated as body text
    paragraphs = article_soup.find_all('p', style="text-align: justify;")
    return "\n".join(p.text.strip() for p in paragraphs)


# Function to extract data from a single page
def extract_data_from_page(url, timeout=30):
    """Scrape one category listing page and every article it links to.

    Args:
        url: URL of the listing page. The ``Category`` field is taken from
            the part after ``/category/`` (up to any ``?``); it is an empty
            string when the URL has no such part.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts, one per article container that has a title link,
        with the keys ``Title``, ``URL``, ``Description``, ``Author_url``,
        ``Author_name``, ``Date``, ``Category`` and ``Content``. Fields that
        cannot be found are empty strings. An empty list is returned when the
        listing page cannot be retrieved or contains no article containers.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Treat network errors like a bad status code: report and skip the page
        print(f"Failed to retrieve the page: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {response.status_code}")
        return []

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the category from the URL without failing on unexpected URLs
    category_match = _CATEGORY_PATTERN.search(url)
    category = category_match.group(1) if category_match else ""

    # Find all article containers
    article_containers = soup.find_all('div', class_='col-md-4 padding-10')
    if not article_containers:
        print("No article containers found")
        return []

    articles = []
    for idx, container in enumerate(article_containers):
        article_info = _parse_listing_entry(container, idx)
        if article_info is None:
            continue

        article_info['Category'] = category
        # The full text lives on the article's own page, so fetch it separately
        article_info['Content'] = _fetch_article_content(article_info['URL'], timeout)
        articles.append(article_info)

    return articles

# Crawl the politics listing pages and save every scraped article to a CSV file.

# Generate a list of page URLs to scrape (start_page through end_page, inclusive)
base_url = "https://ujyaaloonline.com/category/politics?page="
start_page = 0
end_page = 10

page_urls = [base_url + str(page) for page in range(start_page, end_page + 1)]

# Initialize a list to store all articles from all pages
all_articles = []

# Loop through each page URL and extract the data
for page_url in page_urls:
    # Pause a random 2-10 seconds before every listing request. A single
    # pause before the loop would not space the page requests out at all.
    sleep(randint(2, 10))
    articles = extract_data_from_page(page_url)
    all_articles.extend(articles)

# Define the CSV file name
csv_file = 'extracted_articles.csv'

# Define the CSV headers; this list fixes the column order of the output file
csv_headers = ['URL','Title',  'Description', 'Content','Category', 'Date','Author_url', 'Author_name'  ]

# Write the extracted information to the CSV file.
# newline='' lets the csv module control line endings itself.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_headers)
    writer.writeheader()
    writer.writerows(all_articles)

print(f"Data successfully saved to {csv_file}")
