# NEWS Collection through web scraping

## Collecting NEWS

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse

def is_valid_url(url):
    """Return True when *url* is an absolute HTTP(S) URL that can be fetched.

    Args:
        url: Candidate link, typically the ``href`` of an anchor tag. May be
            ``None`` when the anchor has no ``href`` attribute.

    Returns:
        bool: True only if *url* is a string with an ``http``/``https``
        scheme and a network location; False otherwise.
    """
    # Anchors without an href yield None; anything non-string is not a link.
    if not isinstance(url, str):
        return False
    try:
        result = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unclosed "[").
        return False
    # Other schemes (ftp:, javascript:, ...) would make requests.get() raise
    # rather than return a response, so they are rejected up front.
    return result.scheme in ("http", "https") and bool(result.netloc)

# Monthly NDTV archive index page whose anchors are scraped below.
url = "https://archives.ndtv.com/articles/2020-11.html"

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang the whole run.
request_timeout = 30

response = requests.get(url, timeout=request_timeout)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    news_links = soup.find_all('a')

    # Skip the first `start_index` anchors found on the archive page
    start_index = 50
    news_links = news_links[start_index:]

    # Upper bound on how many of the remaining anchors are processed
    num_outputs_to_display = 8000

    # CSV file setup (opened in 'w' mode, so any previous file is replaced)
    csv_file_path = "newss_data.csv"
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['Link Number', 'News Link', 'Description', 'Headline', 'Author', 'Date', 'Paragraph']
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        csv_writer.writeheader()

        # `i` is the anchor's 1-based position on the archive page.
        for i, link in enumerate(news_links[:num_outputs_to_display], start=start_index + 1):
            news_link = link.get('href')
            description = link.text.strip()

            # Check if the link is valid
            if not is_valid_url(news_link):
                print(f"Invalid URL. Skipping link {i}\n")
                continue

            # Print the information
            print(f"Processing Link {i}: {news_link}\nDescription: {description}\n")

            # Retrieve details from the linked page. A network failure on one
            # article is reported and skipped instead of aborting the crawl.
            try:
                linked_page_response = requests.get(news_link, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve linked page. Error: {exc}\n")
                continue

            if linked_page_response.status_code != 200:
                print(f"Failed to retrieve linked page. Status Code: {linked_page_response.status_code}\n")
                continue

            linked_page_soup = BeautifulSoup(linked_page_response.content, 'html.parser')

            # Extract details
            headline_element = linked_page_soup.find('h1', itemprop='headline')
            author_element = linked_page_soup.select_one('div.pst-by_ul span.pst-by_li span[itemprop="author"]')
            date_element = linked_page_soup.select_one('div.pst-by_ul span.pst-by_li:not([itemprop])')

            paragraphs = [p.text.strip() for p in linked_page_soup.find_all('p')]

            # Fall back to "N/A" when an element is missing from the page
            headline = headline_element.text.strip() if headline_element else "N/A"
            author = author_element.text.strip() if author_element else "N/A"
            date = date_element.text.strip() if date_element else "N/A"

            # Print details
            print(f"Headline: {headline}\nAuthor: {author}\nDate: {date}\nParagraphs:\n{paragraphs}\n")

            # Write to CSV file. `paragraphs` is a list, so the csv module
            # stores its str() representation in the 'Paragraph' column.
            csv_writer.writerow({
                'Link Number': i,
                'News Link': news_link,
                'Description': description,
                'Headline': headline,
                'Author': author,
                'Date': date,
                'Paragraph': paragraphs
            })

    print(f"\nData has been saved in the CSV file: {csv_file_path}")
else:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")


### Preprocessing

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
from langdetect import detect

def is_valid_url(url):
    """Return True when *url* is an absolute HTTP(S) URL that can be fetched.

    Args:
        url: Candidate link, typically the ``href`` of an anchor tag. May be
            ``None`` when the anchor has no ``href`` attribute.

    Returns:
        bool: True only if *url* is a string with an ``http``/``https``
        scheme and a network location; False otherwise.
    """
    # Anchors without an href yield None; anything non-string is not a link.
    if not isinstance(url, str):
        return False
    try:
        result = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unclosed "[").
        return False
    # Other schemes (ftp:, javascript:, ...) would make requests.get() raise
    # rather than return a response, so they are rejected up front.
    return result.scheme in ("http", "https") and bool(result.netloc)

def is_non_empty_paragraph(p):
    """Return the whitespace-stripped text of *p*, or None when it is blank.

    Despite the ``is_`` prefix the result is the text itself (truthy) or
    ``None`` (falsy), not a bool.

    Args:
        p: Object exposing a ``text`` attribute (e.g. a parsed ``<p>`` tag).
    """
    stripped = p.text.strip()
    return stripped or None

def extract_author(author_element):
    """Return the author name carried by an anchor tag, or "N/A".

    The name is taken from *author_element* itself when it is an ``<a>`` tag,
    otherwise from the first ``<a>`` found inside it. Elements with no anchor
    yield ``"N/A"``.

    Args:
        author_element: Parsed tag exposing ``name``, ``text`` and ``find``.

    Returns:
        str: Stripped anchor text, or ``"N/A"`` when no anchor is present.
    """
    if author_element.name == 'a':
        anchor = author_element
    else:
        # Look for an anchor nested inside the element (e.g. within a span)
        anchor = author_element.find('a')
        if not anchor:
            return "N/A"
    return anchor.text.strip()

# Monthly NDTV archive index page whose anchors are scraped below.
url = "https://archives.ndtv.com/articles/2021-04.html"

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang the whole run.
request_timeout = 30

response = requests.get(url, timeout=request_timeout)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    news_links = soup.find_all('a')

    # Skip the first `start_index` anchors found on the archive page
    start_index = 2000
    news_links = news_links[start_index:]

    # Upper bound on how many of the remaining anchors are processed
    num_outputs_to_display = 4000

    # CSV file setup for Hindi and English news
    csv_file_path_all = "news_data21_alls.csv"
    fieldnames = ['Link Number', 'News Link', 'Language', 'Description', 'Headline', 'Author', 'Date', 'Paragraph']

    # Append mode so repeated runs accumulate rows in the same file
    with open(csv_file_path_all, 'a', newline='', encoding='utf-8') as csv_file_all:
        csv_writer_all = csv.DictWriter(csv_file_all, fieldnames=fieldnames)

        # If the file is empty, write the header
        if csv_file_all.tell() == 0:
            csv_writer_all.writeheader()

        # `i` is the anchor's 1-based position on the archive page.
        for i, link in enumerate(news_links[:num_outputs_to_display], start=start_index + 1):
            news_link = link.get('href')
            description = link.text.strip()

            # Check if the link is valid
            if not is_valid_url(news_link):
                print(f"Invalid URL. Skipping link {i}\n")
                continue

            # Print the information
            print(f"Processing Link {i}: {news_link}\nDescription: {description}\n")

            # Retrieve details from the linked page. A network failure on one
            # article is reported and skipped instead of aborting the crawl.
            try:
                linked_page_response = requests.get(news_link, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve linked page. Error: {exc}\n")
                continue

            if linked_page_response.status_code != 200:
                print(f"Failed to retrieve linked page. Status Code: {linked_page_response.status_code}\n")
                continue

            linked_page_soup = BeautifulSoup(linked_page_response.content, 'html.parser')

            # Extract details
            headline_element = linked_page_soup.find('h1', itemprop='headline')
            author_elements = linked_page_soup.find_all('span', itemprop='author')
            # NOTE(review): ':contains' appears to be the legacy spelling of
            # soupsieve's ':-soup-contains' — confirm the installed version
            # still accepts it.
            published_date = linked_page_soup.select_one('.pst-by_li:contains("Updated:")')
            date = published_date.text.replace("Updated:", "").strip() if published_date else "N/A"

            # Exclude paragraphs within the specified div with class "ft-social"
            excluded_div = linked_page_soup.find('div', class_='ft-social')
            excluded_paragraphs = set(excluded_div.find_all('p')) if excluded_div else set()

            # Stripped text of every remaining <p>; blank ones become None
            description_paragraphs = [
                is_non_empty_paragraph(p)
                for p in linked_page_soup.find_all('p')
                if p not in excluded_paragraphs
            ]

            if not any(description_paragraphs):
                print(f"Skipping Link {i} due to empty description paragraphs.\n")
                continue

            headline = headline_element.text.strip() if headline_element else "N/A"
            authors = [extract_author(author) for author in author_elements]
            author = ', '.join(authors) if authors else "N/A"

            # Use langdetect to identify the language. `Exception` (not a bare
            # except) so Ctrl-C still interrupts a long crawl.
            try:
                language = detect(linked_page_soup.get_text())
            except Exception:
                language = "unknown"

            # Include only English and Hindi content
            if language not in ['en', 'hi']:
                print(f"Skipping Link {i} due to unsupported language: {language}\n")
                continue

            # Human-readable value for the Language column
            language_text = 'English' if language == 'en' else 'Hindi'

            # Print details
            print(f"Headline: {headline}\nAuthor: {author}\nDate: {date}\nDescription Paragraphs:\n{description_paragraphs}\nLanguage: {language_text}\n")

            # Write to CSV file for Hindi and English news.
            # `description_paragraphs` is a list, so the csv module stores
            # its str() representation in the 'Paragraph' column.
            csv_writer_all.writerow({
                'Link Number': i,
                'Language': language_text,
                'News Link': news_link,
                'Description': description,
                'Headline': headline,
                'Author': author,
                'Date': date,
                'Paragraph': description_paragraphs
            })

    print(f"\nData has been appended to the CSV file: {csv_file_path_all}")
else:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")


### Remove Ads

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
from langdetect import detect

def is_valid_url(url):
    """Return True when *url* is an absolute HTTP(S) URL that can be fetched.

    Args:
        url: Candidate link, typically the ``href`` of an anchor tag. May be
            ``None`` when the anchor has no ``href`` attribute.

    Returns:
        bool: True only if *url* is a string with an ``http``/``https``
        scheme and a network location; False otherwise.
    """
    # Anchors without an href yield None; anything non-string is not a link.
    if not isinstance(url, str):
        return False
    try:
        result = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unclosed "[").
        return False
    # Other schemes (ftp:, javascript:, ...) would make requests.get() raise
    # rather than return a response, so they are rejected up front.
    return result.scheme in ("http", "https") and bool(result.netloc)

def is_non_empty_paragraph(p):
    """Give back *p*'s text with surrounding whitespace removed, or None.

    The return value is the stripped string when it has any characters left
    and ``None`` otherwise, so callers can filter on truthiness.

    Args:
        p: Object exposing a ``text`` attribute (e.g. a parsed ``<p>`` tag).
    """
    content = p.text.strip()
    if not content:
        return None
    return content

def extract_author(author_element):
    """Return the author name carried by an anchor tag, or "N/A".

    The name is taken from *author_element* itself when it is an ``<a>`` tag,
    otherwise from the first ``<a>`` found inside it. Elements with no anchor
    yield ``"N/A"``.

    Args:
        author_element: Parsed tag exposing ``name``, ``text`` and ``find``.

    Returns:
        str: Stripped anchor text, or ``"N/A"`` when no anchor is present.
    """
    if author_element.name == 'a':
        anchor = author_element
    else:
        # Look for an anchor nested inside the element (e.g. within a span)
        anchor = author_element.find('a')
        if not anchor:
            return "N/A"
    return anchor.text.strip()

# Monthly NDTV archive index page whose anchors are scraped below.
url = "https://archives.ndtv.com/articles/2021-04.html"

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang the whole run.
request_timeout = 30

response = requests.get(url, timeout=request_timeout)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    news_links = soup.find_all('a')

    # Skip the first `start_index` anchors found on the archive page
    start_index = 2000
    news_links = news_links[start_index:]

    # Upper bound on how many of the remaining anchors are processed
    num_outputs_to_display = 4000

    # CSV file setup for Hindi and English news
    csv_file_path_all = "news_data21_alls.csv"
    fieldnames = ['Link Number', 'News Link', 'Language', 'Description', 'Headline', 'Author', 'Date', 'Paragraph']

    # Append mode so repeated runs accumulate rows in the same file
    with open(csv_file_path_all, 'a', newline='', encoding='utf-8') as csv_file_all:
        csv_writer_all = csv.DictWriter(csv_file_all, fieldnames=fieldnames)

        # If the file is empty, write the header
        if csv_file_all.tell() == 0:
            csv_writer_all.writeheader()

        # `i` is the anchor's 1-based position on the archive page.
        for i, link in enumerate(news_links[:num_outputs_to_display], start=start_index + 1):
            news_link = link.get('href')
            description = link.text.strip()

            # Check if the link is valid
            if not is_valid_url(news_link):
                print(f"Invalid URL. Skipping link {i}\n")
                continue

            # Print the information
            print(f"Processing Link {i}: {news_link}\nDescription: {description}\n")

            # Retrieve details from the linked page. A network failure on one
            # article is reported and skipped instead of aborting the crawl.
            try:
                linked_page_response = requests.get(news_link, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve linked page. Error: {exc}\n")
                continue

            if linked_page_response.status_code != 200:
                print(f"Failed to retrieve linked page. Status Code: {linked_page_response.status_code}\n")
                continue

            linked_page_soup = BeautifulSoup(linked_page_response.content, 'html.parser')

            # Extract details
            headline_element = linked_page_soup.find('h1', itemprop='headline')
            author_elements = linked_page_soup.find_all('span', itemprop='author')
            # NOTE(review): ':contains' appears to be the legacy spelling of
            # soupsieve's ':-soup-contains' — confirm the installed version
            # still accepts it.
            published_date = linked_page_soup.select_one('.pst-by_li:contains("Updated:")')
            date = published_date.text.replace("Updated:", "").strip() if published_date else "N/A"

            # Exclude paragraphs within the specified div with class "ft-social"
            excluded_div = linked_page_soup.find('div', class_='ft-social')
            excluded_paragraphs = set(excluded_div.find_all('p')) if excluded_div else set()

            # Stripped text of every remaining <p>; blank ones become None.
            # Any paragraph containing the word "Advertisement" is dropped.
            description_paragraphs = [
                is_non_empty_paragraph(p)
                for p in linked_page_soup.find_all('p')
                if p not in excluded_paragraphs
                and "Advertisement" not in p.get_text(strip=True)  # Exclude Advertisement paragraphs
            ]

            if not any(description_paragraphs):
                print(f"Skipping Link {i} due to empty description paragraphs.\n")
                continue

            headline = headline_element.text.strip() if headline_element else "N/A"
            authors = [extract_author(author) for author in author_elements]
            author = ', '.join(authors) if authors else "N/A"

            # Use langdetect to identify the language. `Exception` (not a bare
            # except) so Ctrl-C still interrupts a long crawl.
            try:
                language = detect(linked_page_soup.get_text())
            except Exception:
                language = "unknown"

            # Include only English and Hindi content
            if language not in ['en', 'hi']:
                print(f"Skipping Link {i} due to unsupported language: {language}\n")
                continue

            # Human-readable value for the Language column
            language_text = 'English' if language == 'en' else 'Hindi'

            # Concatenate the description paragraphs into a single string,
            # dropping the None placeholders left by blank paragraphs
            description_text = ' '.join(filter(None, description_paragraphs))

            # Print details
            print(f"Headline: {headline}\nAuthor: {author}\nDate: {date}\nDescription Paragraphs:\n{description_text}\nLanguage: {language_text}\n")

            # Write to CSV file for Hindi and English news
            csv_writer_all.writerow({
                'Link Number': i,
                'Language': language_text,
                'News Link': news_link,
                'Description': description,
                'Headline': headline,
                'Author': author,
                'Date': date,
                'Paragraph': description_text
            })

    print(f"\nData has been appended to the CSV file: {csv_file_path_all}")
else:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")


### Collecting news Through Selenium Driver

In [None]:

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import re
import csv
from dateutil import parser

# Set up the Selenium webdriver (Chrome with default options)
driver = webdriver.Chrome()

# Everything runs inside try/finally so the browser is always shut down,
# even when a lookup or page load raises part-way through the crawl.
try:
    # Entry page of the site being scraped
    website_url = 'https://www.pib.gov.in/indexd.aspx'
    driver.get(website_url)

    # Select Hindi language
    language_dropdown = Select(driver.find_element(By.ID, 'Bar1_ddlLang'))
    language_dropdown.select_by_value('2')

    # Fixed pauses give the page time to reload after each interaction
    time.sleep(30)
    press_releases_link = driver.find_element(By.CSS_SELECTOR, 'ul li a[href="/PMContents/PMContents.aspx?menuid=1&Lang=2&RegionId=3"]')
    press_releases_link.click()

    time.sleep(4)

    # Click on the first article link
    first_article_link = driver.find_element(By.CSS_SELECTOR, 'ul.num li a')
    first_article_link.click()

    time.sleep(8)

    # Select the desired day
    # NOTE(review): '0' presumably means "all days" — confirm against the dropdown
    day_dropdown = Select(driver.find_element(By.ID, 'ContentPlaceHolder1_ddlday'))
    day_dropdown.select_by_value('0')

    # List of ministry values
    ministry_arr = [15, 1340, 40, 31, 53, 5, 47, 11, 1336, 21]

    # Marker ("* * *", any spacing) that ends a section of release text
    marker_pattern = re.compile(r'\*\s*\*\s*\*')

    # Open the CSV in append mode so repeated runs accumulate rows
    csv_file_path = 'news/hindii_2023.csv'
    with open(csv_file_path, 'a', newline='', encoding='utf-8-sig') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Only write the header into an empty file; otherwise every re-run
        # would insert another header row in the middle of the data.
        if csv_file.tell() == 0:
            csv_writer.writerow(['Link','Title','Language','Date', 'Content','Classifier'])

        # Iterate through each month
        for month_value in range(1, 13):
            month_dropdown = Select(driver.find_element(By.ID, 'ContentPlaceHolder1_ddlMonth'))
            month_dropdown.select_by_value(str(month_value))
            # Re-locate the year dropdown after interacting with day and month dropdowns
            year_dropdown = Select(driver.find_element(By.ID, 'ContentPlaceHolder1_ddlYear'))
            year_dropdown.select_by_value('2023')  # Year to scrape

            # Wait for the page to update with the selected month's news
            time.sleep(4)  # Adjust the sleep duration based on your website's loading time

            # Iterate through each ministry
            for ministry_value in ministry_arr:
                ministry_dropdown = Select(driver.find_element(By.ID, 'ContentPlaceHolder1_ddlMinistry'))
                ministry_dropdown.select_by_value(str(ministry_value))

                # Wait for the page to update with the selected ministry's news
                time.sleep(4)  # Adjust the sleep duration based on your website's loading time

                # Scraping news articles; a failed lookup means "no links".
                # `Exception` (not a bare except) so Ctrl-C still interrupts.
                try:
                    link_elements = driver.find_elements(By.CSS_SELECTOR, 'ul.leftul li a.listLeftrel2')
                except Exception:
                    link_elements = []

                # Iterate through each link
                for link_element in link_elements:
                    title = link_element.text
                    onclick_attr = link_element.get_attribute('onclick')
                    if not onclick_attr:
                        # No onclick handler means there is no release id to follow
                        continue
                    # The release id is the argument inside the last "(...)"
                    onclick_value = onclick_attr.split('(')[-1].split(')')[0]

                    # Build the iframe URL
                    iframe_url = f'https://www.pib.gov.in/PressReleasePage.aspx?PRID={onclick_value}'
                    # Open a new tab/window and switch to it
                    driver.execute_script("window.open('', '_blank');")
                    driver.switch_to.window(driver.window_handles[1])
                    # Open the iframe URL in the new tab/window
                    driver.get(iframe_url)
                    time.sleep(8)
                    # The release body is stored as HTML in this input's value
                    input_tag = driver.find_element(By.ID, 'ltrDescriptionn')
                    value_text = input_tag.get_attribute('value')
                    # Parse the HTML content using BeautifulSoup
                    soup = BeautifulSoup(value_text, 'html.parser')
                    # Paragraphs whose own string contains the "***" marker
                    target_paragraphs = soup.find_all('p', string=lambda x: x and marker_pattern.search(x))
                    date_element = driver.find_element(By.CSS_SELECTOR, 'div.ReleaseDateSubHeaddateTime')
                    # Extract the text content from the element
                    publish_date_text = date_element.text

                    # Use dateutil.parser to parse the date. On failure the
                    # row gets "N/A" rather than an unbound name or the
                    # previous article's date.
                    try:
                        parsed_date = parser.parse(publish_date_text, fuzzy=True)
                        formatted_date = parsed_date.strftime('%d-%m-%Y')
                    except (ValueError, OverflowError):
                        formatted_date = "N/A"
                        print("N/A")

                    # Check if there are any target paragraphs
                    if target_paragraphs:
                        # For each marker, walk backwards over the preceding
                        # <p> tags until the previous marker (or the start).
                        # Sections are joined in document order.
                        sections = []
                        for target_paragraph in target_paragraphs:
                            section_parts = []
                            current_paragraph = target_paragraph.find_previous('p')

                            while current_paragraph and not marker_pattern.search(current_paragraph.text):
                                section_parts.append(current_paragraph.text.strip())
                                current_paragraph = current_paragraph.find_previous('p')

                            # Collected last-to-first; restore reading order
                            section_parts.reverse()
                            sections.append(" ".join(section_parts))

                        concatenated_text = " ".join(sections)
                        csv_writer.writerow([iframe_url,title,"Hindi",formatted_date, concatenated_text.strip(),"Government"])

                    # Close the current tab/window and switch back to the main window
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
finally:
    # Close the browser
    driver.quit()