In [2]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
from langdetect import detect

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def is_non_empty_paragraph(p):
    return p.text.strip() if p.text.strip() else None

def extract_author(author_element):
    if author_element.name == 'a':
        return author_element.text.strip()
    elif author_element.find('a'):
        # Extract text from anchor tag inside the span
        return author_element.find('a').text.strip()
    else:
        return "N/A"

url = "https://archives.ndtv.com/articles/2021-04.html"

response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    news_links = soup.find_all('a')
    # Skip the first 40 links
    start_index = 2000
    news_links = news_links[start_index:]

    # Limit the number of outputs
    num_outputs_to_display = 4000

    # Limit the number of paragraphs for description
    

    # CSV file setup for Hindi and English news
    csv_file_path_all = "news_data21_alls.csv"
    fieldnames = ['Link Number', 'News Link', 'Language', 'Description', 'Headline', 'Author', 'Date', 'Paragraph']

    with open(csv_file_path_all, 'a', newline='', encoding='utf-8') as csv_file_all:
        csv_writer_all = csv.DictWriter(csv_file_all, fieldnames=fieldnames)

        # If the file is empty, write the header
        if csv_file_all.tell() == 0:
            csv_writer_all.writeheader()

        for i, link in enumerate(news_links[:num_outputs_to_display], start=start_index + 1):
            news_link = link.get('href')
            description = link.text.strip()

            # Check if the link is valid
            if is_valid_url(news_link):
                # Print the information
                print(f"Processing Link {i}: {news_link}\nDescription: {description}\n")

                # Retrieve details from the linked page
                linked_page_response = requests.get(news_link)
                if linked_page_response.status_code == 200:
                    linked_page_soup = BeautifulSoup(linked_page_response.content, 'html.parser')

                    # Extract details
                    headline_element = linked_page_soup.find('h1', itemprop='headline')
                    author_elements = linked_page_soup.find_all('span', itemprop='author')
                    published_date = linked_page_soup.select_one('.pst-by_li:contains("Updated:")')
                    date = published_date.text.replace("Updated:", "").strip() if published_date else "N/A"

                    # Exclude paragraphs within the specified div with class "ft-social"
                    excluded_div = linked_page_soup.find('div', class_='ft-social')
                    excluded_paragraphs = set(excluded_div.find_all('p')) if excluded_div else set()

                    # Limit the number of paragraphs for description
                    description_paragraphs = [
                        is_non_empty_paragraph(p)
                        for p in linked_page_soup.find_all('p')
                        if p not in excluded_paragraphs
                    ]

                    if any(description_paragraphs):
                        headline = headline_element.text.strip() if headline_element else "N/A"
                        authors = [extract_author(author) for author in author_elements]
                        author = ', '.join(authors) if authors else "N/A"

                        # Use langdetect to identify the language
                        try:
                            language = detect(linked_page_soup.get_text())
                        except:
                            language = "unknown"

                        # Include only English and Hindi content
                        if language in ['en', 'hi']:
                            # Update the language column based on detected language
                            if language == 'en':
                                language_text = 'English'
                            else:
                                language_text = 'Hindi'

                            # Print details
                            print(f"Headline: {headline}\nAuthor: {author}\nDate: {date}\nDescription Paragraphs:\n{description_paragraphs}\nLanguage: {language_text}\n")

                            # Write to CSV file for Hindi and English news
                            csv_writer_all.writerow({
                                'Link Number': i,
                                'Language': language_text,
                                'News Link': news_link,
                                'Description': description,
                                'Headline': headline,
                                'Author': author,
                                'Date': date,
                                'Paragraph': description_paragraphs
                                
                            })
                        else:
                            print(f"Skipping Link {i} due to unsupported language: {language}\n")
                    else:
                        print(f"Skipping Link {i} due to empty description paragraphs.\n")

                else:
                    print(f"Failed to retrieve linked page. Status Code: {linked_page_response.status_code}\n")

            else:
                print(f"Invalid URL. Skipping link {i}\n")

    print(f"\nData has been appended to the CSV file: {csv_file_path_all}")
else:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")


Processing Link 2001: https://www.ndtv.com/india-news/isro-scientist-nambi-narayanans-illegal-arrest-case-supreme-court-to-hear-matter-next-week-2406499
Description: ISRO Scientist's Illegal Arrest Case: Top Court To Hear Matter Next Week

Headline: ISRO Scientist's Illegal Arrest Case: Top Court To Hear Matter Next Week
Author: N/A
Date: April 05, 2021 2:07 pm IST
Description Paragraphs:
["CBI said that then top police officials in Kerala were responsible for Nambi Narayanan's illegal arrest.", 'The Supreme Court Monday said it would hear next week the 1994 espionage case relating to ISRO scientist Nambi Narayanan in which a high-level probe panel appointed in 2018 has recently filed its report.', 'The top court had on September 14, 2018 appointed the three-member panel, headed by its former judge Justice (retd) D K Jain, while directing the Kerala government to cough up Rs 50 lakh compensation for compelling Mr Narayanan to undergo "immense humiliation".', 'It had ordered setting up 

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [5]:
import pandas as pd

# Replace 'your_input_file.csv' and 'your_output_file.csv' with the actual file names
input_file = 'news_data21_alls.csv'
output_file = 'news_data21_alls1.csv'

# Try reading the CSV file with 'utf-8' encoding and errors='replace'
try:
    df = pd.read_csv(input_file, encoding='utf-8', errors='replace')
except UnicodeDecodeError:
    # If 'utf-8' encoding fails, try 'latin-1' encoding
    df = pd.read_csv(input_file, encoding='latin-1', errors='replace')

# Add a new column 'Sr. No.' with sequential numbers
df.insert(0, 'Sr. No.', range(1, 1 + len(df)))

# Save the modified DataFrame back to a new CSV file with 'utf-8-sig' encoding
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Sequential 'Sr. No.' column added and saved to {output_file}")


TypeError: read_csv() got an unexpected keyword argument 'errors'

In [1]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://ndtv.in/india/gujarat-corona-update-covid-19-havoc-in-state-313-new-cases-on-thursday-2221361"

# Fetch the HTML content from the URL
response = requests.get(url)
html_content = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract date using regular expression
date_match = re.search(r'Updated: (\d+ \w+, \d{4} \d{2}:\d{2} [APMapm]{2})', str(soup))
date = date_match.group(1) if date_match else None

# Extract author using BeautifulSoup
author_tag = soup.find('span', itemprop='name')
author = author_tag.get_text(strip=True) if author_tag else None

# Print the results
print(f"Date: {date}")
print(f"Author: {author}")


Date: 1 मई, 2020 12:17 AM
Author: None


In [4]:
import pandas as pd

# Replace 'your_input_file.csv' and 'your_output_file.csv' with the actual file names
input_file = 'news_data21_alls.csv'
output_file = 'news_data21_alls1.csv'

# Read the CSV file into a pandas DataFrame with 'utf-8-sig' encoding
df = pd.read_csv(input_file, encoding='utf-8-sig')

# Add a new column 'Sr. No.' with sequential numbers
df.insert(0, 'Sr. No.', range(1, 1 + len(df)))

# Save the modified DataFrame back to a new CSV file
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Sequential 'Sr. No.' column added and saved to {output_file}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa4 in position 76856: invalid start byte

In [8]:
import pandas as pd

# Replace 'your_input_file.csv' and 'your_output_file.csv' with the actual file names
input_file = 'news_data21_alls.csv'
output_file = 'news_data21_alls1.csv'

# Try reading the CSV file with 'utf-8' encoding and errors='replace'
try:
    with open(input_file, 'r', encoding='utf-8') as file:
        df = pd.read_csv(file)
except UnicodeDecodeError:
    # If 'utf-8' encoding fails, try 'latin-1' encoding
    with open(input_file, 'r', encoding='latin-1') as file:
        df = pd.read_csv(file)

# Add a new column 'Sr. No.' with sequential numbers
df.insert(0, 'Sr. No.', range(1, 1 + len(df)))

# Save the modified DataFrame back to a new CSV file with 'utf-8-sig' encoding
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Sequential 'Sr. No.' column added and saved to {output_file}")


  df = pd.read_csv(file)


Sequential 'Sr. No.' column added and saved to news_data21_alls1.csv


In [2]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
from langdetect import detect

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def is_non_empty_paragraph(p):
    return p.text.strip() if p.text.strip() else None

def extract_author(author_element):
    if author_element.name == 'a':
        return author_element.text.strip()
    elif author_element.find('a'):
        # Extract text from anchor tag inside the span
        return author_element.find('a').text.strip()
    else:
        return "N/A"

url = "https://archives.ndtv.com/articles/2021-04.html"

response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    news_links = soup.find_all('a')

    # Skip the first 40 links
    start_index = 2000
    news_links = news_links[start_index:]

    # Limit the number of outputs
    num_outputs_to_display = 4000

    # CSV file setup for Hindi and English news
    csv_file_path_all = "news_data21_alls.csv"
    fieldnames = ['Link Number', 'News Link', 'Language', 'Description', 'Headline', 'Author', 'Date', 'Paragraph']

    with open(csv_file_path_all, 'a', newline='', encoding='utf-8') as csv_file_all:
        csv_writer_all = csv.DictWriter(csv_file_all, fieldnames=fieldnames)

        # If the file is empty, write the header
        if csv_file_all.tell() == 0:
            csv_writer_all.writeheader()

        for i, link in enumerate(news_links[:num_outputs_to_display], start=start_index + 1):
            news_link = link.get('href')
            description = link.text.strip()

            # Check if the link is valid
            if is_valid_url(news_link):
                # Print the information
                print(f"Processing Link {i}: {news_link}\nDescription: {description}\n")

                # Retrieve details from the linked page
                linked_page_response = requests.get(news_link)
                if linked_page_response.status_code == 200:
                    linked_page_soup = BeautifulSoup(linked_page_response.content, 'html.parser')

                    # Extract details
                    headline_element = linked_page_soup.find('h1', itemprop='headline')
                    author_elements = linked_page_soup.find_all('span', itemprop='author')
                    published_date = linked_page_soup.select_one('.pst-by_li:contains("Updated:")')
                    date = published_date.text.replace("Updated:", "").strip() if published_date else "N/A"

                    # Exclude paragraphs within the specified div with class "ft-social"
                    excluded_div = linked_page_soup.find('div', class_='ft-social')
                    excluded_paragraphs = set(excluded_div.find_all('p')) if excluded_div else set()

                    # Limit the number of paragraphs for description
                    description_paragraphs = [
                        is_non_empty_paragraph(p)
                        for p in linked_page_soup.find_all('p')
                        if p not in excluded_paragraphs
                        and "Advertisement" not in p.get_text(strip=True)  # Exclude Advertisement paragraphs
                    ]

                    if any(description_paragraphs):
                        headline = headline_element.text.strip() if headline_element else "N/A"
                        authors = [extract_author(author) for author in author_elements]
                        author = ', '.join(authors) if authors else "N/A"

                        # Use langdetect to identify the language
                        try:
                            language = detect(linked_page_soup.get_text())
                        except:
                            language = "unknown"

                        # Include only English and Hindi content
                        if language in ['en', 'hi']:
                            # Update the language column based on detected language
                            if language == 'en':
                                language_text = 'English'
                            else:
                                language_text = 'Hindi'

                            # Concatenate the description paragraphs into a single string
                            description_text = ' '.join(filter(None, description_paragraphs))

                            # Print details
                            print(f"Headline: {headline}\nAuthor: {author}\nDate: {date}\nDescription Paragraphs:\n{description_text}\nLanguage: {language_text}\n")

                            # Write to CSV file for Hindi and English news
                            csv_writer_all.writerow({
                                'Link Number': i,
                                'Language': language_text,
                                'News Link': news_link,
                                'Description': description,
                                'Headline': headline,
                                'Author': author,
                                'Date': date,
                                'Paragraph': description_text
                            })
                        else:
                            print(f"Skipping Link {i} due to unsupported language: {language}\n")
                    else:
                        print(f"Skipping Link {i} due to empty description paragraphs.\n")

                else:
                    print(f"Failed to retrieve linked page. Status Code: {linked_page_response.status_code}\n")

            else:
                print(f"Invalid URL. Skipping link {i}\n")

    print(f"\nData has been appended to the CSV file: {csv_file_path_all}")
else:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")


Processing Link 2001: https://www.ndtv.com/india-news/isro-scientist-nambi-narayanans-illegal-arrest-case-supreme-court-to-hear-matter-next-week-2406499
Description: ISRO Scientist's Illegal Arrest Case: Top Court To Hear Matter Next Week

Headline: ISRO Scientist's Illegal Arrest Case: Top Court To Hear Matter Next Week
Author: N/A
Date: April 05, 2021 2:07 pm IST
Description Paragraphs:
CBI said that then top police officials in Kerala were responsible for Nambi Narayanan's illegal arrest. The Supreme Court Monday said it would hear next week the 1994 espionage case relating to ISRO scientist Nambi Narayanan in which a high-level probe panel appointed in 2018 has recently filed its report. The top court had on September 14, 2018 appointed the three-member panel, headed by its former judge Justice (retd) D K Jain, while directing the Kerala government to cough up Rs 50 lakh compensation for compelling Mr Narayanan to undergo "immense humiliation". It had ordered setting up of the comm

KeyboardInterrupt: 