<a href="https://colab.research.google.com/github/DikshantBadawadagi/100xEngineers/blob/main/InShorts-WebScrape-News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install beautifulsoup4 Flask Jinja2 lxml MarkupSafe requests urllib3 Werkzeug bs4 gunicorn flask-cors pytz


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting gunicorn
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting flask-cors
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading gunicorn-23.0.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Flask_Cors-5.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: gunicorn, bs4, flask-cors
Successfully installed bs4-0.0.2 flask-cors-5.0.0 gunicorn-23.0.0


In [None]:
import datetime
import uuid
import requests
import pytz
import csv
import pandas as pd

# HTTP request headers sent with every API call made by fetch_news_page().
# The values describe a desktop Chromium-based browser (Brave 115 on macOS)
# issuing a same-origin request from the inshorts.com reader page.
# NOTE(review): presumably the API expects browser-like headers — confirm
# which of these are actually required before trimming the list.
headers = {
    'authority': 'inshorts.com',
    'accept': '*/*',
    'accept-language': 'en-GB,en;q=0.5',
    'content-type': 'application/json',
    'referer': 'https://inshorts.com/en/read',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Brave";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}

def fetch_news_page(category, page=1, max_limit=100, timeout=10):
    """Fetch one page of news entries for ``category`` from the InShorts API.

    Args:
        category: Topic slug; used both in the URL path and as the
            ``category`` query parameter.
        page: Page number to request.
        max_limit: Value sent as the ``max_limit`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``data.news_list`` value of the JSON response, or an empty list
        when the request fails, the status code is not 200, or the payload
        does not have the expected shape. Failures are reported with
        ``print`` rather than raised, so callers can keep paging.
    """
    params = {
        'category': category,
        'max_limit': str(max_limit),
        'include_card_data': 'true',
        'page': str(page)
    }
    url = f'https://inshorts.com/api/en/search/trending_topics/{category}'

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors, DNS failures and timeouts all land here.
        print(f"Error fetching page {page}: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching page {page}: {response.status_code}")
        return []

    try:
        return response.json()['data']['news_list']
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not valid JSON; KeyError/TypeError: JSON is
        # valid but lacks the data -> news_list structure.
        print(f"Error parsing JSON: {e}")
        return []

def get_large_dataset(category, total_count=100000, max_limit=100, max_pages=100):
    """Collect news entries page by page until enough have been gathered.

    Pages ``1..max_pages`` are requested in order through
    ``fetch_news_page``. Paging stops as soon as ``total_count`` entries have
    been collected or ``max_pages`` pages have been requested, whichever
    comes first.

    Args:
        category: Topic slug forwarded to ``fetch_news_page``.
        total_count: Maximum number of entries to return.
        max_limit: Per-page limit forwarded to ``fetch_news_page``.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A list with at most ``total_count`` entries, in the order fetched.
    """
    news_list = []
    # A bounded ``for`` guarantees termination: an empty page moves on to the
    # next page number instead of re-requesting the same page forever.
    for page in range(1, max_pages + 1):
        if len(news_list) >= total_count:
            break
        print(f"Fetching page {page}... (Collected: {len(news_list)} articles)")
        news_data = fetch_news_page(category, page=page, max_limit=max_limit)
        if not news_data:
            print(f"No data on page {page}, skipping...")
            continue  # Skip to the next page if no data is returned
        news_list.extend(news_data)

    return news_list[:total_count]  # Return only the requested number of news items

def process_news_data(news_data, category='business'):
    """Flatten raw API entries into simple article dictionaries.

    Args:
        news_data: Iterable of raw entries; each is expected to hold the
            article under the ``'news_obj'`` key.
        category: Label stored under the ``'category'`` key of the result.
            Defaults to ``'business'``.

    Returns:
        A dict ``{'success': True, 'category': category, 'data': [...]}``
        where each item of ``data`` has the keys ``id``, ``title``,
        ``imageUrl``, ``url``, ``content``, ``author``, ``date``, ``time``
        and ``readMoreUrl``. ``date`` and ``time`` are formatted in the
        Asia/Kolkata time zone. Entries that cannot be processed are reported
        with ``print`` and skipped.
    """
    # Built once: the target zone is the same for every entry.
    tz_ist = pytz.timezone('Asia/Kolkata')

    newsDictionary = {
        'success': True,
        'category': category,
        'data': []
    }

    for entry in news_data:
        try:
            news = entry['news_obj']
            # ``created_at`` is treated as epoch milliseconds.
            timestamp = news['created_at'] / 1000
            # Timezone-aware replacement for the deprecated utcfromtimestamp().
            dt_utc = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
            dt_ist = dt_utc.astimezone(tz_ist)

            newsObject = {
                'id': uuid.uuid4().hex,
                'title': news['title'],
                'imageUrl': news['image_url'],
                'url': news['shortened_url'],
                'content': news['content'],
                'author': news['author_name'],
                'date': dt_ist.strftime('%A, %d %B, %Y'),
                'time': dt_ist.strftime('%I:%M %p').lower(),
                'readMoreUrl': news['source_url']
            }
        except (KeyError, TypeError, ValueError, OverflowError, OSError) as e:
            # Missing/ill-typed fields or an out-of-range timestamp: skip this
            # entry but keep processing the rest.
            print(f"Error processing entry: {e}")
            continue
        newsDictionary['data'].append(newsObject)
    return newsDictionary

def filter_news_by_title(news_data, query):
    """Return the articles whose title contains ``query``, ignoring case.

    Args:
        news_data: Iterable of article dicts (as produced by
            ``process_news_data``).
        query: Substring to look for in each article's ``'title'``.

    Returns:
        A list of the matching article dicts, in their original order.
        Articles whose title is missing or ``None`` never match a non-empty
        query instead of raising.
    """
    needle = query.lower()  # lower-case the query once, not once per article
    return [
        news for news in news_data
        if needle in (news.get('title') or '').lower()
    ]

def save_to_csv(data, filename="filtered_news_data.csv"):
    """Write a list of row dictionaries to ``filename`` as UTF-8 CSV.

    The column order is taken from the keys of the first row.

    Args:
        data: List of dicts sharing the same keys. May be empty.
        filename: Destination path; an existing file is overwritten.

    When ``data`` is empty there are no known columns, so the file is left
    empty (still truncated, so stale rows from an earlier run cannot be
    mistaken for the current result) instead of raising ``IndexError``.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        if not data:
            return
        writer = csv.DictWriter(file, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)

def convert_to_dataframe(filtered_data):
    """Build a pandas DataFrame with one row per filtered news record."""
    return pd.DataFrame(filtered_data)

# Example usage:
category = 'business'  # Change to any desired category
news_data = get_large_dataset(category, total_count=100000, max_pages=100)  # Fetch news up to 100 pages
processed_data = process_news_data(news_data)

# Ask the user for a title to search
user_query = input("Enter the title (e.g., 'Gautam Adani') to search for in the news: ")

# Filter the news based on the user query
filtered_news = filter_news_by_title(processed_data['data'], user_query)

# Convert filtered news to DataFrame
df_filtered_news = convert_to_dataframe(filtered_news)

# Display the dataframe (optional, for debugging)
print(df_filtered_news.head())

# Save to CSV (optional). Only write and report success when something
# matched; an empty result has no columns to write.
if filtered_news:
    # Name the file after the category actually fetched.
    save_to_csv(filtered_news, filename=f"filtered_{category}_news.csv")
    print("Filtered news data saved to CSV successfully!")
else:
    print(f"No news titles matched '{user_query}'; nothing was saved.")


Fetching page 1... (Collected: 0 articles)
Fetching page 2... (Collected: 10 articles)
Fetching page 3... (Collected: 20 articles)
Fetching page 4... (Collected: 30 articles)
Fetching page 5... (Collected: 40 articles)
Fetching page 6... (Collected: 50 articles)
Fetching page 7... (Collected: 60 articles)
Fetching page 8... (Collected: 70 articles)
Fetching page 9... (Collected: 80 articles)
Fetching page 10... (Collected: 90 articles)
Fetching page 11... (Collected: 100 articles)
Fetching page 12... (Collected: 110 articles)
Fetching page 13... (Collected: 120 articles)
Fetching page 14... (Collected: 130 articles)
Fetching page 15... (Collected: 140 articles)
Fetching page 16... (Collected: 149 articles)
Fetching page 17... (Collected: 159 articles)
Fetching page 18... (Collected: 169 articles)
Fetching page 19... (Collected: 179 articles)
Fetching page 20... (Collected: 189 articles)
Fetching page 21... (Collected: 198 articles)
Fetching page 22... (Collected: 208 articles)
Fetching