In [27]:
# %pip install beautifulsoup4   # the `bs4` module is published on PyPI as "beautifulsoup4"

In [28]:
import os
import json
import time
import random
import zipfile
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from three different Urdu news websites: Geo, Jang, and Express. The class has methods that cater to each site's unique structure and requirements. Below, we will go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
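- **Returns**: A single Pandas DataFrame with one row per article and the columns:
  - `id`, `title`, and `link`: the running article ID, the headline, and the article URL.
  - `content` and `gold_label`: the article text and its category label (`saqafat` is stored as `entertainment`, `science` as `science-technology`).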

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [29]:
class NewsScraper:
    """Scrape Urdu news articles from the Jang, Geo and Express websites.

    Every ``get_*_articles`` method returns a pandas DataFrame with the
    columns ``id``, ``title``, ``link``, ``content`` and ``gold_label``.
    ``self.id`` is a running counter that all three methods advance, so ids
    stay unique when several sites are scraped with the same instance.
    """

    # Upper bound (seconds) on any single HTTP request, so that one stalled
    # connection cannot hang the whole scrape.
    REQUEST_TIMEOUT = 30

    def __init__(self,id_=0):
        """Start numbering scraped articles at ``id_``."""
        self.id = id_

    @staticmethod
    def _new_records():
        """Return an empty column-name -> list mapping for one scrape run."""
        return {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "gold_label": [],
        }

    def _fetch_soup(self, url):
        """Download ``url`` and return its HTML parsed with ``html.parser``.

        Raises whatever ``requests`` raises for connection errors, timeouts
        and (through ``raise_for_status``) HTTP error responses.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _join_paragraphs(tags):
        """Join the non-empty text of ``tags`` with single spaces.

        Non-breaking spaces are turned into ordinary spaces and zero-width
        spaces are removed from each piece of text.
        """
        pieces = []
        for tag in tags:
            text = tag.get_text(strip=True)
            if text:
                pieces.append(text.replace('\xa0', ' ').replace('\u200b', ''))
        return " ".join(pieces)

    def _add_article(self, records, title, link, content, gold_label):
        """Append one article to ``records`` and advance the id counter."""
        records['id'].append(self.id)
        records['title'].append(title)
        records['link'].append(link)
        records['gold_label'].append(gold_label)
        records['content'].append(content)
        self.id += 1

    def get_jang_articles(self, max_pages=1):
        """Scrape the latest-news listings of jang.com.pk.

        Args:
            max_pages: Number of passes made over each category listing.

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label``. The ``health-science`` category
            is stored as ``science-technology``.

        Raises:
            Whatever ``requests`` raises when a category listing cannot be
            fetched. Failures on individual articles are printed and skipped.
        """
        jang_df = self._new_records()

        base_url = "https://jang.com.pk"
        categories = ['entertainment', 'business', 'sports', 'health-science', 'world']

        for category in categories:
            # The listing URL below carries no page number, so every pass
            # fetches the same listing. Remember the links already handled so
            # repeated passes (and nested <li> elements that resolve to the
            # same heading) cannot add duplicate rows.
            # NOTE(review): real pagination needs the site's page URL scheme.
            seen_links = set()
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}/category/latest-news/{category}"
                soup = self._fetch_soup(url)

                articles = soup.find_all('li')
                success_count = 0

                for article in articles:
                    try:
                        main_heading = article.find('div', class_='main-heading')
                        if main_heading is None:
                            continue
                        title_tag = main_heading.find('h2')
                        link_tag = main_heading.find('a', href=True)
                        link = link_tag['href'] if link_tag is not None else None
                        title = title_tag.text.strip() if title_tag is not None else "No title"
                        if not link or not title or link in seen_links:
                            continue
                        seen_links.add(link)
                        print(f"Found article {link} with title: {title}")

                        content_soup = self._fetch_soup(link)
                        content_div = content_soup.find('div', class_='detail_view_content')
                        if content_div is None:
                            # Skip only this article; returning here would
                            # throw away every row collected so far.
                            print("Main content not found")
                            continue

                        final_content = self._join_paragraphs(content_div.find_all('p'))
                        self._add_article(
                            jang_df,
                            title,
                            link,
                            final_content,
                            category.replace('health-science', 'science-technology'),
                        )
                        success_count += 1
                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
        return pd.DataFrame(jang_df)

    def get_geo_articles(self, max_pages=1):
        """Scrape the category listings of urdu.geo.tv.

        Args:
            max_pages: Number of passes made over each category listing.

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label``. An article whose page has no
            ``content-area`` div is kept with the content
            ``"Content not found."``.

        Raises:
            Whatever ``requests`` raises when a category listing cannot be
            fetched. Failures on individual articles are printed and skipped.
        """
        geo_df = self._new_records()

        base_url = "https://urdu.geo.tv"
        categories = ['entertainment', 'business', 'sports', 'science-technology', 'world']

        for category in categories:
            # The listing URL has no page component, so repeated passes see
            # the same articles; track links to keep each one only once.
            # NOTE(review): real pagination needs the site's page URL scheme.
            seen_links = set()
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}/category/{category}"
                soup = self._fetch_soup(url)

                articles = soup.find_all('li', class_='border-box')
                success_count = 0

                for article in articles:
                    try:
                        title_tag = article.find('h2')
                        link_tag = article.find('a', href=True)
                        link = link_tag['href'] if link_tag is not None else None
                        title = title_tag.text.strip() if title_tag is not None else "No title"
                        if link in seen_links:
                            continue
                        print(f"Article with link: {link} and title: {title}")
                        if not link:
                            continue
                        seen_links.add(link)

                        content_soup = self._fetch_soup(link)
                        content_div = content_soup.find('div', class_='content-area')
                        if content_div is not None:
                            # Remove nested containers before collecting text
                            # so only the remaining <p>/<b> tags contribute.
                            for unwanted in content_div.find_all(['div', 'figure', 'script', 'style', 'iframe', 'ul', 'ol'], recursive=True):
                                unwanted.decompose()
                            combined_text = self._join_paragraphs(content_div.find_all(['p', 'b']))
                        else:
                            combined_text = "Content not found."

                        self._add_article(geo_df, title, link, combined_text, category)
                        success_count += 1

                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
        return pd.DataFrame(geo_df)

    def get_express_articles(self, max_pages=7):
        """Scrape the category archives of www.express.pk.

        Args:
            max_pages: Number of archive pages requested per category
                (``?page=1`` .. ``?page=max_pages``).

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label``. ``saqafat`` is stored as
            ``entertainment`` and ``science`` as ``science-technology``.

        Raises:
            Whatever ``requests`` raises when an archive page cannot be
            fetched. Failures on individual articles are printed and skipped.
        """
        express_df = self._new_records()
        base_url = 'https://www.express.pk'
        categories = ['saqafat', 'business', 'sports', 'science', 'world']   # saqafat is entertainment category
        label_for = {'saqafat': 'entertainment', 'science': 'science-technology'}

        for category in categories:
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}/{category}/archives?page={page}"
                soup = self._fetch_soup(url)

                # A page without the listing element (past the last archive
                # page, or a layout change) yields no cards instead of
                # crashing the whole run with an AttributeError.
                listing = soup.find('ul', class_='tedit-shortnews listing-page')
                if listing is None:
                    print(f"\t--> No article listing found on page {page} of '{category}'.")
                    cards = []
                else:
                    cards = listing.find_all('li')
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0

                for card in cards:
                    try:
                        div = card.find('div',class_='horiz-news3-caption')
                        anchor = div.find('a')

                        # Headline text and article URL come from the same anchor
                        headline = anchor.get_text(strip=True).replace('\xa0', ' ')
                        link = anchor['href']

                        # Article body: <p> tags inside <span class="story-text">
                        content_soup = self._fetch_soup(link)
                        paras = content_soup.find('span',class_='story-text').find_all('p')
                        combined_text = self._join_paragraphs(paras)

                        self._add_article(
                            express_df,
                            headline,
                            link,
                            combined_text,
                            label_for.get(category, category),
                        )
                        success_count += 1

                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(express_df)

In [30]:
# One scraper instance for the whole run; article ids start counting at 0.
scraper = NewsScraper(id_=0)

In [31]:
# Scrape the three sites one after another with the same scraper object, so
# the article id counter keeps increasing across sites instead of restarting.
express_df = scraper.get_express_articles(max_pages=7)
geo_df = scraper.get_geo_articles(max_pages=1)
jang_df = scraper.get_jang_articles(max_pages=1)

Scraping page 1 of category 'saqafat'...
	--> Found 10 articles on page 1 of 'saqafat'.
	--> Successfully scraped 10 articles from page 1 of 'saqafat'.
Scraping page 2 of category 'saqafat'...
	--> Found 10 articles on page 2 of 'saqafat'.
	--> Successfully scraped 10 articles from page 2 of 'saqafat'.
Scraping page 3 of category 'saqafat'...
	--> Found 10 articles on page 3 of 'saqafat'.
	--> Successfully scraped 10 articles from page 3 of 'saqafat'.
Scraping page 4 of category 'saqafat'...
	--> Found 10 articles on page 4 of 'saqafat'.
	--> Successfully scraped 10 articles from page 4 of 'saqafat'.
Scraping page 5 of category 'saqafat'...
	--> Found 10 articles on page 5 of 'saqafat'.
	--> Successfully scraped 10 articles from page 5 of 'saqafat'.
Scraping page 6 of category 'saqafat'...
	--> Found 10 articles on page 6 of 'saqafat'.
	--> Successfully scraped 10 articles from page 6 of 'saqafat'.
Scraping page 7 of category 'saqafat'...
	--> Found 10 articles on page 7 of 'saqafat'.


# Output
- Save a combined csv of all 3 sites.

In [None]:
# Show a short preview of each site's frame. `df.head` without parentheses is
# only the bound method, and printing it dumps the whole frame; call it.
for site_name, site_df in (("express", express_df), ("geo", geo_df), ("jang", jang_df)):
    if site_df is None:
        # A scraper may hand back None; pd.concat below drops such entries.
        print(f"{site_name}: no data scraped")
        continue
    print(f"{site_name}: {len(site_df)} articles")
    print(site_df.head())

# Stack the three frames with a fresh 0..n-1 index and write one combined CSV.
consolidated_df = pd.concat([express_df, geo_df, jang_df], ignore_index=True)
consolidated_df.to_csv('consolidated_data.csv', index=False)