In [3]:
# %pip install beautifulsoup4   # PyPI package that provides the `bs4` module imported below

In [4]:
import os
import json
import time
import random
import zipfile
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from Urdu news websites. The implementation in this notebook provides methods for Express and Jang (no Geo method is defined here); each method caters to its site's unique structure and requirements. Below, we go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
- **Returns**: A Pandas DataFrame with the columns `id`, `title`, `link`, `content`, and `gold_label`.
  - `content` holds the article text (all non-empty paragraphs joined with spaces) and `gold_label` holds the category label for the article.

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [38]:
class NewsScraper:
    """Scrape Urdu news articles from the Express and Jang websites.

    Each ``get_*_articles`` method walks a fixed list of category listing
    pages, follows every article link found there, and returns one pandas
    DataFrame with the columns ``id``, ``title``, ``link``, ``content`` and
    ``gold_label``.

    ``self.id`` is a running counter shared by all methods, so IDs stay
    unique when several sites are scraped with the same instance.
    """

    # Seconds to wait for a single HTTP response. Without a timeout,
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, id_=0):
        """Create a scraper whose first stored article receives ID ``id_``."""
        self.id = id_

    @staticmethod
    def _new_table():
        """Return an empty column-oriented table for the scraped articles."""
        return {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "gold_label": [],
        }

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed; raises on HTTP/network errors."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _load_listing(self, url, page, category):
        """Fetch a listing page, returning ``None`` (after logging) on failure.

        A failed listing page must not abort the whole run: the articles
        collected so far would be lost before the DataFrame is returned.
        """
        try:
            return self._fetch_soup(url)
        except Exception as e:
            print(f"\t--> Failed to load page {page} of '{category}': {e}")
            return None

    def _store_article(self, table, page_url, href, headline, label,
                       body_tag, body_class):
        """Fetch one article and append it to ``table``.

        ``href`` is resolved against ``page_url`` so relative links work too.
        The article text is every non-empty ``<p>`` inside the element
        ``<body_tag class=body_class>``, joined with single spaces.

        Any failure (HTTP error, missing body element, ...) is raised to the
        caller *before* anything is appended, so the columns stay aligned.
        """
        link = urljoin(page_url, href)
        article_soup = self._fetch_soup(link)
        paras = article_soup.find(body_tag, class_=body_class).find_all('p')

        # Drop non-breaking spaces and zero-width spaces left in the markup.
        combined_text = " ".join(
            p.get_text(strip=True).replace('\xa0', ' ').replace('\u200b', '')
            for p in paras if p.get_text(strip=True)
        )

        table['id'].append(self.id)
        table['title'].append(headline)
        table['link'].append(link)
        table['gold_label'].append(label)
        table['content'].append(combined_text)
        self.id += 1

    def get_express_articles(self, max_pages=7):
        """Scrape Express (https://www.express.pk) category archives.

        Args:
            max_pages: Number of archive pages to visit per category.

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label``.

        Listing pages that fail to load or lack the expected ``<ul>`` are
        reported and skipped; individual article failures are reported and
        skipped as well.
        """
        express_df = self._new_table()
        base_url = 'https://www.express.pk'
        # URL slug -> dataset label ('saqafat' is the entertainment section).
        categories = {
            'saqafat': 'entertainment',
            'business': 'business',
            'sports': 'sports',
            'science': 'science-technology',
            'world': 'world',
        }

        for category, label in categories.items():
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}/{category}/archives?page={page}"
                soup = self._load_listing(url, page, category)
                if soup is None:
                    continue

                # find() returns None when the site layout changes or the page is empty.
                listing = soup.find('ul', class_='tedit-shortnews listing-page')
                cards = listing.find_all('li') if listing is not None else []
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                for card in cards:
                    try:
                        anchor = card.find('div', class_='horiz-news3-caption').find('a')
                        headline = anchor.get_text(strip=True).replace('\xa0', ' ')
                        self._store_article(express_df, url, anchor['href'], headline,
                                            label, 'span', 'story-text')
                        success_count += 1
                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(express_df)

    def get_jang_articles(self, max_pages=7):
        """Scrape Jang (https://jang.com.pk) category pages.

        Args:
            max_pages: Accepted for signature parity with
                ``get_express_articles`` but currently unused: the category
                URL built here has no page component, so exactly one listing
                page is fetched per category.

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label``.
        """
        jang_df = self._new_table()
        base_url = 'https://jang.com.pk/category'
        # science and technology lives under the magazine section
        # ('magazine/science-and-technology') and is not scraped here.
        categories = ['latest-news/entertainment', 'latest-news/business',
                      'latest-news/sports', 'latest-news/world']
        page = 1  # single listing page per category (see docstring)

        for category in categories:
            label = category.split('/')[1].replace('science-and-technology', 'science-technology')
            print(f"Scraping page {page} of category '{category}'...")
            url = f"{base_url}/{category}"
            soup = self._load_listing(url, page, category)

            if soup is not None:
                container = soup.find('div', class_='latest_page_right')
                # class_='' keeps the original selector for the <li> cards.
                cards = container.find_all('li', class_='') if container is not None else []
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                for card in cards:
                    try:
                        heading = card.find('div', class_='main-heading')
                        anchor = heading.find('a')
                        headline = heading.find('h2').get_text(strip=True)
                        self._store_article(jang_df, url, anchor['href'], headline,
                                            label, 'div', 'detail_view_content')
                        success_count += 1
                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(jang_df)


In [None]:
class DunyaNewsUrduScraper:
    """Scrape Urdu articles from the Dunya News category pages.

    Results accumulate in ``self.articles`` (a dict of equal-length lists
    with the keys ``id``, ``title``, ``link``, ``content``, ``category``)
    and can be written out with ``save_to_csv``.
    """

    # Box classes whose headline is read from an <h3> instead of the link text.
    _H3_TITLE_CLASSES = ("col-md-6 col-sm-6 col-xs-6",)
    # Box classes that hold several articles, mapped to the class of the
    # per-article row <div> inside them.
    _ROW_CLASSES = {"cNewsBox": "col-md-8"}

    def __init__(self, request_delay=1, timeout=30):
        """Set up the scraper.

        Args:
            request_delay: Seconds to sleep after each stored article.
            timeout: Seconds to wait for any single HTTP response.
        """
        self.base_url = "https://urdu.dunyanews.tv"
        self.categories = ["World", "Business", "Sports", "Entertainment", "Technology", "Cricket"]
        self.articles = {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "category": [],
        }
        self.article_id = 1
        self.box_classes = ["impcatg newsBox", "col-md-6 col-sm-6 col-xs-6", "cNewsBox"]
        self.request_delay = request_delay
        self.timeout = timeout

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed; raises on HTTP/network errors."""
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _scrape_container(self, node, page_url, category, title_from_h3, error_label):
        """Store the article linked from ``node``, logging (not raising) failures.

        Containers without any ``<a>`` are skipped silently. The link is
        resolved against ``page_url`` so site-relative *and* absolute hrefs
        both yield a valid URL. Nothing is appended until every fallible step
        has succeeded, so the columns of ``self.articles`` stay aligned.
        """
        try:
            anchor = node.find('a')
            if anchor is None:
                return

            title_node = node.find('h3') if title_from_h3 else anchor
            title = title_node.get_text(strip=True)
            full_link = urljoin(page_url, anchor['href'])
            content = self.get_article_content(full_link)
            # Cricket stories are filed under the Sports label.
            label = "Sports" if category == "Cricket" else category

            self.articles['id'].append(self.article_id)
            self.articles['title'].append(title)
            self.articles['link'].append(full_link)
            self.articles['content'].append(content)
            self.articles['category'].append(label)

            print(f"Scraped article {self.article_id}: {title}")
            self.article_id += 1

            time.sleep(self.request_delay)
        except Exception as e:
            print(f"{error_label}: {e}")

    def get_article_links(self):
        """Walk every category page and store each article found in its boxes.

        For each class in ``self.box_classes`` all matching ``<div>`` boxes
        are visited. ``cNewsBox`` boxes are split into ``col-md-8`` rows (one
        article per row); every other box is treated as a single article.
        A category page that cannot be loaded is reported and skipped so the
        articles gathered so far are kept.
        """
        for category in self.categories:
            section_url = f"{self.base_url}/index.php/ur/{category}"
            print(f"Scraping category: {category} -> {section_url}")

            try:
                soup = self._fetch_soup(section_url)
            except Exception as e:
                print(f"Error loading category '{category}': {e}")
                continue

            for box_class in self.box_classes:
                boxes = soup.find_all('div', class_=box_class)
                print(f"Found {len(boxes)} boxes with class '{box_class}'.")

                row_class = self._ROW_CLASSES.get(box_class)
                title_from_h3 = box_class in self._H3_TITLE_CLASSES

                for box in boxes:
                    if row_class is None:
                        self._scrape_container(
                            box, section_url, category, title_from_h3,
                            f"Error processing article in class '{box_class}'",
                        )
                        continue

                    rows = box.find_all("div", class_=row_class)
                    print(f"Found {len(rows)} rows in box '{box_class}'.")
                    for row in rows:
                        self._scrape_container(
                            row, section_url, category, False, "Error processing row"
                        )

    def get_article_content(self, link):
        """Return the article text at ``link``.

        The text is every non-empty ``<p>`` inside ``<div class="main-news">``
        joined with spaces, or ``""`` when that div is absent. Raises on
        HTTP/network errors.
        """
        soup = self._fetch_soup(link)

        content_div = soup.find('div', class_='main-news')
        if content_div is None:
            return ""

        paragraphs = content_div.find_all('p')
        return " ".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

    def save_to_csv(self, filename="./Scrapped Data/dunya_urdu_articles.csv"):
        """Write ``self.articles`` to ``filename``, creating its folder if needed."""
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        df = pd.DataFrame(self.articles)
        # utf-8-sig writes a BOM in front of the UTF-8 data.
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Articles saved to {filename}")

# Run the Dunya scraper end to end: build it, crawl every configured category
# (one HTTP request per article, with a sleep after each stored article), then
# write the collected rows to the default CSV path.
# NOTE: `scraper` is a notebook-global name that later cells rebind.
scraper = DunyaNewsUrduScraper()
scraper.get_article_links()
scraper.save_to_csv()

In [None]:

class ARYNewsUrduScraper:
    """Scrape Urdu articles from the ARY News category pages.

    Results accumulate in ``self.articles`` (a dict of equal-length lists
    with the keys ``id``, ``title``, ``link``, ``content``, ``category``)
    and can be written out with ``save_to_csv``.
    """

    # Class string of one article module inside the paginated list container.
    _MODULE_CLASS = "tdb_module_loop td_module_wrap td-animation-stack td-cpt-post"

    def __init__(self, max_pages=15, request_delay=1, timeout=30):
        """Set up the scraper.

        Args:
            max_pages: Highest ``page/N`` index requested per category.
            request_delay: Seconds to sleep after each stored article.
            timeout: Seconds to wait for any single HTTP response.
        """
        self.base_url = "https://urdu.arynews.tv/category"
        self.headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        # URL slugs of the category pages (each ends with '/').
        self.categories = ["sports-2/", "کاروباری-خبریں/", "fun-o-sakafat/", "سائنس-اور-ٹیکنالوجی/", "international-2/"]
        # URL slug -> label stored in the 'category' column.
        self.categories_2 = {"sports-2/":"Sports", "کاروباری-خبریں/":"Business", "fun-o-sakafat/":"Entertainment", "سائنس-اور-ٹیکنالوجی/":"Technology", "international-2/":"World"}
        self.articles = {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "category": [],
        }
        self.article_id = 1
        # Not read by any method of this class; kept so existing attribute access still works.
        self.box_classes = ["col-md-6 col-sm-6 col-xs-6", "cNewsBox"]
        self.max_pages = max_pages
        self.request_delay = request_delay
        self.timeout = timeout

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed; raises on HTTP/network errors."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _record_article(self, container, category):
        """Store the article linked from ``container``, logging failures.

        The first ``<a>`` supplies both the title (its ``title`` attribute)
        and the link (its ``href``). Containers without an ``<a>`` are
        skipped silently. The label is resolved *before* anything is appended
        so a failed lookup cannot leave the columns with unequal lengths.
        """
        try:
            title_tag = container.find('a')
            if title_tag is None:
                return

            title = title_tag['title']
            full_link = title_tag['href']
            label = "Sports" if category == "Cricket" else self.categories_2[category]
            content = self.get_article_content(full_link)

            self.articles['id'].append(self.article_id)
            self.articles['title'].append(title)
            self.articles['link'].append(full_link)
            self.articles['content'].append(content)
            self.articles['category'].append(label)

            print(f"Scraped article {self.article_id}: {title}")
            self.article_id += 1

            time.sleep(self.request_delay)
        except Exception as e:
            print(f"Error processing article in class': {e}")

    def get_article_links(self):
        """Crawl every category: the lead box first, then the paginated list.

        Per category:
          1. The category page is fetched and each ``<div id="tdi_84">``
             contributes the article behind its first link.
          2. Pages ``page/1`` .. ``page/<max_pages>`` are fetched; inside each
             ``<div id="tdi_85">`` every article module is stored.

        A category page that cannot be loaded is reported and skipped. A
        paginated page that cannot be loaded (e.g. a 404 past the last page)
        ends pagination for that category instead of aborting the whole run.
        """
        for category in self.categories:
            section_url = f"{self.base_url}/{category}"
            print(f"Scraping category: {category} -> {section_url}")

            try:
                soup = self._fetch_soup(section_url)
            except Exception as e:
                print(f"Error loading category '{category}': {e}")
                continue

            for box in soup.find_all('div', id="tdi_84"):
                self._record_article(box, category)

            for page_number in range(1, self.max_pages + 1):
                print(f"Scraping page {page_number}...")
                page_url = f"{section_url}page/{page_number}"

                try:
                    soup = self._fetch_soup(page_url)
                except Exception as e:
                    print(f"Stopping pagination for '{category}' at page {page_number}: {e}")
                    break

                for box in soup.find_all('div', id="tdi_85"):
                    # Search inside the matched container only; searching the
                    # whole page would pick up modules outside it and repeat
                    # them once per matched container.
                    for sub_box in box.find_all('div', class_=self._MODULE_CLASS):
                        self._record_article(sub_box, category)

    def get_article_content(self, link):
        """Return the article text at ``link``.

        The text is every non-empty ``<p>`` (excluding paragraphs that contain
        an ``<img>``) inside the ``<div data-td-block-uid="tdi_102">`` block,
        joined with spaces, or ``""`` when that block is absent. Raises on
        HTTP/network errors.
        """
        soup = self._fetch_soup(link)

        content_box = soup.find('div', {'data-td-block-uid': 'tdi_102'})
        if content_box is None:
            return ""

        content = []
        for p in content_box.find_all('p'):
            if p.find('img'):
                continue  # paragraph that contains an image

            text = p.get_text(strip=True)
            if text:
                content.append(text)

        return " ".join(content)

    def save_to_csv(self, filename="./Scrapped Data/ary_urdu_articles.csv"):
        """Write ``self.articles`` to ``filename``, creating its folder if needed."""
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        df = pd.DataFrame(self.articles)
        # utf-8-sig writes a BOM in front of the UTF-8 data.
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Articles saved to {filename}")

# Run the ARY scraper end to end: build it, crawl every configured category
# (lead box plus the paginated list, one HTTP request per article), then write
# the collected rows to the default CSV path.
# NOTE: this rebinds the notebook-global `scraper` used by the Dunya cell above.
scraper = ARYNewsUrduScraper()
scraper.get_article_links()
scraper.save_to_csv()

In [None]:
# Fresh NewsScraper for Express and Jang; article IDs start at the default 0.
# This rebinds `scraper`, which the Dunya and ARY cells above also assign.
# The Dunya and ARY scrapers were already run by a group member, so they are not run again here.
scraper = NewsScraper()

In [44]:
# Scrape Express with the default max_pages=7 per category. The call returns a
# DataFrame and advances scraper.id by one for every stored article.
express_df = scraper.get_express_articles()

Scraping page 1 of category 'saqafat'...
	--> Found 10 articles on page 1 of 'saqafat'.
	--> Failed to scrape an article on page 1 of 'saqafat': 404 Client Error: Not Found for url: https://www.express.pk/story/2732336/dosti-international-festival-starts-in-lahore
	--> Successfully scraped 9 articles from page 1 of 'saqafat'.
Scraping page 2 of category 'saqafat'...
	--> Found 10 articles on page 2 of 'saqafat'.
	--> Successfully scraped 10 articles from page 2 of 'saqafat'.
Scraping page 3 of category 'saqafat'...
	--> Found 10 articles on page 3 of 'saqafat'.
	--> Successfully scraped 10 articles from page 3 of 'saqafat'.
Scraping page 4 of category 'saqafat'...
	--> Found 10 articles on page 4 of 'saqafat'.
	--> Successfully scraped 10 articles from page 4 of 'saqafat'.
Scraping page 5 of category 'saqafat'...
	--> Found 10 articles on page 5 of 'saqafat'.
	--> Successfully scraped 10 articles from page 5 of 'saqafat'.
Scraping page 6 of category 'saqafat'...
	--> Found 10 articles 

In [41]:
# Scrape Jang with the same scraper instance, so article IDs continue from the
# current value of scraper.id instead of restarting.
jang_df = scraper.get_jang_articles()

Scraping page 1 of category 'latest-news/entertainment'...
	--> Found 99 articles on page 1 of 'latest-news/entertainment'.
	--> Successfully scraped 99 articles from page 1 of 'latest-news/entertainment'.

Scraping page 1 of category 'latest-news/business'...
	--> Found 97 articles on page 1 of 'latest-news/business'.
	--> Successfully scraped 97 articles from page 1 of 'latest-news/business'.

Scraping page 1 of category 'latest-news/sports'...
	--> Found 99 articles on page 1 of 'latest-news/sports'.
	--> Successfully scraped 99 articles from page 1 of 'latest-news/sports'.

Scraping page 1 of category 'latest-news/world'...
	--> Found 100 articles on page 1 of 'latest-news/world'.
	--> Successfully scraped 100 articles from page 1 of 'latest-news/world'.



# Output
- Save a combined csv of all 3 sites.

In [None]:
# saving jang data to a csv file
# Create the output folder first: to_csv raises if the directory is missing.
os.makedirs('./Scrapped Data', exist_ok=True)
jang_df.to_csv('./Scrapped Data/jang_data.csv', index=False)

In [None]:
# saving express data to a csv file
# Create the output folder first: to_csv raises if the directory is missing.
os.makedirs('./Scrapped Data', exist_ok=True)
express_df.to_csv('./Scrapped Data/express_data.csv', index=False)