<a href="https://colab.research.google.com/github/FaisalNabil/dhaka-tribune-content-mining/blob/main/NewsScrapingDhakaTribiun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install Selenium
# (lines starting with `!` are shell commands run by the notebook, not Python)
!pip install selenium

# Set up Chromedriver
# Refresh the apt package index, install the chromium-chromedriver package,
# then copy the driver binary into /usr/bin.
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# NOTE(review): sys.path is Python's module search path, and this entry is the
# chromedriver executable rather than a package directory -- presumably it does
# not affect how Selenium locates the driver; confirm before relying on it.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')


# Load libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
import spacy
import nltk
import json
import csv
import time
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from datetime import date, timedelta

# Fetch the NLTK resources used by this notebook: the punkt tokenizer data,
# the averaged-perceptron POS tagger and the VADER sentiment lexicon.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'vader_lexicon'):
    nltk.download(_nltk_resource)

# spaCy's small English pipeline, shared by the analysis code below
nlp = spacy.load("en_core_web_sm")

# Web Scraping

In [None]:
# Build the shared headless Chrome session used by the scraper class below.
chrome_options = Options()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)
# Point Selenium at the Chromium binary at this fixed location
chrome_options.binary_location = '/usr/bin/chromium-browser'
driver = webdriver.Chrome(options=chrome_options)

class DhakaTribuneScraper:
    """Scrape Dhaka Tribune daily archive pages and export analysed articles.

    The scraper drives the module-level Selenium ``driver``, accumulates one
    dict per article in ``self.articles`` (keys: ``title``, ``content``,
    ``url``, ``date``) and can write them, together with a sentiment score and
    matched Bangladeshi locations, to a CSV file.
    """

    # Column order of the CSV written by analyze_and_save(); every data row
    # must supply exactly one value per entry.
    CSV_HEADER = ['Date', 'Title', 'Content', 'URL', 'Sentiment',
                  'Divisions', 'Districts', 'Sub-Districts']

    def __init__(self):
        """Bind to the module-level ``driver`` and start with no articles."""
        self.driver = driver
        self.articles = []

    def scrape_year_archive(self, year, start_date=None, end_date=None):
        """Scrape the archive page of every day in an inclusive date range.

        Args:
            year: Year used to build the default range.
            start_date: First ``datetime.date`` to scrape; defaults to
                1 January of ``year``.
            end_date: Last ``datetime.date`` to scrape (inclusive); defaults to
                31 January of ``year``. Pass e.g. ``date(year, 12, 31)`` to
                cover the full year.
        """
        current = start_date if start_date is not None else date(year, 1, 1)
        last_day = end_date if end_date is not None else date(year, 1, 31)
        one_day = timedelta(days=1)

        while current <= last_day:
            self.scrape_articles(current.strftime("%Y-%m-%d"))
            current += one_day

    def scrape_articles(self, current_date):
        """Scrape every article linked from one day's archive page.

        Args:
            current_date: Date string inserted into the archive URL
                (``scrape_year_archive`` passes ``YYYY-MM-DD``); it is also
                stored as each article's ``date``.

        Successfully scraped articles are appended to ``self.articles``.
        An article that times out or lacks an expected element is skipped;
        any other WebDriver error stops processing of this day.
        """
        formatted_date = current_date
        self.base_url = f"https://www.dhakatribune.com/archive/{formatted_date}"
        print(f"Scraping page: {self.base_url}")
        try:
            self.driver.get(self.base_url)
        except Exception as e:
            print(f"Error scraping page: {self.base_url}")
            print(f"General error: {e}")
            return

        self._load_all_results()

        # Visit each article and scrape content
        for link in self._collect_article_links():
            if "photo-gallery" in link or "video" in link:
                print(f"Skipping non-article page: {link}")
                continue

            # The specific exceptions are subclasses of WebDriverException, so
            # they must be listed before it or their handlers never run.
            try:
                self.articles.append(self._scrape_article(link, formatted_date))
            except NoSuchElementException as e:
                print(f"Error occurred while scraping article: {link}")
                print(f"NoSuchElement error: {e}")
                continue  # problem with this page only; keep going
            except TimeoutException as e:
                print(f"Error occurred while scraping article: {link}")
                print(f"TimeoutException error: {e}")
                continue  # problem with this page only; keep going
            except WebDriverException as e:
                print(f"Error occurred while scraping article: {link}")
                print(f"WebDriver error: {e}")
                # The browser session itself may be unusable; give up on this
                # day. Optionally restart the WebDriver here.
                return

    def _load_all_results(self):
        """Click the archive page's "More" button until it stops appearing."""
        while True:
            try:
                more_button = WebDriverWait(self.driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button#ajax_load_more_600_btn"))
                )
                more_button.click()
                time.sleep(2)  # give the newly requested results time to load
            except (TimeoutException, NoSuchElementException):
                break
            except WebDriverException as e:
                # e.g. the click was intercepted or the element went stale:
                # work with what is already loaded instead of aborting the run.
                print(f"Stopped loading more results: {e}")
                break

    def _collect_article_links(self):
        """Return the article URLs found on the currently loaded archive page."""
        links = []
        for element in self.driver.find_elements(By.CSS_SELECTOR, "h2.title > a.link_overlay"):
            link = element.get_attribute('href')
            if not link:
                # get_attribute returns None when the anchor has no href
                continue
            if link.startswith('/'):
                link = 'https://www.dhakatribune.com' + link
            links.append(link)
        return links

    def _scrape_article(self, link, formatted_date):
        """Open one article page and return its title/content/url/date dict."""
        self.driver.get(link)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.content_detail_inner"))
        )
        title = self.driver.find_element(By.CSS_SELECTOR, "h1[itemprop='headline']").text
        paragraphs = self.driver.find_elements(By.CSS_SELECTOR, "div.viewport > p")
        content = ' '.join(p.text for p in paragraphs)
        return {'title': title, 'content': content, 'url': link, 'date': formatted_date}

    def close_driver(self):
        """Quit the browser session."""
        self.driver.quit()

    @staticmethod
    def _match_locations(place_names, location_data):
        """Map place names onto divisions, districts and sub-districts.

        Assumes ``location_data`` maps division -> {district -> collection of
        sub-district names} -- TODO confirm against places-in-bangladesh.json.
        Within a division, a name that is a district key is recorded as a
        district and its sub-districts are not searched for that name.

        Returns:
            Tuple ``(divisions, districts, subdistricts)`` of sets of names.
        """
        divisions, districts, subdistricts = set(), set(), set()
        for name in place_names:
            for division, dists in location_data.items():
                if name in dists:
                    divisions.add(division)
                    districts.add(name)
                    continue
                for district, subs in dists.items():
                    if name in subs:
                        divisions.add(division)
                        districts.add(district)
                        subdistricts.add(name)
        return divisions, districts, subdistricts

    def analyze_and_save(self, filename="dhaka_tribune_articles.csv",
                         locations_path="places-in-bangladesh.json"):
        """Score each scraped article and write all of them to a CSV file.

        For every article the content is scored with NLTK's
        ``SentimentIntensityAnalyzer`` and its spaCy ``GPE`` entities are
        matched against the location data.

        Args:
            filename: Path of the CSV file to create (overwritten if present).
            locations_path: Path of the JSON file holding the location data.

        Raises:
            OSError: If either file cannot be opened.
        """
        # Load location data
        with open(locations_path, 'r', encoding='utf-8') as file:
            location_data = json.load(file)

        sia = SentimentIntensityAnalyzer()

        with open(filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(self.CSV_HEADER)
            for article in self.articles:
                text = article['content']
                sentiment = sia.polarity_scores(text)
                gpe_names = [ent.text for ent in nlp(text).ents if ent.label_ == "GPE"]
                divs, dists, subs = self._match_locations(gpe_names, location_data)
                # Sets are sorted so the output is identical from run to run.
                writer.writerow([
                    article['date'], article['title'], text, article['url'], sentiment,
                    ', '.join(sorted(divs)), ', '.join(sorted(dists)), ', '.join(sorted(subs)),
                ])

# Running the Scraper

In [None]:
# Scrape the 2022 archive, analyse the collected articles and write the CSV.
scraper = DhakaTribuneScraper()
try:
    scraper.scrape_year_archive(2022)
    scraper.analyze_and_save()
finally:
    # Always shut the browser down, even if scraping or analysis raised,
    # so no headless Chrome process is left running in the runtime.
    scraper.close_driver()