In [None]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import re

## Categories to be crawled & scraped

In [None]:
# Category listing URLs that seed every crawl in this notebook.
categories = [
    'https://netnews.com.mt/gabra/lokali',
    'https://netnews.com.mt/gabra/internazzjonali',
    'https://netnews.com.mt/gabra/sport',
]

## Get number of pages for each category

In [None]:
class NetPageCountSpider(scrapy.Spider):
    """Determine how many listing pages each category has.

    Page 2 of every URL in ``categories`` is requested and the total is read
    from an ``of <N>`` fragment in the page ``<title>``. A 404 on page 2 is
    recorded as a single-page category. Items are exported to
    ``page_counts.csv`` with the columns ``Category`` and ``Count``.
    """

    name = 'net_page_count'
    start_urls = [f"{url}/page/2/" for url in categories]
    # Let 404 responses reach parse() so they can be turned into a count of 1.
    handle_httpstatus_list = [404]
    custom_settings = {
        # NOTE(review): local-file feeds append by default, so re-running this
        # crawl adds rows to an existing page_counts.csv — consider
        # 'overwrite': True if the installed Scrapy supports it.
        'FEEDS': {'page_counts.csv': {'format': 'csv'}},
        # Fixed one-second delay between requests, no auto-throttling.
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_ITEMS': 25,
        'CONCURRENT_REQUESTS': 25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 25,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    def parse(self, response):
        """Yield one ``{"Category": url, "Count": n}`` item for the response.

        Args:
            response: the response for a category's ``/page/2/`` URL; may be
                a 404 because of ``handle_httpstatus_list``.

        Yields:
            dict: the category URL (request URL up to ``/page/``) and its
            page count. Nothing is yielded when the count cannot be found.
        """
        category = response.request.url.split('/page/')[0]

        if response.status == 404:
            yield {"Category": category, "Count": 1}
            # Stop here so the error page's title is never parsed; a missing
            # title would otherwise make re.search() raise TypeError.
            return

        # .get() returns None when the page has no <title>; fall back to ''
        # so the regex simply fails to match instead of raising.
        title = response.xpath('//title/text()').get() or ''
        match = re.search(r'of (\d+)', title)
        if match is None:
            self.logger.warning("Could not determine page count for %s", category)
            return

        # Only report categories we actually asked for.
        if category in categories:
            yield {"Category": category, "Count": int(match.group(1))}

In [None]:
# Run the page-count spider; its FEEDS setting exports the items to page_counts.csv.
# NOTE(review): process.start() runs the Twisted reactor, which reportedly cannot be
# started again in the same Python process — the later process.start() cells may
# need a kernel restart (or a different runner) to work after this one; confirm.
process = CrawlerProcess()
process.crawl(NetPageCountSpider)
process.start()

## Crawl article links

In [6]:
# Load the exported page totals and key them by category URL.
df = pd.read_csv('page_counts.csv')
page_counts = {
    category: count
    for category, count in zip(df['Category'], df['Count'])
}

In [None]:
# Show the category -> page-count mapping as the cell output for a quick sanity check.
page_counts

In [None]:
class NetLinksSpider(scrapy.Spider):
    """Collect article URLs from every listing page of every category.

    One request is made per listing page, numbered 1 through
    ``page_counts[category]``. Each headline link found is exported as a row
    of ``net_links.csv`` under a single column named ``0``.
    """

    name = "net_links"
    start_urls = [
        f"{category}/page/{i}/"
        for category in categories
        for i in range(1, page_counts[category] + 1)
    ]
    custom_settings = {
        'FEEDS': {'net_links.csv': {'format': 'csv'}},
        # Fixed one-second delay between requests, no auto-throttling.
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_ITEMS': 25,
        'CONCURRENT_REQUESTS': 25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 25,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    def parse(self, response):
        """Yield a ``{"0": href}`` item for each headline link on the page."""
        links = response.xpath('//h2[@class="entry-title h3"]/a/@href').getall()
        for link in links:
            yield {"0": link}


In [None]:
# Run the link-harvesting spider; its FEEDS setting exports the items to net_links.csv.
# NOTE(review): if an earlier cell already called process.start() in this kernel,
# the Twisted reactor reportedly cannot be started a second time — confirm whether
# a kernel restart is needed before running this cell.
process = CrawlerProcess()
process.crawl(NetLinksSpider)
process.start()

## Clean articles list

In [None]:
# Load every harvested link (first CSV column) and report how many there are.
df = pd.read_csv("net_links.csv")
print(f"Unfiltered = {len(df)} articles")

links = df.iloc[:, 0].tolist()

# Dict keys are unique and keep insertion order, so this drops repeated links
# while preserving the order in which each link was first seen.
unique_links = list(dict.fromkeys(links))
seen_elements = set(unique_links)

print(f"Filtered = {len(unique_links)} articles")

## Scrape articles text

In [None]:
class NetTextSpider(scrapy.Spider):
    """Download every article in ``unique_links`` and save its paragraph text.

    The stripped text of each ``<p>`` inside the article body is appended to
    ``net.txt`` (one paragraph per line). After each article, ``log.json`` is
    overwritten with the position in ``unique_links`` and the URL of the
    article just processed.
    """

    name = "net_text"
    start_urls = unique_links
    custom_settings = {
        # Fixed one-second delay between requests, no auto-throttling.
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_ITEMS': 25,
        'CONCURRENT_REQUESTS': 25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 25,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    # URL -> position in start_urls, built once so each response is an O(1)
    # lookup rather than a list.index() scan. Iterating in reverse makes the
    # first occurrence win, matching list.index() semantics.
    _link_positions = {
        url: position
        for position, url in reversed(list(enumerate(start_urls)))
    }

    def parse(self, response):
        """Append the article's paragraph text to ``net.txt`` and update ``log.json``.

        Args:
            response: the response for one article page.

        The ``index`` written to ``log.json`` is ``None`` (JSON ``null``) when
        the URL cannot be matched to an entry of ``unique_links``.
        """
        body = response.xpath('//div[@class="entry-content herald-entry-content"]')
        paragraphs = body.xpath('.//p/text()').getall()
        strings = [p.strip() for p in paragraphs]

        # Explicit UTF-8 so non-ASCII characters are written the same way on
        # every platform instead of depending on the locale's default encoding.
        with open('net.txt', 'a', encoding='utf-8') as file:
            file.write('\n'.join(strings) + '\n')

        current_url = response.request.url
        # After a redirect the request URL no longer equals the start URL;
        # Scrapy records the URLs the request went through in
        # meta['redirect_urls'], whose first entry is the one originally requested.
        requested_url = response.request.meta.get('redirect_urls', [current_url])[0]
        index = self._link_positions.get(
            requested_url, self._link_positions.get(current_url)
        )

        log = {"index": index, "url": current_url}
        with open('log.json', 'w') as json_file:
            json.dump(log, json_file, indent=2)

In [None]:
# Run the article-text spider; it writes net.txt and log.json itself rather than using FEEDS.
# NOTE(review): if an earlier cell already called process.start() in this kernel,
# the Twisted reactor reportedly cannot be started a second time — confirm whether
# a kernel restart is needed before running this cell.
process = CrawlerProcess()
process.crawl(NetTextSpider)
process.start()