In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import json

## Crawl links of articles

In [None]:
class OneLinksSpider(scrapy.Spider):
    """Collect article URLs from the category listing pages on one.com.mt.

    Every listing page yields one item per matched article link (stored under
    the key "0"), after which the spider follows the "next" pagination link
    until a page has none. Items are exported to ``one_links.csv`` through
    the ``FEEDS`` setting.
    """

    name = "one_links"
    start_urls = [
        "https://one.com.mt/kategorija/ahbarijiet/lokali/",
        "https://one.com.mt/kategorija/ahbarijiet/bagit-2024/",
        "https://one.com.mt/kategorija/ahbarijiet/internazzjonali/",
        "https://one.com.mt/kategorija/ahbarijiet/unjoniewropea/",
        "https://one.com.mt/kategorija/ahbarijiet/sport/",
        "https://one.com.mt/kategorija/ahbarijiet/pharmacy/",
    ]
    custom_settings = {
        # Local-file feeds append by default, so re-running this cell would
        # stack a second export on top of the previous one. Overwrite instead
        # so each run leaves exactly one fresh copy of the link list.
        'FEEDS': {'one_links.csv': {'format': 'csv', 'overwrite': True}},
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_ITEMS': 25,
        'CONCURRENT_REQUESTS': 25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 25,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    def parse(self, response):
        """Yield one ``{"0": url}`` item per article link, then follow pagination.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``{"0": href}`` for each article title link on the page.
            scrapy.Request: A request for the next listing page, when the
                page contains a "next page-numbers" link.
        """
        hrefs = response.xpath(
            '//h2[@class="penci-entry-title entry-title grid-title"]/a/@href'
        ).getall()

        for href in hrefs:
            # The key "0" becomes the CSV column header.
            yield {"0": href}

        next_page = response.xpath('//a[@class="next page-numbers"]/@href').get()
        if next_page:
            # urljoin resolves the href against the current page URL.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)


In [None]:
# Run the link spider; its FEEDS setting writes the scraped URLs to
# one_links.csv.
# NOTE(review): CrawlerProcess.start() runs the Twisted reactor, which
# (per the Scrapy docs) cannot be restarted once it has stopped. A second
# process.start() in the same kernel is therefore expected to raise
# ReactorNotRestartable; restart the kernel between crawls. Confirm.
process = CrawlerProcess()
process.crawl(OneLinksSpider)
process.start()

## Clean articles list

In [None]:
# Load the exported link list; the URLs are read from the first column.
df = pd.read_csv("one_links.csv")
print(f"Unfiltered = {len(df)} articles")

# Empty cells are read back as NaN. Drop them here so that no float value
# ends up among the URLs handed to the text spider.
links = df.iloc[:, 0].dropna().tolist()

# dict keys are unique and keep insertion order, so this removes duplicates
# while preserving the order in which links were first seen.
unique_links = list(dict.fromkeys(links))
# Kept so that any other cell still reading this name keeps working.
seen_elements = set(unique_links)

print(f"Filtered = {len(unique_links)} articles")

## Scrape articles text

In [None]:
class OneTextSpider(scrapy.Spider):
    """Download each URL in ``unique_links`` and append its paragraph text to ``one.txt``.

    After every page the spider also rewrites ``log.json`` with the URL just
    processed and its position in the crawl list, which serves as a simple
    "last page handled" checkpoint.
    """

    name = "one_text"
    start_urls = unique_links
    # URL -> position in start_urls, so parse() does not rescan the list for
    # every response. Built back-to-front so that, should a URL appear twice,
    # the earliest position wins (the same answer list.index would give).
    _link_positions = {url: pos for pos, url in reversed(list(enumerate(start_urls)))}
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_ITEMS': 25,
        'CONCURRENT_REQUESTS': 25,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 25,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    def parse(self, response):
        """Append the article's paragraph text to ``one.txt`` and update ``log.json``.

        Args:
            response: The downloaded article page.

        Side effects:
            Appends one line per ``<p>`` text node found inside the
            ``penci-post-entry-inner`` div to ``one.txt``, and overwrites
            ``log.json`` with ``{"index": ..., "url": ...}``. ``index`` is
            ``None`` when the URL cannot be matched to the crawl list.
        """
        div = response.xpath('//div[@id="penci-post-entry-inner"]')
        # text() selects only the direct text nodes of each <p>.
        paragraphs = div.xpath('.//p/text()').getall()
        strings = [str(p).strip() for p in paragraphs]

        # Explicit UTF-8: article text may contain non-ASCII characters, and
        # the platform default encoding is not guaranteed to handle them.
        with open('one.txt', 'a', encoding='utf-8') as file:
            file.write('\n'.join(strings) + '\n')

        current_url = response.request.url
        # After a redirect the request URL is the final one and is no longer
        # in the crawl list. The redirect middleware records the URLs it went
        # through in meta["redirect_urls"], originally requested URL first.
        redirect_chain = response.meta.get('redirect_urls') or [current_url]
        requested_url = redirect_chain[0]
        log = {"index": self._link_positions.get(requested_url), "url": current_url}
        with open('log.json', 'w', encoding='utf-8') as json_file:
            json.dump(log, json_file, indent=2)

In [None]:
# Crawl every URL in OneTextSpider.start_urls; the spider's parse callback
# appends the article text to one.txt and rewrites log.json after each page.
# NOTE(review): CrawlerProcess.start() runs the Twisted reactor, which
# (per the Scrapy docs) cannot be restarted once it has stopped. If the link
# crawl already called start() in this kernel, this call is expected to raise
# ReactorNotRestartable; restart the kernel and re-run only the cells needed
# for this step. Confirm.
process = CrawlerProcess()
process.crawl(OneTextSpider)
process.start()