In [4]:
import requests
from bs4 import BeautifulSoup

def _clean_text(element, default="Not provided"):
    """Return the whitespace-stripped text of a BeautifulSoup element, or `default` if it is None."""
    return element.get_text().strip() if element is not None else default


def scrape_page(page_number):
    """Fetch one page of the mefthe.com ad index and print each listing's details.

    For every ``<article class="listing-item">`` on the page this prints the
    title, price, date and location, followed by a ``-----`` separator.
    Missing title/price/date elements are reported as "Not provided" and a
    missing location as an empty string, instead of raising AttributeError.

    Args:
        page_number: Page index interpolated into the ``/ads/page/<n>/`` URL.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    url = f"https://mefthe.com/ads/page/{page_number}/"
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    for listing in soup.find_all('article', class_='listing-item'):
        heading = listing.find('h2', class_='h4 entry-title')
        title = _clean_text(heading.find('a') if heading is not None else None)

        price_element = listing.find('span', class_='post-price')
        if price_element is not None:
            price = price_element.get_text().replace("Price ", "").strip()
        else:
            price = "Not provided"

        date = _clean_text(listing.find('li', class_='listing-date fa-icon fa-clock-o'))

        # 'data-address' is an HTML attribute, not a tag name: find('data-address')
        # looked for a <data-address> element and therefore never matched.
        # NOTE(review): assumes the address lives in a data-address attribute on the
        # listing or one of its descendants - confirm against the page markup.
        if listing.has_attr('data-address'):
            address_holder = listing
        else:
            address_holder = listing.find(attrs={'data-address': True})
        location = address_holder['data-address'].strip() if address_holder is not None else ''

        print(f'Title: {title}')
        print(f'Price: {price}')
        print(f'Date: {date}')
        print(f'Location: {location}')
        print('-----')

# Scrape index pages FIRST_PAGE..LAST_PAGE (inclusive); raise LAST_PAGE to cover more pages.
FIRST_PAGE, LAST_PAGE = 1, 5
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    print(f'Scraping page {page}...')
    scrape_page(page)


Scraping page 1...
Title: Studio Basement for Rent – የሚከራይ ቤት
Price: $950.00
Date:  June 14, 2023
Location: 
-----
Title: Habesha Roommate Needed
Price: $650.00
Date:  June 3, 2023
Location: 
-----
Title: Newly Renovated Basement bedroom  for Rent – የሚከራይ ቤዝመንት
Price: Not provided
Date:  June 1, 2023
Location: 
-----
Title: New Basement for Rent – የሚከራይ ቤዝመንት
Price: $1,250.00
Date:  May 22, 2023
Location: 
-----
Title: One Bedroom Full Basement for Rent – ባለ አንድ መኝታ ሁሉንም ያሟላ የሚከራይ
Price: Not provided
Date:  May 22, 2023
Location: 
-----
Title: Fully furnished Room with attached private bathroom
Price: Not provided
Date:  April 26, 2023
Location: 
-----
Title: New Build Room for Rent -የሚከራይ ክፍል
Price: $945.00
Date:  April 19, 2023
Location: 
-----
Title: Full basement for Rent – የሚከራይ ቤዝመንት
Price: Not provided
Date:  April 17, 2023
Location: 
-----
Title: Room in a Basement for Rent – የሚከራይ ቤት
Price: Not provided
Date:  April 17, 2023
Location: 
-----
Title: 1 Room for Rent – Roommate N

In [5]:
 pip install scrapy


Collecting scrapy
  Downloading Scrapy-2.9.0-py2.py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.2/277.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting Twisted>=18.9.0
  Downloading Twisted-22.10.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cssselect>=0.9.1
  Using cached cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1
  Downloading itemloaders-1.1.0-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0
  Downloading parsel-1.8.1-py2.py3-none-any.whl (17 kB)
Collecting pyOpenSSL>=21.0.0
  Downloading pyOpenSSL-23.2.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting queuelib>=1.4.2
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecti

In [8]:
import scrapy
from scrapy.crawler import CrawlerProcess

class MeftheSpider(scrapy.Spider):
    """Spider that starts at the mefthe.com ad index and emits one record per listing."""

    name = 'mefthe_spider'
    start_urls = ['https://mefthe.com/ads/']

    def parse(self, response):
        """Yield a name/price/date dict for each listing, then request the next index page."""
        # Output field -> CSS selector, evaluated relative to each listing element.
        field_selectors = {
            'name': 'h2 a ::text',
            'price': 'span.post-price ::text',
            'date': 'li.listing-date ::text',
        }
        for listing in response.css('.listing-item'):
            yield {
                field: listing.css(selector).get()
                for field, selector in field_selectors.items()
            }

        # Stop when the pagination block has no "next" link.
        next_href = response.css('.nav-links a.next ::attr(href)').get()
        if not next_href:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

# Export scraped items as JSON. FEEDS replaces the deprecated FEED_FORMAT / FEED_URI
# settings; 'overwrite' keeps a re-run from appending a second JSON array to the
# existing file, which would leave it unparseable.
process = CrawlerProcess({
    'FEEDS': {
        'mefthe_ads.json': {
            'format': 'json',
            'overwrite': True,
        },
    },
})

process.crawl(MeftheSpider)
# start() runs the Twisted reactor and blocks until the crawl finishes. The reactor
# cannot be started twice in one process, so restart the kernel before re-running.
process.start()


2023-06-16 12:26:53 [scrapy.utils.log] INFO: Scrapy 2.9.0 started (bot: scrapybot)
2023-06-16 12:26:53 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.13, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.11.1 (main, Jan 23 2023, 16:57:30) [Clang 14.0.0 (clang-1400.0.29.202)], pyOpenSSL 23.2.0 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.2, Platform macOS-14.0-arm64-arm-64bit
2023-06-16 12:26:53 [scrapy.crawler] INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-06-16 12:26:53 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-06-16 12:26:53 [scrapy.extensions.telnet] INFO: Telnet Password: 261ae0f7bfe5cfad
  exporter = cls(crawler)

2023-06-16 12:26:53 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsol

In [10]:
import scrapy
from scrapy.crawler import CrawlerProcess

class MeftheSpider(scrapy.Spider):
    """Spider that follows every listing on the mefthe.com ad index to its detail page."""

    name = 'mefthe_spider'
    start_urls = ['https://mefthe.com/ads/']

    def parse(self, response):
        """Queue a request for each listing's detail page, then for the next index page."""
        for ad in response.css('.listing-item'):
            ad_href = ad.css('h2 a ::attr(href)').get()
            if not ad_href:
                # response.follow(None) raises ValueError, which would abort this
                # callback before the pagination request below is ever yielded.
                self.logger.warning('Listing without a link on %s', response.url)
                continue
            yield response.follow(ad_href, callback=self.parse_ad)

        next_page = response.css('.nav-links a.next ::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_ad(self, response):
        """Yield one dict (name, price, date, description, url) for a listing detail page."""
        # The description spans several text nodes; taking only the first one
        # returned a fragment (often just whitespace), so join all non-blank nodes.
        description_parts = (
            text.strip() for text in response.css('div.post-content ::text').getall()
        )
        description = ' '.join(part for part in description_parts if part) or None

        yield {
            'name': response.css('h2.entry-title ::text').get(),
            'price': response.css('span.post-price ::text').get(),
            'date': response.css('li.listing-date ::text').get(),
            'description': description,
            'url': response.url,
        }

# Export scraped items as JSON. FEEDS replaces the deprecated FEED_FORMAT / FEED_URI
# settings; 'overwrite' keeps a re-run from appending a second JSON array to the
# existing file, which would leave it unparseable.
process = CrawlerProcess({
    'FEEDS': {
        'mefthe_ads.json': {
            'format': 'json',
            'overwrite': True,
        },
    },
})

process.crawl(MeftheSpider)
# start() runs the Twisted reactor, which cannot be started twice in one process:
# if a crawl has already run in this kernel this raises ReactorNotRestartable.
# Restart the kernel before running this cell.
process.start()


2023-06-16 12:30:11 [scrapy.utils.log] INFO: Scrapy 2.9.0 started (bot: scrapybot)
2023-06-16 12:30:11 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.13, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.11.1 (main, Jan 23 2023, 16:57:30) [Clang 14.0.0 (clang-1400.0.29.202)], pyOpenSSL 23.2.0 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.2, Platform macOS-14.0-arm64-arm-64bit
2023-06-16 12:30:11 [scrapy.crawler] INFO: Overridden settings:
{}
2023-06-16 12:30:11 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-06-16 12:30:11 [scrapy.extensions.telnet] INFO: Telnet Password: 8c66fea7266621c5
2023-06-16 12:30:11 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2023-06-16 12:30:11 [scrapy.middleware] INFO: 

ReactorNotRestartable: 