<a href="https://colab.research.google.com/github/DannyZid/custom-seo-crawler/blob/main/scrapy_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Scrapy in Colab
!pip install scrapy



In [2]:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from datetime import datetime
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import time
import pandas as pd

class MyCrawlSpider(CrawlSpider):
    name = 'SEOcrawlspider'
    allowed_domains = ['bergzeit.de']
    start_urls = ['https://www.bergzeit.de/']

    rules = (
        Rule(LinkExtractor(allow=None), callback='parse_item', follow=True),
    )

    custom_settings = {
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [429],
        'RETRY_AFTER': 90,
        'USER_AGENT': 'SEO_BZ_Spider',
        'FEED_FORMAT': 'jsonlines',
        'FEED_URI': '/content/myproject/output.json',
        'LOG_LEVEL': 'INFO',
    }

    def parse_item(self, response):
        url = response.url
        crawldate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        title = response.css('title::text').get()
        meta_description = response.css('meta[name="description"]::attr(content)').get()
        h1_tag = response.css('h1::text').get()
        canonical_tag = response.css('link[rel="canonical"]::attr(href)').get()
        meta_robots_tag = response.css('meta[name="robots"]::attr(content)').get()
        server_response = response.status

        yield {
            'url': url,
            'crawldate': crawldate,
            'title': title,
            'meta_description': meta_description,
            'h1_tag': h1_tag,
            'canonical_tag': canonical_tag,
            'meta_robots_tag': meta_robots_tag,
            'server_response': server_response,
        }


process = CrawlerProcess()

process.crawl(MyCrawlSpider)
process.start()

# Load the results into a Pandas DataFrame
result_df = pd.read_json('/content/myproject/output.json', lines=True)

# Display the DataFrame
result_df


INFO:scrapy.utils.log:Scrapy 2.11.0 started (bot: scrapybot)
2023-12-04 10:14:12 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform Linux-5.15.120+-x86_64-with-glibc2.35
2023-12-04 10:14:12 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform Linux-5.15.120+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2023-12-04 10:14:12 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls

Unnamed: 0,url,crawldate,title,meta_description,h1_tag,canonical_tag,meta_robots_tag,server_response
0,https://www.bergzeit.de/marken/salewa/,2023-12-04 09:45:49,Salewa Online Shop,Salewa im Bergzeit Online Shop ➤ Große Auswahl...,Salewa (855 Artikel),https://www.bergzeit.de/marken/salewa/,"index,follow",200
1,https://www.bergzeit.de/damen/bekleidung/westen/,2023-12-04 09:45:55,Westen für Damen online kaufen | Bergzeit,Entdecke Westen für Damen im Bergzeit Online S...,Westen Damen (282 Artikel),https://www.bergzeit.de/damen/bekleidung/westen/,"index,follow",200
2,https://www.bergzeit.de/damen/bekleidung/funkt...,2023-12-04 09:45:59,Funktionsunterwäsche für Damen online kaufen |...,Entdecke Funktionsunterwäsche für Damen im Ber...,Funktionsunterwäsche Damen (673 Artikel),https://www.bergzeit.de/damen/bekleidung/funkt...,"index,follow",200
3,https://www.bergzeit.de/damen/bekleidung/pullo...,2023-12-04 09:46:08,"Pullover, Hoodies, Shirts für Damen online kau...","Entdecke Pullover, Hoodies, Shirts für Damen i...","Pullover, Hoodies, Shirts Damen (2271 Artikel)",https://www.bergzeit.de/damen/bekleidung/pullo...,"index,follow",200
4,https://www.bergzeit.de/damen/bekleidung/hosen/,2023-12-04 09:46:14,Hosen für Damen online kaufen | Bergzeit,Entdecke Hosen für Damen im Bergzeit Online Sh...,Hosen Damen (2281 Artikel),https://www.bergzeit.de/damen/bekleidung/hosen/,"index,follow",200
...,...,...,...,...,...,...,...,...
404,https://www.bergzeit.de/p/whistler-herren-brad...,2023-12-04 10:14:56,Whistler Herren Bradley Merino Wool Longsleeve...,Herren Bradley Merino Wool Longsleeve von Whis...,\n,https://www.bergzeit.de/p/whistler-herren-brad...,"index,follow",200
405,https://www.bergzeit.de/p/icebreaker-herren-zo...,2023-12-04 10:14:56,Icebreaker Herren ZoneKnit Insulated Hoodie Lo...,Herren ZoneKnit Insulated Hoodie Longsleeve vo...,\n,https://www.bergzeit.de/p/icebreaker-herren-zo...,"index,follow",200
406,https://www.bergzeit.de/p/falke-damen-wt-trend...,2023-12-04 10:14:56,Falke Damen WT Trend Longsleeve kaufen | Bergzeit,Damen WT Trend Longsleeve von Falke im Bergzei...,\n,https://www.bergzeit.de/p/falke-damen-wt-trend...,"index,follow",200
407,https://www.bergzeit.de/p/icebreaker-herren-12...,2023-12-04 10:14:56,Icebreaker Herren 125 ZoneKnit Crew Longsleeve...,Herren 125 ZoneKnit Crew Longsleeve von Icebre...,\n,https://www.bergzeit.de/p/icebreaker-herren-12...,"index,follow",200
