<a href="https://colab.research.google.com/github/DannyZid/custom-seo-crawler/blob/main/scrapy_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Scrapy in Colab
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.0-py2.py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.4/286.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Twisted<23.8.0,>=18.9.0 (from scrapy)
  Downloading Twisted-22.10.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.1.0-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.8.1-py2.py3-none-any.whl (17 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-23.1.0-py3-none-any.whl (12 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.1.

In [4]:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from datetime import datetime
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import time
import pandas as pd

class MyCrawlSpider(CrawlSpider):
    name = 'SEOcrawlspider'
    allowed_domains = ['soulbuddylook.com']
    start_urls = ['https://www.soulbuddylook.com/']

    rules = (
        Rule(LinkExtractor(allow=None), callback='parse_item', follow=True),
    )

    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [429],
        'RETRY_AFTER': 90,
        'USER_AGENT': 'SEO_BZ_Spider',
        'FEED_FORMAT': 'jsonlines',
        'FEED_URI': '/content/myproject/output.json',
        'LOG_LEVEL': 'INFO',
    }

    def parse_item(self, response):
        url = response.url
        crawldate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        title = response.css('title::text').get()
        meta_description = response.css('meta[name="description"]::attr(content)').get()
        h1_tag = response.css('h1::text').get()
        canonical_tag = response.css('link[rel="canonical"]::attr(href)').get()
        meta_robots_tag = response.css('meta[name="robots"]::attr(content)').get()
        server_response = response.status

        yield {
            'url': url,
            'crawldate': crawldate,
            'title': title,
            'meta_description': meta_description,
            'h1_tag': h1_tag,
            'canonical_tag': canonical_tag,
            'meta_robots_tag': meta_robots_tag,
            'server_response': server_response,
        }


process = CrawlerProcess()

process.crawl(MyCrawlSpider)
process.start()

# Load the results into a Pandas DataFrame
result_df = pd.read_json('/content/myproject/output.json', lines=True)

# Display the DataFrame
result_df


INFO:scrapy.utils.log:Scrapy 2.11.0 started (bot: scrapybot)
2023-11-30 10:08:13 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Linux-5.15.120+-x86_64-with-glibc2.35
2023-11-30 10:08:13 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Linux-5.15.120+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2023-11-30 10:08:13 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls

Unnamed: 0,url,crawldate,title,meta_description,h1_tag,canonical_tag,meta_robots_tag,server_response
0,https://www.soulbuddylook.com/,2023-11-30 10:08:16,Partner Caps und Beanies als ideales Geschenk ...,Jetzt Partner Caps als besonderes Geschenk onl...,PARTNER CAPS & BEANIES– ERLEBT GEMEINSAM EURE ...,https://www.soulbuddylook.com/,"index,follow",200
1,https://www.soulbuddylook.com/Partnerlook/Part...,2023-11-30 10:08:23,Partnerlook für Mutter & Sohn » passend zu jed...,Coole Partnerlooks für Mütter & Söhne » Ob als...,Partnerlook für Mutter und Sohn,https://www.soulbuddylook.com/Partnerlook/Part...,"index,follow",200
2,https://www.soulbuddylook.com/Partnerlook/Part...,2023-11-30 10:08:23,Partnerlook für Vater & Sohn » Zeigt eure Verb...,Definiert euren gemeinsamen Style mit dem Part...,Partnerlook für Vater & Sohn,https://www.soulbuddylook.com/Partnerlook/Part...,"index,follow",200
3,https://www.soulbuddylook.com/cap-organizer/ac...,2023-11-30 10:08:23,Ordne deine Lieblings-Caps mit dem Cap Organizer,Du sammelst Caps und suchst nach einer Aufbewa...,\n Cap Organizer\n ...,https://www.soulbuddylook.com/cap-organizer/ac...,"index,follow",200
4,https://www.soulbuddylook.com/geschenkgutschei...,2023-11-30 10:08:23,Online Gutschein von Soulbuddy zum selbst ausd...,Du möchtest deinen Liebsten ein tolles Geschen...,\n Geschenkgutschein - ...,https://www.soulbuddylook.com/geschenkgutschei...,"index,follow",200
...,...,...,...,...,...,...,...,...
326,https://www.soulbuddylook.com/kids-trucker-cap...,2023-11-30 10:13:33,Trucker Cap für Kinder | Soulbuddy,Stylische Trucker Caps für Kinder & Babys ☞ al...,\n Kids Trucker Cap - S...,https://www.soulbuddylook.com/kids-trucker-cap...,"index,follow",200
327,https://www.soulbuddylook.com/kids-trucker-cap...,2023-11-30 10:13:34,Trucker Cap für Kinder | Soulbuddy,Stylische Trucker Caps für Kinder & Babys ☞ al...,\n Kids Trucker Cap - R...,https://www.soulbuddylook.com/kids-trucker-cap...,"index,follow",200
328,https://www.soulbuddylook.com/blog/mutter-toch...,2023-11-30 10:13:36,Mutter Tochter Outfit,Inspirationen für das perfekte Partnerlook Mut...,\n MUTTER TOCHTER O...,https://www.soulbuddylook.com/blog/mutter-toch...,"index,follow",200
329,https://www.soulbuddylook.com/blog/erster-vate...,2023-11-30 10:13:36,Erster Vatertag - Papa Baby Geschenk,Es ist der erste Vatertag für eure kleine Fami...,\n ERSTER VATERTAG ...,https://www.soulbuddylook.com/blog/erster-vate...,"index,follow",200
