Define the Scrapy item and spider

In [2]:
from datetime import datetime
from typing import Iterable
import scrapy
from scrapy_selenium import SeleniumRequest

# Output file formatting
class itemFormat(scrapy.Item):
    """Fields of one scraped card record, as exported by the spider's feed.

    Only ``card_name`` and ``percentage_in_decks`` are assigned by
    ``edhrecSpider.parse``; the remaining fields are declared but left unset.
    """
    
    card_name = scrapy.Field()  # text of the card-name span, with "ó" replaced by "o"
    percentage_in_decks = scrapy.Field()  # text of the card label, newline replaced by " || synergy "
    img = scrapy.Field()  # meant for the card image URL; its extraction is currently commented out
    specials = scrapy.Field()  # declared but never assigned in this notebook
    price_per_kg = scrapy.Field()  # never assigned; NOTE(review): looks like a leftover from another scraper — confirm
    
    
class edhrecSpider(scrapy.Spider):
    """Spider that collects card names and deck-usage labels from EDHREC commander pages.

    Every item is exported as JSON to two feeds: a timestamped archive file
    (``edhrecScrape_<YYYYmmddHHMMSS>.json``) and ``edhrecScrape.json``, the
    fixed path that the later notebook cells read.
    """
    name = 'searcher'

    custom_settings = {
        # FEEDS replaces the deprecated FEED_FORMAT / FEED_URI pair.
        'FEEDS': {
            # Per-run archive; the timestamp is taken when this class is defined.
            f'edhrecScrape_{datetime.now().strftime("%Y%m%d%H%M%S")}.json': {
                'format': 'json',
            },
            # Stable copy consumed by the following cells. It is overwritten on
            # every run because appending to an existing JSON file would leave
            # it unparseable.
            'edhrecScrape.json': {
                'format': 'json',
                'overwrite': True,
            },
        },
    }

    def start_requests(self) -> Iterable[scrapy.Request]:
        """Yield one ``SeleniumRequest`` per commander page to crawl."""
        # NOTE(review): a SeleniumRequest is only rendered in a browser when
        # scrapy_selenium's downloader middleware and SELENIUM_* settings are
        # configured; none are visible in this notebook — confirm.

        # Lists all URLs the spider will crawl; change this link to check any
        # commander you want.
        urls = ['https://edhrec.com/commanders/gimli-mournful-avenger']

        for url in urls:
            yield SeleniumRequest(url=url, callback=self.parse)
        # No trailing ``return super().start_requests()``: this method is a
        # generator, so a returned value would simply be discarded.

    def parse(self, response):
        """Yield one ``itemFormat`` per card wrapper found in ``response``.

        Wrappers without a card name are skipped; a missing label is stored
        as an empty string so one incomplete card cannot abort the callback.
        """
        for card in response.css('div.CardView_cardWrapper__DVSFy'):
            # ``get()`` takes a *default*, not an index: it returns the first
            # match, or None when the selector matches nothing.
            card_name = card.css('span.Card_name__Mpa7S::text').get()
            if card_name is None:
                self.logger.debug('Skipping card wrapper without a name')
                continue

            label = card.css('div.CardLabel_label__iAM7T::text').get(default='')

            card_item = itemFormat()
            card_item['card_name'] = card_name.replace("\u00f3", "o")
            card_item['percentage_in_decks'] = label.replace("\n", " || synergy ")
            # card_item['img'] = card.css('a img.CardImage_border__OcVcj.shadow::attr(src)').get()

            yield card_item

Run the spider to scrape the site


In [None]:
from scrapy.crawler import CrawlerProcess

# Build a crawler process that sends a fixed browser-style user agent.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
)

# Schedule the spider, then run the crawl; start() blocks until it has finished.
process.crawl(edhrecSpider)
process.start()

Show the scraped card list

In [None]:
import pandas as pd
# Load the spider's JSON output into a DataFrame and show it as the cell result.
dfjson = pd.read_json(path_or_buf='edhrecScrape.json')
dfjson

Sort the scraped data by card name

In [5]:
import json

def sort_json_file(file_path, key='card_name'):
    """Sort the records of a JSON array file in place.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            objects.
        key: Name of the attribute to sort by. Defaults to ``'card_name'``.

    Raises:
        KeyError: If a record does not contain ``key``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Sort the records by the requested attribute
    sorted_data = sorted(data, key=lambda record: record[key])

    # Serialise before reopening the file for writing, so a failure here
    # cannot leave the original file truncated.
    serialised = json.dumps(sorted_data, indent=4)

    # Write the sorted contents back to the same file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialised)

# Sort the scraped feed file in place.
sort_json_file(file_path='edhrecScrape.json')


Keep only the cards that appear in at least the user-specified percentage of decks

In [6]:
# Minimum deck-inclusion percentage to keep; the reply must parse as an integer.
threshold_reply = input("what percentage of cards do you want to see?")
whatPercentage = int(threshold_reply)

In [None]:
import pandas as pd
import re

# Read the scraped records from the JSON file into a pandas DataFrame
df = pd.read_json('edhrecScrape.json')

# Pull the first "<number>%" out of each label. The optional fractional part
# keeps a label such as "7.5%" from being read as 5.
deck_percentage = (
    df['percentage_in_decks']
    .str.extract(r'(\d+(?:\.\d+)?)%', expand=False)
    .astype(float)
)

# Keep the rows at or above the requested threshold. Labels without a
# percentage become NaN, compare as False, and are therefore dropped.
filtered_df = df[deck_percentage >= whatPercentage]

# Persist the filtered records, then reload them for display
filtered_df.to_json('filtered_data.json', orient='records')

filtered_search = pd.read_json('filtered_data.json')
filtered_search
