# Scraping Training Deck Lists

In [3]:
import requests
import re
import time # needed to wait between requests
import random # needed to wait random time between requests
from bs4 import BeautifulSoup

In [2]:
# send an http request to mtgtop8.com for their modern page. f=MO specifies the modern format and meta=44 is all decks
modern_front_page = requests.get('http://mtgtop8.com/format?f=MO&meta=44')

modern_front_page.status_code

200

In [3]:
# put the html into a BeautifulSoup object to get anchors for each of the 'Last 10 Events'
modern_front_page_soup = BeautifulSoup(modern_front_page.content, 'html.parser')

event_list = [event['href'] for event in modern_front_page_soup.find_all('table')[2]
                  .find('td').find_next_sibling().find_all('table')[1].find_all('a')]

event_list

['event?e=18048&f=MO',
 'event?e=18038&f=MO',
 'event?e=18037&f=MO',
 'event?e=18029&f=MO',
 'event?e=18013&f=MO',
 'event?e=18012&f=MO',
 'event?e=17981&f=MO',
 'event?e=18011&f=MO',
 'event?e=18008&f=MO',
 'event?e=18010&f=MO']

In [4]:
# use map() and regular expressions to get the event id out of the anchor tags
event_ids = list(map(lambda x: re.search("e=(\d+)&", x).group(1), event_list))

event_ids

['18048',
 '18038',
 '18037',
 '18029',
 '18013',
 '18012',
 '17981',
 '18011',
 '18008',
 '18010']

In [6]:
# send a get request for each event_id
# modern events is a list of the html for each of the event pages
modern_events = []
for event_id in event_ids:
    modern_event = requests.get('http://mtgtop8.com/event?e={}'.format(event_id))
    modern_events.append(modern_event)
    print(modern_event.status_code)

200
200
200
200
200
200
200
200
200
200


In [9]:
# create a BeautifulSoup object for each of the event ids
# modern_event_soups is a list of BeautifulSoup objects for each of the event pages
modern_event_soups = []
for modern_event in modern_events:
    modern_event_soups.append(BeautifulSoup(modern_event.content, 'html.parser'))

In [18]:
# Get to the table that contains the links to all the deck lists
# deck_list_tables is a list of the tables containing the deck lists for each event page
deck_list_tables = []

for modern_event_soup in modern_event_soups:
    deck_list_table = modern_event_soup.find_all('table')[3].find_all('a')
    deck_list_tables.append(deck_list_table)


# deck_list_table = modern_event_soups.find_all('table')[3].find_all('a')

deck_list_tables[0]

[<a href="" target="_blank"></a>,
 <a class="topic" href="?e=18048&amp;d=311953&amp;f=MO">Dsg</a>,
 <a class="topic" href="search?player=Maxime+Valentin">Maxime Valentin</a>,
 <a href="?e=18048&amp;d=311954&amp;f=MO">Gw Elves</a>,
 <a class="player" href="search?player=Marc+Chassagne">Marc Chassagne</a>,
 <a href="?e=18048&amp;d=311952&amp;f=MO">Creature Toolbox</a>,
 <a class="player" href="search?player=Lo%EFc+Grenier">Loïc Grenier</a>,
 <a href="?e=18048&amp;d=311951&amp;f=MO">Jeskai Control</a>,
 <a class="player" href="search?player=Aur%E9lien+Leroy">Aurélien Leroy</a>,
 <a href="mtgo?d=311953&amp;f=Modern_Dsg_by_Maxime_Valentin">MTGO</a>,
 <a href="dec?d=311953&amp;f=Modern_Dsg_by_Maxime_Valentin">.dec</a>,
 <a href="archetype?a=636">Death's Shadow decks</a>,
 <a href="tcg_redir?f=MO&amp;c=2 Blood Crypt||4 Bloodstained Mire||2 Ceremonious Rejection||2 Collective Brutality||4 Death's Shadow||1 Disdainful Stroke||2 Dismember||3 Fatal Push||2 Gurmag Angler||2 Inquisition of Kozilek|

In [19]:
for item in deck_list_tables[0]:
    print(item)

<a href="" target="_blank"></a>
<a class="topic" href="?e=18048&amp;d=311953&amp;f=MO">Dsg</a>
<a class="topic" href="search?player=Maxime+Valentin">Maxime Valentin</a>
<a href="?e=18048&amp;d=311954&amp;f=MO">Gw Elves</a>
<a class="player" href="search?player=Marc+Chassagne">Marc Chassagne</a>
<a href="?e=18048&amp;d=311952&amp;f=MO">Creature Toolbox</a>
<a class="player" href="search?player=Lo%EFc+Grenier">Loïc Grenier</a>
<a href="?e=18048&amp;d=311951&amp;f=MO">Jeskai Control</a>
<a class="player" href="search?player=Aur%E9lien+Leroy">Aurélien Leroy</a>
<a href="mtgo?d=311953&amp;f=Modern_Dsg_by_Maxime_Valentin">MTGO</a>
<a href="dec?d=311953&amp;f=Modern_Dsg_by_Maxime_Valentin">.dec</a>
<a href="archetype?a=636">Death's Shadow decks</a>
<a href="tcg_redir?f=MO&amp;c=2 Blood Crypt||4 Bloodstained Mire||2 Ceremonious Rejection||2 Collective Brutality||4 Death's Shadow||1 Disdainful Stroke||2 Dismember||3 Fatal Push||2 Gurmag Angler||2 Inquisition of Kozilek||1 Island||1 Izzet Static

In [63]:
# deck_list_ids is a list of all the unique ids for the deck lists
deck_list_ids = [] 

# loop through each table of deck lists for each even tpage
for deck_list_table in deck_list_tables:
    # loop through each of the anchor tags in each of the deck list tables
    for anchor in deck_list_table:
        # find only anchors where 'e=' and 'd=' are in the href. This is only the actual deck lists
        if ('d=' in anchor['href'] and 'e=' in anchor['href']):
            # add the id to the list
            deck_list_ids.append(re.search('d=(\d+)&', anchor['href']).group(1)) 

In [None]:
prop_name['strong']

In [64]:
len(deck_list_ids)

55

In [76]:
# mtg_deck_lists is a dictionary with key=deck_list_id and value is a deck list
mtg_deck_lists = {}

# loop through each of the deck ids
for deck_list_id in deck_list_ids:
    # send a get request for each of the deck lists in mtgo format
    mtg_deck_list = requests.get('http://mtgtop8.com/mtgo?d={}'.format(deck_list_id), headers={'User-Agent': 'Getting some deck lists'})
    # add the id and deck lists to the dictionary
    mtg_deck_lists[deck_list_id] =  mtg_deck_list.content.decode().split('\r\n')
    
    # wait 2 seconds between each requests, hoping not to get blocked
    time.sleep(2 + random.random())

In [77]:
mtg_deck_lists

{'311539': ['4 Bloodghast',
  '2 Golgari Thug',
  '1 Haunted Dead',
  '4 Insolent Neonate',
  '4 Narcomoeba',
  '4 Prized Amalgam',
  '4 Stinkweed Imp',
  '4 Cathartic Reunion',
  '3 Conflagrate',
  '4 Faithless Looting',
  '3 Life from the Loam',
  '2 Darkblast',
  '3 Arid Mesa',
  '2 Blackcleave Cliffs',
  '1 Blood Crypt',
  '2 Bloodstained Mire',
  '3 Copperline Gorge',
  '2 Dakmor Salvage',
  '2 Gemstone Mine',
  '2 Mountain',
  '2 Stomping Ground',
  '2 Wooded Foothills',
  'Sideboard',
  '1 Darkblast',
  '2 Abrupt Decay',
  '2 Ancient Grudge',
  '2 Collective Brutality',
  '4 Leyline of the Void',
  '2 Lightning Axe',
  '2 Thoughtseize',
  ''],
 '311540': ['2 Karn Liberated',
  '3 Endbringer',
  '4 Matter Reshaper',
  '4 Reality Smasher',
  '4 Thought-Knot Seer',
  '4 Walking Ballista',
  '2 All Is Dust',
  '2 Dismember',
  '2 Warping Wail',
  '4 Chalice of the Void',
  '4 Expedition Map',
  '1 Relic of Progenitus',
  '2 Cavern of Souls',
  '4 Eldrazi Temple',
  '3 Ghost Quarter'

# Functionalizing the scraping process

In [85]:
def get_event_ids(front_page):
    """Takes in the front page of mtgtop8.com and returns a list of all the event ids
    
    INPUT:
        - front_page: BeautifulSoup object of the event page
        
    OUTPUT:
        - event_ids: List of all the event ids from the 'Last 10 Events' table"""

    event_list = [event['href'] for event in front_page.find_all('table')[2]
                  .find('td').find_next_sibling().find_all('table')[1].find_all('a')]

    # use map() and regular expressions to get the event id out of the anchor tags
    event_ids = list(map(lambda x: re.search("e=(\d+)&", x).group(1), event_list))

    return event_ids

In [82]:
response = requests.get('http://mtgtop8.com/format?f=MO&meta=44')
print(response.status_code)

front_page = BeautifulSoup(response.content, 'html.parser')

200


In [83]:
get_event_ids(front_page)

['18048',
 '18038',
 '18037',
 '18029',
 '18013',
 '18012',
 '17981',
 '18011',
 '18008',
 '18010']

In [93]:
def get_deck_ids(event_page):
    """Takes in an event page and returns a list of all the deck ids on that page.
    
    INPUT:
        - event_page: BeautifulSoup objet of the event page from mtgtop8.com
        
    OUTPUT:
        - deck_ids: List of all deck ids on the page"""
    
    # Get to the table that contains the links to all the deck lists
    deck_list_table = event_page.find_all('table')[3].find_all('a')


    # deck_list_ids is a list of all the unique ids for the deck lists
    deck_ids = [] 

    # loop through each of the anchor tags in each of the deck list tables
    for anchor in deck_list_table:
        # find only anchors where 'e=' and 'd=' are in the href. This is only the actual deck lists
        if ('d=' in anchor['href'] and 'e=' in anchor['href']):
            # add the id to the list
            deck_ids.append(re.search('d=(\d+)&', anchor['href']).group(1)) 
                
    return deck_ids

In [87]:
response = requests.get('http://mtgtop8.com/event?e=18048&f=MO')
print(response.status_code)

event_page = BeautifulSoup(response.content, 'html.parser')

200


In [94]:
get_deck_ids(event_page)

['311953', '311954', '311952', '311951']

In [95]:
def deck_request(deck_id):
    """Takes a deck_id and returns the response from the request. Errors will come later.
    
    INPUT:
        - deck_id: the unique id for the desired deck list from mtgtop8.com
        
    OUTPUT:
        - response: the response from the get request"""
    
    response = requests.get('http://mtgtop8.com/mtgo?d={}'.format(deck_id), 
                            headers={'User-Agent': 'Getting some deck lists'})
    
    # FUTURE WORK: check the status code
    return response

In [108]:
deck_request(311953).text.split('\r\n')

['2 Watery Grave',
 '1 Serum Visions',
 '4 Thought Scour',
 '1 Island',
 '4 Street Wraith',
 '4 Snapcaster Mage',
 '1 Swamp',
 "4 Death's Shadow",
 '4 Opt',
 '3 Fatal Push',
 '1 Temur Battle Rage',
 '4 Thoughtseize',
 '2 Tasigur, the Golden Fang',
 '2 Gurmag Angler',
 '1 Steam Vents',
 "2 Kolaghan's Command",
 '2 Blood Crypt',
 '2 Dismember',
 '3 Stubborn Denial',
 '4 Polluted Delta',
 '4 Bloodstained Mire',
 '2 Inquisition of Kozilek',
 '3 Scalding Tarn',
 'Sideboard',
 '1 Izzet Staticaster',
 '2 Young Pyromancer',
 '2 Liliana of the Veil',
 '1 Liliana, the Last Hope',
 '1 Temur Battle Rage',
 '1 Pyroclasm',
 '2 Ceremonious Rejection',
 "1 Kozilek's Return",
 '1 Disdainful Stroke',
 '2 Collective Brutality',
 '1 Stubborn Denial',
 '']

In [109]:
def event_request(event_id):
    """Takes an event_id and returns the response from the request. Errors will come later.
    
    INPUT:
        - event_id: the unique id for the desired event from mtgtop8.com
        
    OUTPUT:
        - response: the response from the get request"""
    
    response = requests.get('http://mtgtop8.com/event?e={}'.format(event_id), 
                            headers={'User-Agent': 'Getting some event info'})
    
    # FUTURE WORK: check the status code
    return response

In [115]:
get_deck_ids(BeautifulSoup(event_request(18048).text, 'html.parser'))

['311953', '311954', '311952', '311951']

In [117]:
def modern_front_page_request(page_number=0):
    """Sends a get request to mtgtop8.com with the given page number
    
    INPUT: 
        - page_number: argument for the get request. page_number of 3 gets decks from 21-30.
                       Default value of 0 to get decks from 1-10.
    
    OUTPUT:
        - response: the response from the get request"""
    
    response = requests.get('http://mtgtop8.com/format?f=MO&meta=44&cp={}'.format(page_number),
                            headers={'User-Agent': 'Modern front page request'})
    
    # FUTURE WORK: check the status code
    return response

In [119]:
front_page = BeautifulSoup(modern_front_page_request().text, 'html.parser')

In [120]:
get_event_ids(front_page)

['18048',
 '18038',
 '18037',
 '18029',
 '18013',
 '18012',
 '17981',
 '18011',
 '18008',
 '18010']

In [121]:
def event_requests(event_ids):
    """Takes a list of event ids and returns a list of responses from the requests. Errors will come later.
    
    INPUT:
        - event_ids: the list of unique ids for the desired events from mtgtop8.com
        
    OUTPUT:
        - responses: the list of responses from the get requests"""
    
    responses = []
    for event_ids in event_ids:
        response = event_request(event_id)
        responses.append(response)
    
    # FUTURE WORK: check the status code
    return responses

In [123]:
event_ids = get_event_ids(front_page)

event_ids

['18048',
 '18038',
 '18037',
 '18029',
 '18013',
 '18012',
 '17981',
 '18011',
 '18008',
 '18010']

In [125]:
event_requests(event_ids)

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

In [126]:
def deck_requests(deck_ids):
    """Takes a list of deck ids and returns the responses from the requests. Errors will come later.
    
    INPUT:
        - deck_ids: the unique ids for each ofthe desired deck lists from mtgtop8.com
        
    OUTPUT:
        - responses: the responses from the get requests"""
    
    responses = []
    for deck_id in deck_ids:
        response = deck_request(deck_id)
        responses.append(response)
    
    # FUTURE WORK: check the status code
    return responses

In [128]:
deck_ids = get_deck_ids(BeautifulSoup(event_request(18048).text, 'html.parser'))

deck_ids

['311953', '311954', '311952', '311951']

In [129]:
deck_requests(deck_ids)

[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>]

# Getting all Modern Legal Cards

In [5]:
# used to read all the modern cards from 
import json

In [179]:
modern_legal_cards = requests.get('https://api.deckbrew.com/mtg/cards?format=modern&page=112')

modern_legal_cards.status_code

200

In [180]:
modern_json = json.loads(modern_legal_cards.text)

modern_json

[{'cmc': 1,
  'colors': ['green'],
  'cost': '{G}',
  'editions': [{'artist': 'Tomasz Jedruszek',
    'flavor': 'On Mirrodin, every conflict ends in either death or darksteel.',
    'html_url': 'https://deckbrew.com/mtg/cards/209035',
    'image_url': 'https://image.deckbrew.com/mtg/multiverseid/209035.jpg',
    'layout': 'normal',
    'multiverse_id': 209035,
    'number': '134',
    'price': {'high': 0, 'low': 0, 'median': 0},
    'rarity': 'common',
    'set': 'Scars of Mirrodin',
    'set_id': 'SOM',
    'set_url': 'https://api.deckbrew.com/mtg/sets/SOM',
    'store_url': 'http://store.tcgplayer.com/magic/scars-of-mirrodin/withstand-death?partner=DECKBREW',
    'url': 'https://api.deckbrew.com/mtg/cards?multiverseid=209035',
    'watermark': 'Mirran'}],
  'formats': {'commander': 'legal',
   'legacy': 'legal',
   'modern': 'legal',
   'vintage': 'legal'},
  'id': 'withstand-death',
  'name': 'Withstand Death',
  'store_url': 'http://store.tcgplayer.com/magic/scars-of-mirrodin/withs

## DeckBrew API
The DeckBrew api works, but it won't be supported after May 2018. They recommended Scryfall.

In [238]:
page_number = 0
modern_legal_cards = []
while True:
    response = requests.get('https://api.deckbrew.com/mtg/cards?format=modern&page={}'.format(page_number))
    
    if response.status_code != 200:
        print('Bad status code: {}'.format(response.status_code))
        break
    cards = json.loads(response.text)
    
    if cards == []:
        print('Empty page on page #{}'.format(page_number))
        break
        
    modern_legal_cards.append(cards)
    page_number += 1
    break

# Scryfall API
Scryfall seems to work very well. [scryfall api](https://scryfall.com/docs/api)

In [4]:
response = requests.get('https://api.scryfall.com/cards/search?q=f:modern', 
                        headers={'User-Agent': 'Getting modern legal cards'})

response.status_code

200

In [6]:
response_json = json.loads(response.text)

In [7]:
total_pages = 0
url = 'https://api.scryfall.com/cards/search?q=format:modern'
modern_legal_cards_scryfall = []
while True:
    response = requests.get(url, headers={'User-Agent': 'Getting modern legal cards'})
    
    if response.status_code != 200:
        print('Bad Status code on page #{}: {}'.format(response.status_code))
        break
        
    cards = json.loads(response.text)
    
    if not cards['has_more']:
        print('All done! {} pages recieved.'.format(total_pages))
        break
    
    modern_legal_cards_scryfall.append(cards['data'])
    total_pages += 1
    url = cards['next_page']
    
    time.sleep(0.05 + random.random())
    
    print('Page request #{} successful! Onto the next.'.format(total_pages))

Page request #1 successful! Onto the next.
Page request #2 successful! Onto the next.
Page request #3 successful! Onto the next.
Page request #4 successful! Onto the next.
Page request #5 successful! Onto the next.
Page request #6 successful! Onto the next.
Page request #7 successful! Onto the next.
Page request #8 successful! Onto the next.
Page request #9 successful! Onto the next.
Page request #10 successful! Onto the next.
Page request #11 successful! Onto the next.
Page request #12 successful! Onto the next.
Page request #13 successful! Onto the next.
Page request #14 successful! Onto the next.
Page request #15 successful! Onto the next.
Page request #16 successful! Onto the next.
Page request #17 successful! Onto the next.
Page request #18 successful! Onto the next.
Page request #19 successful! Onto the next.
Page request #20 successful! Onto the next.
Page request #21 successful! Onto the next.
Page request #22 successful! Onto the next.
Page request #23 successful! Onto the nex

In [8]:
card_dict = {}
for page in modern_legal_cards_scryfall:
    for card in page:
        # 'id' is scryfalls unique ID
        card_dict[card['name']] = card['id']

In [14]:
card_dict['Lightning Bolt']

'ccee0b4c-0cb0-4c0f-8ddc-bc74b8b97273'

# CSV Format

I don't like the CSV format. It gives very little information that would be really useful for the future.

In [211]:
response = requests.get('https://api.scryfall.com/cards/search?q=format:modern&format=csv')

response.status_code

200

In [218]:
response.text

'multiverse_id,mtgo_id,set,collector_number,rarity,name,mana_cost,cmc,type_line,artist,usd,eur,tix,image_uri,scryfall_uri\n430847,64840,HOU,158,R,Abandoned Sarcophagus,{3},3.0,Artifact,Daarken,0.25,0.19,0.1,https://img.scryfall.com/cards/large/en/hou/158.jpg?1508728404,https://scryfall.com/card/hou/158\n414412,61408,EMN,115,U,Abandon Reason,{2}{R},3.0,Instant,Josh Hass,0.18,0.02,0.01,https://img.scryfall.com/cards/large/en/emn/115.jpg?1509843896,https://scryfall.com/card/emn/115\n409626,,DDQ,50,U,Abattoir Ghoul,{3}{B},4.0,Creature â\x80\x94 Zombie,Volkan Baga,0.16,0.04,,https://img.scryfall.com/cards/large/en/ddq/50.jpg?1509842399,https://scryfall.com/card/ddq/50\n235595,42612,ISD,1,C,Abbey Griffin,{3}{W},4.0,Creature â\x80\x94 Griffin,Jaime Jones,0.12,0.03,0.02,https://img.scryfall.com/cards/large/en/isd/1.jpg?1512049208,https://scryfall.com/card/isd/1\n398411,58042,ORI,127,R,Abbot of Keral Keep,{1}{R},2.0,Creature â\x80\x94 Human Monk,Deruchenko Alexander,0.69,0.95,0.12,https://img.s

In [213]:
from io import StringIO

In [219]:
TESTDATA=StringIO(response.text)

df = pd.read_csv(TESTDATA, sep=",")

In [223]:
df.set_index(keys='multiverse_id', inplace=True)

In [236]:
df

Unnamed: 0_level_0,mtgo_id,set,collector_number,rarity,name,mana_cost,cmc,type_line,artist,usd,eur,tix,image_uri,scryfall_uri
multiverse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
430847,64840.0,HOU,158,R,Abandoned Sarcophagus,{3},3.0,Artifact,Daarken,0.25,0.19,0.10,https://img.scryfall.com/cards/large/en/hou/15...,https://scryfall.com/card/hou/158
414412,61408.0,EMN,115,U,Abandon Reason,{2}{R},3.0,Instant,Josh Hass,0.18,0.02,0.01,https://img.scryfall.com/cards/large/en/emn/11...,https://scryfall.com/card/emn/115
409626,,DDQ,50,U,Abattoir Ghoul,{3}{B},4.0,Creature â Zombie,Volkan Baga,0.16,0.04,,https://img.scryfall.com/cards/large/en/ddq/50...,https://scryfall.com/card/ddq/50
235595,42612.0,ISD,1,C,Abbey Griffin,{3}{W},4.0,Creature â Griffin,Jaime Jones,0.12,0.03,0.02,https://img.scryfall.com/cards/large/en/isd/1....,https://scryfall.com/card/isd/1
398411,58042.0,ORI,127,R,Abbot of Keral Keep,{1}{R},2.0,Creature â Human Monk,Deruchenko Alexander,0.69,0.95,0.12,https://img.scryfall.com/cards/large/en/ori/12...,https://scryfall.com/card/ori/127
409790,59886.0,SOI,49,U,Aberrant Researcher // Perfected Form,{3}{U},4.0,Creature â Human Insect // Creature â Inse...,Nils Hamm,0.20,0.04,0.01,https://img.scryfall.com/cards/large/en/soi/49...,https://scryfall.com/card/soi/49
373661,50400.0,THS,75,R,Abhorrent Overlord,{5}{B}{B},7.0,Creature â Demon,Slawomir Maniak,0.25,0.15,0.01,https://img.scryfall.com/cards/large/en/ths/75...,https://scryfall.com/card/ths/75
386463,54416.0,KTK,159,C,Abomination of Gudul,{3}{B}{G}{U},6.0,Creature â Horror,Erica Yang,0.12,0.02,0.01,https://img.scryfall.com/cards/large/en/ktk/15...,https://scryfall.com/card/ktk/159
430772,64650.0,HOU,83,U,Abrade,{1}{R},2.0,Instant,Jonas De Ro,1.39,1.16,0.96,https://img.scryfall.com/cards/large/en/hou/83...,https://scryfall.com/card/hou/83
425971,63379.0,MM3,146,R,Abrupt Decay,{B}{G},2.0,Instant,Svetlin Velinov,3.64,4.40,3.08,https://img.scryfall.com/cards/large/en/mm3/14...,https://scryfall.com/card/mm3/146


In [None]:
response.