# Scrape HLTV events

In [90]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import re

## Start a session
The session needs some cookies combined with a user agent to bypass the captcha.

In [91]:
# Request headers sent with every call made through the session.
# NOTE(review): the cookie values are hard-coded and were presumably copied
# from one browser visit; they may need refreshing before a new run - confirm.
sessionheaders = {
    'cookie': '__cfduid=db221c51a4d25bdcc5841cba7831c7b0b1550063953; _ga=GA1.2.1561812873.1550063955; _gid=GA1.2.1646402451.1551795847; _gat=1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
}

# One shared session, used by all scraping code below.
s = requests.Session()
s.headers.update(sessionheaders)

## Initiate the Dataframes

I am going to use the following dataframes:
 - eventsdf: To save all the data about the events.
  - event: Name of the event
  - code: Unique code of the event
  - prizepool: The prizes of the event
  - date: The date of the event
  - location: The location of the event, and also the type (LAN/online)
  - nteams: The number of teams that participated in the event
 - teamineventdf: To see which team attended which events.
  - team: Name of the team
  - teamcode: Unique code of the team
  - eventcode: Unique code of the event
  - placement: Placement of the team at the event
 - resultsdf: To save the results of all maps.
  - matchcode: The Unique code of the match
  - map: The map that the match was played on
  - team1 & team2: The teams participating in the map
  - score1 & score2: The score of the map
  - eventcode: The unique code of the event of the match
  - stars: The number of stars that was given to the match on HLTV
  - date: The date of the match
  - mapnumber: Indicates the position of the map within the match (1 for the first map, 2 for the second, and so on)
 - picksdf and bansdf: To save all the vetoes (map picks and map bans, respectively).
  - matchcode: The unique code of the match
  - team: The name of the team that made the veto
  - map: The name of the map that was picked or banned
  - priority: The sequence number of the veto

In [89]:
# Column layout of each table; the frames start empty and are filled row by
# row by the scraping functions.
eventcolumns = ['event', 'code', 'prizepool', 'date', 'location', 'nteams']
teamineventcolumns = ['team', 'teamcode', 'eventcode', 'placement']
resultcolumns = ['matchcode', 'map', 'team1', 'team2', 'score1', 'score2',
                 'eventcode', 'stars', 'date', 'mapnumber']
vetocolumns = ['matchcode', 'team', 'map', 'priority']

eventsdf = pd.DataFrame(columns=eventcolumns)
teamineventdf = pd.DataFrame(columns=teamineventcolumns)
resultsdf = pd.DataFrame(columns=resultcolumns)

# Picks and bans share one layout but are two independent frames.
picksdf = pd.DataFrame(columns=vetocolumns)
bansdf = pd.DataFrame(columns=vetocolumns)

# Markers used to classify a veto line as a ban ("removed") or a pick ("picked").
patternBan = re.compile("removed")
patternPick = re.compile("picked")

In [85]:
# Quick check: number of rows currently held in eventsdf.
print(len(eventsdf))

4


## Filtering the events
I am not going to scrape the matches of every event, only those of the important events. An event is important if at least one important team participated in it. The teamlist contains every team that was in the top 30 of the HLTV ranking on 04/03/2019 (the most recent ranking used here), 26/03/2018, 25/12/2017, or 26/06/2017. The three earlier dates were chosen at random.

In [38]:
# Collect the team codes of every team shown on the listed ranking pages.
# Each code is printed and stored the first time it is seen.
teamlist = []
urls = [
    "https://www.hltv.org/ranking/teams/2019/march/4",
    "https://www.hltv.org/ranking/teams/2018/march/26",
    "https://www.hltv.org/ranking/teams/2017/december/25",
    "https://www.hltv.org/ranking/teams/2017/june/26",
]
for url in urls:
    print(url)
    response = s.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    teams = soup.find_all('div', {"class":"ranked-team standard-box"})
    for team in teams:
        # The team code is the last path segment of the team logo's image URL.
        logo = team.find('span',{"class":"team-logo"}).find('img')
        teamcode = logo['src'].split('/')[-1]
        if teamcode in teamlist:
            continue  # already recorded from an earlier ranking page
        print(teamcode)
        teamlist.append(teamcode)

https://www.hltv.org/ranking/teams/2019/march/4
6665
5973
4608
4869
9215
6667
6211
4411
5752
9565
6673
7533
8120
4991
4494
5995
7801
7532
5310
8481
8297
9183
4863
8135
7606
7175
5005
8008
8637
8513
https://www.hltv.org/ranking/teams/2018/march/26
6137
5929
5378
6651
7367
8068
6094
6902
6947
7020
5906
7354
5991
https://www.hltv.org/ranking/teams/2017/december/25
6615
7557
6290
8474
https://www.hltv.org/ranking/teams/2017/june/26
7010
5974
5395
6010
5422
7613
7397


## The code for the scraping

In [92]:
# Running totals across the whole scrape: map results stored, veto lines
# seen, and HTTP requests issued. Updated by the scraping functions.
totalresults, totalvetos, totalrequests = 0, 0, 0
def scrapeResults(eventcode):
    """Scrape the map results and vetoes of every match listed for one event.

    Fetches the results listing for ``eventcode`` and then the page of each
    listed match. For every map holder on a match page that has a results
    block, one row is appended to ``resultsdf``. Veto lines containing
    "removed" are appended to ``bansdf`` and lines containing "picked" to
    ``picksdf``. The module-level counters ``totalresults``, ``totalvetos``
    and ``totalrequests`` are updated and a two-line summary is printed.

    Parameters
    ----------
    eventcode : int or str
        Event identifier. It is put into the URL via ``str(eventcode)`` and
        written to ``resultsdf`` exactly as passed in.

    Returns
    -------
    None

    Notes
    -----
    Missing page elements are not handled: a lookup that finds nothing
    returns ``None`` and the following attribute/item access raises. The only
    tolerated absence is a match page with fewer than two veto boxes, in
    which case no vetoes are recorded for that match.
    """
    global resultsdf, picksdf, bansdf
    global totalresults, totalrequests, totalvetos

    time.sleep(5)  # pause before requesting the event's results listing
    resulturl = "https://www.hltv.org/results?event="+str(eventcode)
    resultSoup = BeautifulSoup(s.get(resulturl).content, 'lxml')
    totalrequests = totalrequests + 1

    nresults = 0
    nvetos = 0
    for result in resultSoup.find_all("div", {"class":"result-con"}):
        # Everything that is read from the listing entry itself.
        matchhref = result.find("a", {"class":"a-reset"})['href']
        team1Name = result.find("div", {"class":"team1"}).text.strip()
        team2Name = result.find("div", {"class":"team2"}).text.strip()
        matchCode = matchhref.split('/')[2]
        # Number of star icons on the entry, stored as a string.
        stars = str(len(result.find_all("i", {"class":"fa fa-star star"})))

        # The match page supplies the date, the per-map scores and the vetoes.
        innersoup = BeautifulSoup(s.get("https://www.hltv.org" + matchhref).content, "lxml")
        totalrequests = totalrequests + 1
        date = innersoup.find("div",{"class":"date"}).text.strip()

        mapnumber = 1
        # Distinct loop names: the original reused `result`/`results` here,
        # rebinding the outer loop's variables while it was still running.
        for mapholder in innersoup.find_all("div", {"class": "mapholder"}):
            mapname = mapholder.find("div", {'class':'mapname'}).text.strip()
            mapresult = mapholder.find("div", {'class': 'results'})
            if mapresult is None:
                continue  # no results block for this map holder: nothing to store
            spans = mapresult.find_all("span")
            score1 = spans[0].text.strip()
            score2 = spans[2].text.strip()
            nresults = nresults + 1
            totalresults = totalresults + 1
            resultsdf.loc[len(resultsdf)] = [matchCode, mapname, team1Name, team2Name, score1, score2, eventcode, stars, date, mapnumber]
            mapnumber = mapnumber + 1

        # The veto lines are read from the second veto box. Checking the count
        # explicitly replaces a try/except IndexError that wrapped the whole
        # veto loop and would also have hidden unrelated IndexErrors.
        vetoboxes = innersoup.find_all("div",{'class':'standard-box veto-box'})
        if len(vetoboxes) > 1:
            vetos = vetoboxes[1].find('div').find_all('div')
            # priority numbers every veto line, including lines that are
            # neither a ban nor a pick.
            for priority, veto in enumerate(vetos, start=1):
                nvetos = nvetos + 1
                totalvetos = totalvetos + 1
                string = veto.text.strip()
                # [3:] drops the first three characters of the team part
                # (presumably a leading "1. " style numbering - TODO confirm).
                if patternBan.search(string):
                    splitstring = string.split(' removed ')
                    bansdf.loc[len(bansdf)] = [matchCode, splitstring[0][3:], splitstring[-1], priority]
                elif patternPick.search(string):
                    splitstring = string.split(' picked ')
                    picksdf.loc[len(picksdf)] = [matchCode, splitstring[0][3:], splitstring[-1], priority]

    print("results for event: "+ str(eventcode) + " scraped, maps scraped: "+str(nresults)+", vetos scraped: "+str(nvetos) )
    print("total results: " + str(totalresults) + "|| total vetos: "+str(totalvetos))

def scrapeEvent(eventurl):
    """Record one event and its team placements, then scrape its results if relevant.

    The event page at ``eventurl`` is fetched and one row is appended to
    ``eventsdf``. Each placement entry that links to a team adds one row to
    ``teamineventdf``. If any of those teams has its code in ``teamlist``,
    ``scrapeResults`` is called for the event; otherwise a "not scraped"
    message is printed.

    Parameters
    ----------
    eventurl : str
        Event page URL. Its last path segment is stored as the event name
        and the second-to-last as the event code.

    Returns
    -------
    None
    """
    global totalrequests

    eventsoup = BeautifulSoup(s.get(eventurl).content, "lxml")
    totalrequests += 1

    urlparts = eventurl.split("/")
    eventname = urlparts[-1]
    eventcode = urlparts[-2]

    prizepool = eventsoup.find("td", {"class":"prizepool text-ellipsis"}).text.strip()
    date = eventsoup.find("td", {"class":"eventdate"}).find("span").text.strip()
    locationcell = eventsoup.find("td", {"class":"location gtSmartphone-only"})
    location = locationcell.find("span", {"class":"text-ellipsis"}).text.strip()
    nteams = eventsoup.find("td", {"class":"teamsNumber"}).text.strip()
    eventsdf.loc[len(eventsdf)] = [eventname, eventcode, prizepool, date, location, nteams]

    haslistedteam = False
    for placemententry in eventsoup.find_all("div",{"class":"placement"}):
        try:
            teamhref = placemententry.find("div",{"class":"team"}).find("a")['href']
            hrefparts = teamhref.split('/')
            teamname = hrefparts[-1]
            teamcode = hrefparts[-2]
            if teamcode in teamlist:
                haslistedteam = True
            placement = placemententry.find_all("div")[1].text.strip()
            teamineventdf.loc[len(teamineventdf)] = [teamname, teamcode, eventcode, placement]
        except TypeError:
            # Raised when the entry has no team link (indexing None); such
            # entries are skipped.
            pass

    if not haslistedteam:
        print("event with eventcode: "+str(eventcode)+" not scraped")
        return
    scrapeResults(eventcode)

        
def scrapePage(url):
    """Scrape every event linked from one events-archive page.

    Fetches ``url``, walks each "events-month" section and calls
    ``scrapeEvent`` on every event link in it, sleeping one second after
    each event. Prints the page being scraped at the start and the running
    request total at the end.

    Parameters
    ----------
    url : str
        Address of the archive page to process.

    Returns
    -------
    None
    """
    global totalrequests

    print("scraping page: "+url)
    soup = BeautifulSoup(s.get(url).content, "lxml")
    totalrequests += 1

    for monthsection in soup.find_all("div", {"class": "events-month"}):
        eventlinks = monthsection.find_all("a", {"class": "a-reset small-event standard-box"})
        for eventlink in eventlinks:
            # hrefs on the page are site-relative
            scrapeEvent("https://www.hltv.org" + eventlink['href'])
            time.sleep(1)

    print("total requests so far: "+str(totalrequests))
            
def scrape(startpage=30, endpage=40, pagesize=50):
    """Scrape a range of events-archive pages, saving after every page.

    For each page number in ``range(startpage, endpage)`` the archive URL
    with ``offset = pagenumber * pagesize`` is passed to ``scrapePage`` and
    then ``save`` is called, so at most one page of work is unsaved if the
    run is interrupted.

    Parameters
    ----------
    startpage : int, optional
        First page number to scrape (inclusive). Defaults to 30.
    endpage : int, optional
        Page number to stop at (exclusive). Defaults to 40.
    pagesize : int, optional
        Offset step between consecutive archive pages. Defaults to 50
        (presumably the number of events per archive page - TODO confirm).

    Returns
    -------
    None
    """
    for pagenumber in range(startpage, endpage):
        offset = pagenumber*pagesize
        scrapePage("https://www.hltv.org/events/archive?offset="+str(offset))
        save()
        
def save():
    """Write the five tables to CSV files and print how long that took.

    The files are written to the current working directory under fixed
    names ending in ``2017-2.csv``; existing files are overwritten. The
    elapsed time is reported in whole milliseconds.

    Returns
    -------
    None
    """
    startsave = int(round(time.time() * 1000))
    # (frame, target file) pairs, written in this order
    outputs = (
        (resultsdf, 'results2017-2.csv'),
        (picksdf, 'picks2017-2.csv'),
        (bansdf, 'bans2017-2.csv'),
        (eventsdf, 'events2017-2.csv'),
        (teamineventdf, 'teaminevents2017-2.csv'),
    )
    for frame, filename in outputs:
        frame.to_csv(filename)
    endsave = int(round(time.time() * 1000))
    print("saved, duration of save in milliseconds: "+str(endsave - startsave))
        


## The script for the scraping

In [93]:
# Run the scrape over the archive pages; progress is printed per event and
# the tables are written to CSV after every page.
scrape()

scraping page: https://www.hltv.org/events/archive?offset=1500
event with eventcode: 2915 not scraped
event with eventcode: 2914 not scraped
event with eventcode: 2913 not scraped
results for event: 2911 scraped, maps scraped: 8, vetos scraped: 21
total results: 8|| total vetos: 21
event with eventcode: 2910 not scraped
results for event: 2757 scraped, maps scraped: 16, vetos scraped: 49
total results: 24|| total vetos: 70
results for event: 2898 scraped, maps scraped: 13, vetos scraped: 42
total results: 37|| total vetos: 112
results for event: 2724 scraped, maps scraped: 30, vetos scraped: 112
total results: 67|| total vetos: 224
results for event: 2722 scraped, maps scraped: 27, vetos scraped: 112
total results: 94|| total vetos: 336
results for event: 2841 scraped, maps scraped: 60, vetos scraped: 135
total results: 154|| total vetos: 471
results for event: 2860 scraped, maps scraped: 33, vetos scraped: 105
total results: 187|| total vetos: 576
results for event: 2872 scraped, maps

total requests so far: 1560
saved, duration of save in milliseconds: 29
scraping page: https://www.hltv.org/events/archive?offset=1600
results for event: 2782 scraped, maps scraped: 35, vetos scraped: 0
total results: 2131|| total vetos: 4774
event with eventcode: 2829 not scraped
event with eventcode: 2817 not scraped
event with eventcode: 2824 not scraped
results for event: 2793 scraped, maps scraped: 9, vetos scraped: 49
total results: 2140|| total vetos: 4823
event with eventcode: 2823 not scraped
results for event: 2790 scraped, maps scraped: 17, vetos scraped: 105
total results: 2157|| total vetos: 4928
event with eventcode: 2785 not scraped
results for event: 2786 scraped, maps scraped: 19, vetos scraped: 105
total results: 2176|| total vetos: 5033
event with eventcode: 2811 not scraped
results for event: 2641 scraped, maps scraped: 35, vetos scraped: 91
total results: 2211|| total vetos: 5124
event with eventcode: 2773 not scraped
event with eventcode: 2810 not scraped
event wi

event with eventcode: 2676 not scraped
event with eventcode: 2685 not scraped
results for event: 2659 scraped, maps scraped: 34, vetos scraped: 98
total results: 3963|| total vetos: 9792
results for event: 2664 scraped, maps scraped: 11, vetos scraped: 20
total results: 3974|| total vetos: 9812
results for event: 2671 scraped, maps scraped: 14, vetos scraped: 63
total results: 3988|| total vetos: 9875
results for event: 2682 scraped, maps scraped: 17, vetos scraped: 49
total results: 4005|| total vetos: 9924
event with eventcode: 2670 not scraped
event with eventcode: 2677 not scraped
results for event: 2657 scraped, maps scraped: 13, vetos scraped: 42
total results: 4018|| total vetos: 9966
results for event: 2655 scraped, maps scraped: 44, vetos scraped: 119
total results: 4062|| total vetos: 10085
event with eventcode: 2665 not scraped
event with eventcode: 2678 not scraped
results for event: 2649 scraped, maps scraped: 45, vetos scraped: 189
total results: 4107|| total vetos: 10274

event with eventcode: 2563 not scraped
event with eventcode: 2558 not scraped
results for event: 2560 scraped, maps scraped: 5, vetos scraped: 14
total results: 5413|| total vetos: 14330
results for event: 2554 scraped, maps scraped: 19, vetos scraped: 56
total results: 5432|| total vetos: 14386
results for event: 2551 scraped, maps scraped: 18, vetos scraped: 70
total results: 5450|| total vetos: 14456
event with eventcode: 2476 not scraped
results for event: 2432 scraped, maps scraped: 59, vetos scraped: 206
total results: 5509|| total vetos: 14662
results for event: 2556 scraped, maps scraped: 13, vetos scraped: 35
total results: 5522|| total vetos: 14697
event with eventcode: 2548 not scraped
event with eventcode: 2552 not scraped
results for event: 2504 scraped, maps scraped: 30, vetos scraped: 112
total results: 5552|| total vetos: 14809
results for event: 2431 scraped, maps scraped: 34, vetos scraped: 98
total results: 5586|| total vetos: 14907
event with eventcode: 2447 not scr

results for event: 2348 scraped, maps scraped: 39, vetos scraped: 122
total results: 7068|| total vetos: 18038
event with eventcode: 2405 not scraped
event with eventcode: 2404 not scraped
event with eventcode: 2389 not scraped
results for event: 2325 scraped, maps scraped: 43, vetos scraped: 188
total results: 7111|| total vetos: 18226
results for event: 2403 scraped, maps scraped: 5, vetos scraped: 0
total results: 7116|| total vetos: 18226
results for event: 2428 scraped, maps scraped: 6, vetos scraped: 42
total results: 7122|| total vetos: 18268
event with eventcode: 2424 not scraped
event with eventcode: 2422 not scraped
results for event: 2262 scraped, maps scraped: 69, vetos scraped: 54
total results: 7191|| total vetos: 18322
results for event: 2402 scraped, maps scraped: 29, vetos scraped: 42
total results: 7220|| total vetos: 18364
results for event: 2401 scraped, maps scraped: 15, vetos scraped: 0
total results: 7235|| total vetos: 18364
results for event: 2412 scraped, maps

## Combining the csv files

In [97]:
# Merge the 2018 and 2017 CSV of each table into one combined file.
csvs = ["events", "bans", "picks", "results", "teaminevents"]

for csv in csvs:
    path1 = "HLTV project/csvfiles/2018/"+csv+"2018.csv"
    path2 = "HLTV project/csvfiles/2017/"+csv+"2017.csv"
    finalpath = "HLTV project/csvfiles/2017-2019/"+csv+".csv"
    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True yields the same stacked frame with a fresh 0..n-1 index.
    # NOTE(review): if the inputs were written by to_csv() with its default
    # index, they carry an unnamed index column that is kept here - confirm
    # whether it should be dropped.
    combined = pd.concat([pd.read_csv(path1), pd.read_csv(path2)], ignore_index=True)
    combined.to_csv(finalpath)

## Tests
I tested the scrapeEvent function and the scrapeResults function, each on a single event. Both gave the output I expected.

In [36]:
# Manual check of scrapeEvent on one event page: show the two tables it fills.
scrapeEvent("https://www.hltv.org/events/4376/qiwi-teamplay-season-3")
for table in (eventsdf, teamineventdf):
    print(table)

# Manual check of scrapeResults on one event: show results, picks and bans.
scrapeResults(4322)
for table in (resultsdf, picksdf, bansdf):
    print(table)

  matchcode     map         team1     team2 score1 score2 eventcode stars  \
0   2330292  Mirage  PELAFUSTANES  Rejected     16     14      4322     0   
1   2330292   Dust2  PELAFUSTANES  Rejected     16      4      4322     0   

                   date mapnumber  
0  13th of January 2019         1  
1  13th of January 2019         2  
  matchcode          team     map priority
0   2330292      Rejected  Mirage        3
1   2330292  PELAFUSTANES   Dust2        4
  matchcode          team       map priority
0   2330292      Rejected     Cache        1
1   2330292  PELAFUSTANES   Inferno        2
2   2330292      Rejected     Train        5
3   2330292  PELAFUSTANES  Overpass        6
