In [2]:
# Import libraries
import logging
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import logging
from itertools import chain

In [3]:
import threading


class MultiThreading(object):

    def __init__(self, scrapers):
        self.scrapers = scrapers

    def run(self):
        threads = []

        for i in range(len(self.scrapers)):
            t = threading.Thread(target=self.scrapers[i].start)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()


In [4]:
class Scraper(object):
    """
    Pull player info down from web
    """

    # Initialize array to store player
    players_scraped = []

    # Instantiate scraper
    def __init__(self, urls):
        self.urls = urls
        self.logger = logging.getLogger("sLogger")

    # request to get the url
    def get_page(self, url):
        response = requests.get(url)
        if response.status_code:
            soup = BeautifulSoup(response.content, "html.parser")
            return soup.find("tbody", {"class": "list"})
        else:
            self.logger.error("Error" + response.status_code)
            return None

    # helper method to get players
    def get_players(self,trs):
        return [extract_info(tr) for tr in trs]
    
    # helper method to get stats
    def get_stats(self,trs):
        return [extract_stats(tr) for tr in trs]

    # method to extract and copy player info from web
    def scrap(self, urls):
        for url in urls:
            tbody = self.get_page(url)
            if tbody is None:
                continue
            trs = tbody.findAll("tr")
            Scraper.players_scraped.append(self.get_players(trs))
            self.logger.info("Page{} scraped".format(len(Scraper.players_scraped)))
    # method to start the scraper
    def start(self):
        self.scrap(self.urls)

In [43]:

url = "https://sofifa.com/players?offset="
urls = [url + str(offset) for offset in range(0, 20040, 60)]
# Parameters
number_of_scraper = 5
pages = 10

scrapers = [Scraper(urls[pages * i:min(pages * (i + 1), len(urls))]) 
            for i in range(number_of_scraper)]
multi_threading = MultiThreading(scrapers)

#     multi_threading.run()


'https://sofifa.com/players?offset=1980'

In [6]:
def flatten(d):
    out = {}
    for key, val in d.items():
        if isinstance(val, dict):
            val = [val]
        if isinstance(val, list):
            for subdict in val:
                deeper = flatten(subdict).items()
                out.update({key2: val2 for key2, val2 in deeper})
        else:
            out[key] = val
    return out

In [14]:
t1 = time.time()
multi_threading.run()

players = list(map(flatten, (chain.from_iterable(Scraper.players_scraped))))

df_multi_thread = pd.DataFrame(players)
df_multi_thread.drop_duplicates(ignore_index = True)
print("Multi threading time taken: ", time.time() - t1)
df_multi_thread.head()


Multi threading time taken:  50.39705157279968


Unnamed: 0,name,country,age,overall,potential,club,best_position,value,wage,crossing,...,Strength,Long Shots,Defensive Awareness,Standing Tackle,Sliding Tackle,Diving,Handling,Kicking,Positioning,Reflexes
0,Aurélien Tchouaméni,France,21,81,88,AS Monaco,CM,€52M,€46K,68,...,82,72,80,83,78,9,8,12,12,7
1,Jan Vennegoor of Hess.,Netherlands,32,71,71,PSV,12,€850K,€10K,44,...,88,70,23,41,38,11,6,15,7,5
2,Alessandro Bastoni,Italy,22,83,89,Inter,CB,€53M,€100K,70,...,81,37,87,87,81,11,5,9,11,12
3,Jurriën Timber,Netherlands,20,79,88,Ajax,CB,€36M,€13K,59,...,80,46,82,83,75,6,6,8,14,14
4,Charles De Ketelaere,Belgium,20,77,87,Club Brugge KV,ST,€23.5M,€22K,74,...,75,74,45,61,57,11,8,14,8,8


In [13]:
# helper method for extracting data of the player

def extract_info(tr):
    base = "https://sofifa.com/"
    link = base + tr.select('td.col-name')[0].find("a").get("href")
    return {
        "name": tr.select('td.col-name')[0].find("a").get("aria-label"),
        "country": tr.select('td.col-name')[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": tr.select("td.col-name")[1].find("a").text,
        "best_position": tr.select('td.col-name')[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
        "stats": extract_stats(link)
    }


# helper method for extracting stats of single player

def extract_stats(link):
    new_res = requests.get(link)
    new_soup = BeautifulSoup(new_res.content, "html.parser")
    new_tbody = new_soup.find_all("div", {"class": "center"})[5]
    stats_block = new_tbody.findAll("div", {"class": "block-quarter"})
    return extract_deep(stats_block)


def extract_deep(stats_block):
    return {"Att": extract_att(stats_block[0]),
            "Skill": extract_skill(stats_block[1]), 
            "Move": extract_move(stats_block[2]),
            "Power": extract_pow(stats_block[3]),
            "Mentality": extract_mentality(stats_block[4]),
            "Defending": extract_def(stats_block[5]),
            "Goalkeep": extract_goalkeep(stats_block[6])
            }

In [7]:
def extract_att(att):
    return {"crossing": att.findAll("li")[0].find("span").text.strip(),
            "Finishing": att.findAll("li")[1].find("span").text.strip(),
            "Heading Accuracy":  att.findAll("li")[2].find("span").text.strip(),
            "Short passing": att.findAll("li")[3].find("span").text.strip(),
            "Volleys": att.findAll("li")[4].find("span").text.strip()
            }


def extract_skill(ski):
    return {"Dribbling": ski.findAll("li")[0].find("span").text.strip(),
            "Curve": ski.findAll("li")[1].find("span").text.strip(),
            "Fk Accuracy": ski.findAll("li")[2].find("span").text.strip(),
            "Long Passing": ski.findAll("li")[3].find("span").text.strip(),
            "Ball Control": ski.findAll("li")[4].find("span").text.strip()
            }


def extract_move(mov):
    return {"Acceleration": mov.findAll("li")[0].find("span").text.strip(),
            "Sprint Speed": mov.findAll("li")[1].find("span").text.strip(),
            "Agility": mov.findAll("li")[2].find("span").text.strip(),
            "Reactions": mov.findAll("li")[3].find("span").text.strip(),
            "Balance": mov.findAll("li")[4].find("span").text.strip()
            }


def extract_pow(pow):
    return {"Shot Power": pow.findAll("li")[0].find("span").text.strip(),
            "Jumping": pow.findAll("li")[1].find("span").text.strip(),
            "Stamina": pow.findAll("li")[2].find("span").text.strip(),
            "Strength": pow.findAll("li")[3].find("span").text.strip(),
            "Long Shots": pow.findAll("li")[4].find("span").text.strip()
            }


def extract_mentality(men):
    return {"Aggression": men.findAll("li")[0].find("span").text.strip(),
            "Interceptions": men.findAll("li")[1].find("span").text.strip(),
            "Positioning": men.findAll("li")[2].find("span").text.strip(),
            "Vision": men.findAll("li")[3].find("span").text.strip(),
            "Penalties": men.findAll("li")[4].find("span").text.strip(),
            "Composure": men.findAll("li")[5].find("span").text.strip()
            }


def extract_def(defe):
    return {"Defensive Awareness": defe.findAll("li")[0].find("span").text.strip(),
            "Standing Tackle": defe.findAll("li")[1].find("span").text.strip(),
            "Sliding Tackle": defe.findAll("li")[2].find("span").text.strip()
            }


def extract_goalkeep(gk):
    return {"Diving": gk.findAll("li")[0].find("span").text.strip(),
            "Handling": gk.findAll("li")[1].find("span").text.strip(),
            "Kicking": gk.findAll("li")[2].find("span").text.strip(),
            "Positioning": gk.findAll("li")[3].find("span").text.strip(),
            "Reflexes": gk.findAll("li")[4].find("span").text.strip()
            }



## Prototype

- Refactor the Scraper class, so that it has different branch of statements to handle either scrap surface only or scrap deep only

In [None]:
pls = []
url = "https://sofifa.com/players?offset="
urls = [url + str(offset) for offset in range(0, 30, 10)]
for url in urls:
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    tbody = soup.find("tbody", {"class": "list"})
    trs = tbody.findAll("tr")
    pls.append([extract_info(tr) for tr in trs])

####

first_url = pls[0][0]["link"]
first_url
r = requests.get(first_url)
sou = BeautifulSoup(r.content, "html.parser")
tb = soup.find_all("div", {"class": "center"})[5]
blo = tb.findAll("div", {"class" : "block-quarter"})
type(blo)

###

url = "https://sofifa.com/player/27488/jan-vennegoor-of-hess/120002/"
new_res = requests.get(url)
soup = BeautifulSoup(new_res.content, "html.parser")
tbody = soup.find_all("div", {"class": "center"})[5]
stats_block = tbody.findAll("div", {"class": "block-quarter"})

