In [120]:
# Import libraries
import logging
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import logging
from itertools import chain

In [135]:
import threading


class MultiThreading(object):

    def __init__(self, scrapers):
        self.scrapers = scrapers

    def run(self):
        threads = []

        for i in range(len(self.scrapers)):
            t = threading.Thread(target=self.scrapers[i].start)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()

In [144]:
class Scraper(object):
    """
    Pull player info down from web
    """

    # Initialize array to store player (surface)
    players_scraped = []

    # Initialize array to store player stats (deep)
    stats_scraped = []

    # Instantiate scraper
    def __init__(self, urls):
        self.urls = urls
        self.logger = logging.getLogger("sLogger")

    # request to get the url
    def get_page(self, url):
        response = requests.get(url)
        if response.status_code:
            soup = BeautifulSoup(response.content, "html.parser")
            return soup.find("tbody", {"class": "list"})
        else:
            self.logger.error("Error" + response.status_code)
            return None

    # helper method to get players
    def get_players(self,trs):
        return [extract_info(tr) for tr in trs]
    
    # helper method to get stats
    def get_stats(self,trs):
        return [extract_stats(tr) for tr in trs]

    # method to extract and copy player info from web
    def scrap(self, urls):
        for url in urls:
            tbody = self.get_page(url)
            if tbody is None:
                continue
            trs = tbody.findAll("tr")
            Scraper.players_scraped.append(self.get_players(trs))
            self.logger.info("Page{} scraped".format(len(Scraper.players_scraped)))
    # method to start the scraper
    def start(self):
        self.scrap(self.urls)

In [145]:
params = {"ae": "0", "oa": "1", "pt": "2", "vl": "3", "wg": "4", "bp": "5"}
query = "&".join([f"showCol%5B{y}%5D={x}" for x, y in params.items()])
url = f"https://sofifa.com/players?{query}&offset="
urls = [url + str(offset) for offset in range(0, 120, 60)]
# Parameters
number_of_scraper = 31
pages = 10

scrapers = [Scraper(urls[pages * i:min(pages * (i + 1), len(urls))]) 
            for i in range(number_of_scraper)]
multi_threading = MultiThreading(scrapers)

#     multi_threading.run()


In [146]:
t1 = time.time()
multi_threading.run()
flatten = lambda x: list(chain.from_iterable(x))
df_multi_thread = pd.DataFrame(flatten(Scraper.players_scraped))
df_multi_thread.drop_duplicates(inplace=True, ignore_index=True)
p_urls = df_multi_thread[["link"]].values.tolist()
df_multi_thread = df_multi_thread.drop("link",axis = 1)

print("Multi threading time taken: ", time.time() - t1)
df_multi_thread.head()


Multi threading time taken:  1.7391860485076904


Unnamed: 0,name,country,age,overall,potential,club,best_position,value,wage
0,Josh Sargent,United States,21,69,74,Norwich City,RM,€2.1M,€12K
1,Mauro Icardi,Argentina,28,79,79,Paris Saint-Germain,ST,€18M,€88K
2,Alfie Devine,England,16,60,85,Tottenham Hotspur,CAM,€775K,€1K
3,Kylian Mbappé,France,22,91,95,Paris Saint-Germain,ST,€194M,€230K
4,Filip Ugrinic,Switzerland,22,74,83,FC Luzern,CM,€9.5M,€13K


In [165]:
test_url = p_urls[0][0]
test_url

#stats = []
res = requests.get(test_url)
text = res.text
soup = BeautifulSoup(text, "html.parser")
tbody = soup.find_all("div", {"class":"center"})[5]

stats_block = tbody.findAll("div", {"class":"block-quarter"})
stats = extract_stats(stats_block)

half = Scraper.players_scraped[0][0]
half

## Testing creation of df


{'link': 'https://sofifa.com//player/242075/josh-sargent/220053/',
 'name': 'Josh Sargent',
 'country': 'United States',
 'age': '21',
 'overall': '69',
 'potential': '74',
 'club': 'Norwich City',
 'best_position': 'RM',
 'value': '€2.1M',
 'wage': '€12K'}

In [83]:
# helper method for extracting data of the player

def extract_info(tr):
    base = "https://sofifa.com/"
    return {
        "link": base + tr.select('td.col-name')[0].find("a").get("href"),
        "name": tr.select('td.col-name')[0].find("a").get("aria-label"),
        "country": tr.select('td.col-name')[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": tr.select("td.col-name")[1].find("a").text,
        "best_position": tr.select('td.col-name')[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
    }
    

# helper method for extracting stats of single player

# Based on these codes:
# tbody = soup.find_all("div", {"class":"center"})[5]
# tr = tbody.findAll("div", {"class":"block-quarter"})
 # stats = extract_stats(tr)
def extract_stats(tr):
    return {"Att": extract_att(tr[0]) 
    , "Skill" : extract_skill(tr[1]) 
    , "Move" : extract_move(tr[2])
    , "Power" : extract_pow(tr[3])
    , "Mentality" : extract_mentality(tr[4])
    , "Defending" : extract_def(tr[5])
    , "Goalkeep" : extract_goalkeep(tr[6])} 
    

In [63]:
def extract_att(tr):
    return {"Crossing" : tr.findAll("li")[0].find("span").text.strip(),
           "Finishing" : tr.findAll("li")[1].find("span").text.strip(),
           "Heading Accuracy":  tr.findAll("li")[2].find("span").text.strip(),
           "Short passing":tr.findAll("li")[3].find("span").text.strip(),
           "Volleys": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_skill(tr):
    return {"Dribbling" : tr.findAll("li")[0].find("span").text.strip(),
           "Curve" : tr.findAll("li")[1].find("span").text.strip(),
           "Fk Accuracy": tr.findAll("li")[2].find("span").text.strip(),
           "Long Passing": tr.findAll("li")[3].find("span").text.strip(),
           "Ball Control": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_move(tr):
    return {"Acceleration" : tr.findAll("li")[0].find("span").text.strip(),
           "Sprint Speed" : tr.findAll("li")[1].find("span").text.strip(),
           "Agility": tr.findAll("li")[2].find("span").text.strip(),
           "Reactions": tr.findAll("li")[3].find("span").text.strip(),
           "Balance": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_pow(tr):
    return {"Shot Power" : tr.findAll("li")[0].find("span").text.strip(),
           "Jumping" : tr.findAll("li")[1].find("span").text.strip(),
           "Stamina": tr.findAll("li")[2].find("span").text.strip(),
           "Strength": tr.findAll("li")[3].find("span").text.strip(),
           "Long Shots": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_mentality(tr):
    return {"Aggression" : tr.findAll("li")[0].find("span").text.strip(),
           "Interceptions" : tr.findAll("li")[1].find("span").text.strip(),
           "Positioning": tr.findAll("li")[2].find("span").text.strip(),
           "Vision": tr.findAll("li")[3].find("span").text.strip(),
           "Penalties": tr.findAll("li")[4].find("span").text.strip(),
           "Composure": tr.findAll("li")[5].find("span").text.strip()
            }

def extract_def(tr):
    return {"Defensive Awareness" : tr.findAll("li")[0].find("span").text.strip(),
           "Standing Tackle" : tr.findAll("li")[1].find("span").text.strip(),
           "Sliding Tackle": tr.findAll("li")[2].find("span").text.strip()
            }

def extract_goalkeep(tr):
    return {"Diving" : tr.findAll("li")[0].find("span").text.strip(),
           "Handling" : tr.findAll("li")[1].find("span").text.strip(),
           "Kicking": tr.findAll("li")[2].find("span").text.strip(),
           "Positioning": tr.findAll("li")[3].find("span").text.strip(),
           "Reflexes": tr.findAll("li")[4].find("span").text.strip()
            }
    


## Prototype

- Refactor the Scraper class, so that it has different branch of statements to handle either scrap surface only or scrap deep only