In [15]:
# helpers (RUN FIRST)

def pipeline(df):
    df['value'] = df['value'].apply(convert_into_val)
     # Drop duplicates
    df.drop_duplicates(inplace=True, ignore_index=True)

def convert_into_val(value):
    value = value.strip('€')
    if value[-1] == 'M':
        return float(value[:-1]) * 1e6
    elif value[-1] == 'K':
        return float(value[:-1]) * 1e3
    else:
        return int(value)
    
def extract_info(tr):
    return {
        "name": tr.select('td.col-name')[0].find("a").get("aria-label"),
         "country": tr.select('td.col-name')[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": tr.select("td.col-name")[1].find("a").text,
        #"height": tr.select('td.col.col-hi')[0].text.strip(),
        #"weight": tr.select('td.col.col-wi')[0].text.strip(),
        #"foot": tr.select('td.col.col-pf')[0].text.strip(),
        "best_position": tr.select('td.col-name')[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
        #"PAC": tr.select('td.col.col-pac')[0].text.strip(),
        #"SHO": tr.select('td.col.col-sho')[0].text.strip(),
        #"PAS": tr.select('td.col.col-pas')[0].text.strip(),
        #"DRI": tr.select('td.col.col-dir')[0].text.strip(),
        #"DEF": tr.select('td.col.col-def')[0].text.strip(),
        #"PHY": tr.select('td.col.col-phy')[0].text.strip()
    }

# def flatten(x):
#     list(chain.from_iterable(x))
    
def job(url):
    r = requests.get(url)
    content = r.text
    soup = BeautifulSoup(content, "html.parser")
    tbody = soup.find("tbody", {"class": "list"})
    trs = tbody.findAll("tr")
    players_scraped.append([extract_info(tr) for tr in trs])


## Time comparison on scraping

In [21]:
import time 
import requests
import pandas as pd
from itertools import chain
from datetime import datetime
from bs4 import BeautifulSoup

# single thread 
t1 = time.time()
players_scraped = []


base_url = "https://sofifa.com/players?offset="
for offset in range(0, 100):
    url = base_url + str(offset * 60)
    job(url)

# Convert to df
flatten = lambda x: list(chain.from_iterable(x))
df = pd.DataFrame(flatten(players_scraped))
df.drop_duplicates(inplace=True, ignore_index=True)

print("Single thread time taken: ", time.time() - t1)
df.head()

Single thread time taken:  52.347819566726685


Unnamed: 0,name,country,age,overall,potential,club,best_position,value,wage
0,Branco van den Boomen,Netherlands,25,73,76,Toulouse Football Club,CM,€4M,€7K
1,Rafael da Conceição Leão,Portugal,22,82,90,AC Milan,LW,€68.5M,€52K
2,Theo Hernández,France,23,84,89,AC Milan,LB,€59.5M,€51K
3,Darwin Núñez,Uruguay,22,79,87,SL Benfica,ST,€39.5M,€16K
4,Paulo Dybala,Argentina,27,87,87,Juventus,CF,€86.5M,€160K


In [23]:
import time 
import requests
import threading

# Multi threading

base_url = "https://sofifa.com/players?offset="
def job_multi():
    for offest in range(0,100):
        url = base_url + str(offset*60)
    r = requests.get(url)
    content = r.text
    soup = BeautifulSoup(content, "html.parser")
    tbody = soup.find("tbody", {"class": "list"})
    trs = tbody.findAll("tr")
    players_scraped.append([extract_info(tr) for tr in trs])
    
t1 = time.time()
t_list = []
players_scraped_multi_thread = []

for i in range(100):
    t = threading.Thread(target=job_multi, args=())
    t_list.append(t)
    t.start()
    
for t in t_list:
    t.join()

flatten = lambda x: list(chain.from_iterable(x))
df_multi_thread = pd.DataFrame(flatten(players_scraped_multi_thread))
df_multi_thread.drop_duplicates(inplace=True, ignore_index=True)

print("Multi threading time taken: ", time.time() - t1)
df_multi_thread.head()

Multi threading time taken:  28.957924604415894


In [None]:
import requests
import time 
import multiprocessing
from multiprocessing import Pool
# Multi process

MAX_WORKER_NUM = multiprocessing.cpu_count()
def fetch():
    r = requests.get("https://sofifa.com/players?offset=1")
    # print(r.text)
if __name__ == "__main__":
    t1 = time.time()
    p = Pool(MAX_WORKER_NUM)
    for i in range(100):
        p.apply_async(fetch,args=())
    p.close()
    p.join()
    
    print("Multi process time taken: ", time.time() - t1)

In [24]:
import aiohttp
import asyncio
import time
import nest_asyncio

nest_asyncio.apply()

async def fetch(client):
    async with client.get("https://sofifa.com/players?offset=1") as resp:
        assert resp.status == 200
        return await resp.text()
async def main():
    async with aiohttp.ClientSession() as client:
        html = await fetch(client)
        #print(html)

loop = asyncio.get_event_loop()
tasks = []

for i in range(100):
    task = loop.create_task(main())
    tasks.append(task)

t1 = time.time()
loop.run_until_complete(main())

print("aiohttp time taken: ", time.time() - t1)

aiohttp time taken:  1.210798978805542
