### Working aiothhtp



#### First, create offsets of urls

In [42]:
# List to store new created offset urls
all_urls = []
# Base url that serves as preffix of all offset urls
offset_base = "https://sofifa.com/players?offset="
# Loop to append offset number at very end of the offset base
# Define a n to set range of when to stop adding
n = 20040 # n should be large enough
for offset in range(0, n, 60):
    all_urls.append(offset_base + str(offset))

In [6]:
# Prints first 5 links 
all_urls[0:5]

['https://sofifa.com/players?offset=0',
 'https://sofifa.com/players?offset=60',
 'https://sofifa.com/players?offset=120',
 'https://sofifa.com/players?offset=180',
 'https://sofifa.com/players?offset=240']

#### Next, try assgining number of offset links for each scraper.
For example:
    
   Let $S_{x}$ be scraper $x$, and let all_urls = $O$ s.t. :
       
   - $S_1$ scraps O[0:10]
   - $S_2$ scraps O[11:20]
   - ...
   - $S_x$ scraps O[x-10 : x]

This way we could assign to $x$ numbers of scrapers certain numbers of offset links evenly distributed, and each scraper just parse the links they have. 

In [3]:
import asyncio
import time
import aiohttp
from bloom_filter import BloomFilter
import lxml.html as lx

In [4]:
# This needs to be run if in interactive kernel like jupyter
import nest_asyncio
nest_asyncio.apply()

In [43]:
# Instatiate list to store hyperlinks of each individual player
player_links = []
# Instatiate bloomfilter to check duplicates of player links
bloom = BloomFilter(max_elements=30000, error_rate=0.1)

##### Function to download (change this word) each url

In [44]:
async def download(url):
    async with aiohttp.ClientSession() as session:
        url = await fetch(session, url)
        await parse_offset(url)

##### Function to fetch and requests to GET

In [45]:
async def fetch(session, url):
    async with session.get(url) as response:
        assert response.status == 200
        return await response.text(encoding="utf-8")

##### Function to parse each url (This case our offset link)

In [None]:
async def parse_offset(url):
    doc = lx.fromstring(url)
    out = listing()
    for path in out:
        href = doc.xpath(path)[0]
        # This checks the version (2 digits) of the player 
        ver = href.split(sep="/")[4][0:2]
        # If the player is from older version that isnt 22 then ignore
        if ver != "22":
            continue
        # Defines prefix of the p_url that shares everyone
        base = "https://sofifa.com"
        p_url = base + href
        # Checks if this player if already contains in our bloom set to check duplicate
        if bloom.__contains__(p_url):
            # print(f"This url is duplicated {p_url}")
            continue
        player_links.append(p_url)
        bloom.add(p_url)
        

###### Helper function for parse_offset

In [46]:
def listing():
    out = []
    ref = '//*[@id="body"]/div[1]/div/div[2]/div/table/tbody/tr[i]/td[2]/a[1]/@href'
    old = ref.split(sep="/")[9]
    for i in range(60):
        new = f"tr[{i+1}]"
        p = ref.replace(old, new)
        out.append(p)
    return out

#### Running the asynchronus process with coroutine

##### Changes the variable of n to control length of all_urls list

In [48]:
# List to store new created offset urls
all_urls = []
# Base url that serves as preffix of all offset urls
offset_base = "https://sofifa.com/players?offset="
# Loop to append offset number at very end of the offset base
# Define a n to set range of when to stop adding
n = 120 # n should be large enough
for offset in range(0, n, 60):
    all_urls.append(offset_base + str(offset))

##### Actual running

In [169]:
class Scraper():
    def __init__(self):
        self._player_links = []
        self._offset_links = []
        self._listing = []
        self._bloom = BloomFilter(max_elements=30000, error_rate=0.1)
    
    def listing(self):
        ref = '//*[@id="body"]/div[1]/div/div[2]/div/table/tbody/tr[i]/td[2]/a[1]/@href'
        old = ref.split(sep="/")[9]
        for i in range(60):
            new = f"tr[{i+1}]"
            p = ref.replace(old, new)
            self._listing.append(p)
        #return self._listing
    
    def get_offsets(self, n):
        # Base url that serves as preffix of all offset urls
        offset_base = "https://sofifa.com/players?offset="
        # Loop to append offset number at very end of the offset base
        # Define a n to set range of when to stop adding
        # n should be large enough
        for offset in range(0, n, 60):
            self._offset_links.append(offset_base + str(offset))
            
    async def download(self, url):
        async with aiohttp.ClientSession() as session:
            url = await fetch(session, url)
            await parse_offset(url)
   
    async def fetch(self, session, url):
        async with session.get(url) as response:
            assert response.status == 200
            return await response.text(encoding="utf-8")
    
    async def parse_offset(self, url):
        doc = lx.fromstring(url)
        assert len(out) != 0
        out = self._listing
        for path in out:
            href = doc.xpath(path)[0]
            # This checks the version (2 digits) of the player 
            ver = href.split(sep="/")[4][0:2]
            # If the player is from older version that isnt 22 then ignore
            if ver != "22":
                continue
            # Defines prefix of the p_url that shares everyone
            base = "https://sofifa.com"
            p_url = base + href
            # Checks if this player if already contains in our bloom set to check duplicate
            if self._bloom.__contains__(p_url):
                # print(f"This url is duplicated {p_url}")
                continue
            self._player_links.append(p_url)
            self._bloom.add(p_url)
    
    @property
    def player_links(self):
        return self._player_links
    @property
    def bloom(self):
        return self._bloom
    @property
    def offset_links(self):
        return self._offset_links
    @property
    def lists(self):
        return self._listing

In [170]:
print("#" * 20)
# starts timer
# Use this function to clear existing bloom and player links'
sc = Scraper()
t1 = time.time()
sc.get_offsets(n=120)
sc.listing()
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(sc.download(url)) for url in sc.offset_links]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
# ends timer
t2 = time.time()
print(f"Using coroutine took: {t2 - t1} s")
print("#" * 20)


####################
Using coroutine took: 0.4755258560180664 s
####################


In [171]:
sc.player_links

[]

###### helper to clear exisiting variables

In [32]:
def clear_storage():
    # Instatiate list to store hyperlinks of each individual player
    player_links = []
    # Instatiate bloomfilter to check duplicates of player links
    bloom = BloomFilter(max_elements=30000, error_rate=0.1)

### Others 