## This is a web scrapping for raw data using BeautifulSoup

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from itertools import chain

Set up global variables and Dataframe from pandas to store values that we are going to retrieve/scrap from and use for loop to repeat it

In [23]:
players_scraped = []
url = "https://sofifa.com/players?offset=1"
res = requests.get(url)
soup = BeautifulSoup(res.content, "html.parser")
tbody = soup.find("tbody", {"class": "list"})
trs = tbody.findAll("tr")
players_scraped.append([extract_info(tr) for tr in trs])

# Convert list of lists to single list
# flatten = lambda x: list(chain.from_iterable(x))

# # Convert to df
# df = pd.DataFrame(flatten(players_scraped))
# df.drop_duplicates(inplace=True, ignore_index=True)

# df.head()

In [22]:
def extract_info(tr):
    base = "https://sofifa.com/"
    link = base + tr.select('td.col-name')[0].find("a").get("href")
    return {
        "stats" : extract_stats(link),
        "name": tr.select('td.col-name')[0].find("a").get("aria-label"),
        "country": tr.select('td.col-name')[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": tr.select("td.col-name")[1].find("a").text,
        "best_position": tr.select('td.col-name')[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
    }

In [21]:
def extract_stats(link):
    new_res = requests.get(link)
    soup = BeautifulSoup(new_res.content, "html.parser")
    tbody = soup.find_all("div", {"class":"center"})[5]
    stats_block = tbody.findAll("div", {"class":"block-quarter"})
    return extract_deep(stats_block)

def extract_deep(tr):
    return {"Att": extract_att(tr[0]) 
        , "Skill" : extract_skill(tr[1]) 
        , "Move" : extract_move(tr[2])
        , "Power" : extract_pow(tr[3])
        , "Mentality" : extract_mentality(tr[4])
        , "Defending" : extract_def(tr[5])
        , "Goalkeep" : extract_goalkeep(tr[6])} 
    
    

In [11]:
def extract_att(tr):
    return {"Crossing" : tr.findAll("li")[0].find("span").text.strip(),
           "Finishing" : tr.findAll("li")[1].find("span").text.strip(),
           "Heading Accuracy":  tr.findAll("li")[2].find("span").text.strip(),
           "Short passing":tr.findAll("li")[3].find("span").text.strip(),
           "Volleys": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_skill(tr):
    return {"Dribbling" : tr.findAll("li")[0].find("span").text.strip(),
           "Curve" : tr.findAll("li")[1].find("span").text.strip(),
           "Fk Accuracy": tr.findAll("li")[2].find("span").text.strip(),
           "Long Passing": tr.findAll("li")[3].find("span").text.strip(),
           "Ball Control": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_move(tr):
    return {"Acceleration" : tr.findAll("li")[0].find("span").text.strip(),
           "Sprint Speed" : tr.findAll("li")[1].find("span").text.strip(),
           "Agility": tr.findAll("li")[2].find("span").text.strip(),
           "Reactions": tr.findAll("li")[3].find("span").text.strip(),
           "Balance": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_pow(tr):
    return {"Shot Power" : tr.findAll("li")[0].find("span").text.strip(),
           "Jumping" : tr.findAll("li")[1].find("span").text.strip(),
           "Stamina": tr.findAll("li")[2].find("span").text.strip(),
           "Strength": tr.findAll("li")[3].find("span").text.strip(),
           "Long Shots": tr.findAll("li")[4].find("span").text.strip()
            }

def extract_mentality(tr):
    return {"Aggression" : tr.findAll("li")[0].find("span").text.strip(),
           "Interceptions" : tr.findAll("li")[1].find("span").text.strip(),
           "Positioning": tr.findAll("li")[2].find("span").text.strip(),
           "Vision": tr.findAll("li")[3].find("span").text.strip(),
           "Penalties": tr.findAll("li")[4].find("span").text.strip(),
           "Composure": tr.findAll("li")[5].find("span").text.strip()
            }

def extract_def(tr):
    return {"Defensive Awareness" : tr.findAll("li")[0].find("span").text.strip(),
           "Standing Tackle" : tr.findAll("li")[1].find("span").text.strip(),
           "Sliding Tackle": tr.findAll("li")[2].find("span").text.strip()
            }

def extract_goalkeep(tr):
    return {"Diving" : tr.findAll("li")[0].find("span").text.strip(),
           "Handling" : tr.findAll("li")[1].find("span").text.strip(),
           "Kicking": tr.findAll("li")[2].find("span").text.strip(),
           "Positioning": tr.findAll("li")[3].find("span").text.strip(),
           "Reflexes": tr.findAll("li")[4].find("span").text.strip()
            }
    


In [None]:
url = "https://sofifa.com/players?offset=1"
columns = ["ID", "Name", "Age", "Position", "Nationality", "Overall",
          "Potential", "Club", "Value", "Wage"]
data = pd.DataFrame(columns = columns)

src_code = requests.get(url)
plain_text = src_code.text
soup = BeautifulSoup(plain_text,"html.parser")
tbody = soup.find("tbody")
trs = tbody.findAll("tr")


In [None]:
base_url = "https://sofifa.com/players?offset="
columns = ["ID", "Name", "Age", "Position", "Nationality", "Overall",
          "Potential", "Club", "Value", "Wage"]

data = pd.DataFrame(columns = columns)
p_urls = []
for offset in range(0, 3):
    url = base_url + str(offset * 60)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    table_body = soup.find("tbody")
    base = "https://sofifa.com/"
    for row in table_body.findAll("tr"):
        p_link = tr.select('td.col-name')[0].find("a").get("href")
        p_url = base + p_link
        td = row.findAll("td")
        pid = td[0].find("img").get("id")
        nationality = td[1].find("img").get("title")
        name = td[1].find("a").get("aria-label")
        rel = td[1].findAll("a", {"rel" : "nofollow"})
        pos = rel[0].findAll("span")
        for span in pos:
            positions= (span.text.split) ## FIX HERE
        age = td[2].text
        overall = td[3].text.strip()
        potential = td[4].text.strip()
        club = td[5].find("a").text
        value = td[6].text.strip()
        wage = td[7].text.strip()
        # store the retrieved 
        player_data = pd.DataFrame([[pid, name, age, positions,nationality, overall,potential, club,value,wage]])
        player_data.columns = columns
        data = pd.concat([data, player_data])
    print("done for "+str(offset), end = "\r")
    p_urls.append(format(p_url))
data.drop_duplicates()
data.head()
data.to_csv('../data/player_data.csv', encoding='utf-8-sig')
    

done for 2

In [None]:
# Use this to deep scrape info (NOT DONE) check later)
# You need to first go surface to scrap and get new url within page 
# of the players and start scraping from there
# So might a different lists or dictionaries to store new urls
# And finish scarping it

In [1]:
base = "https://sofifa.com/"
base_url = "https://sofifa.com/players?offset="
p_urls = []
for offset in range(0, 5):
    url = base_url + str(offset * 60)
    res = requests.get(url)
    text = res.text
    soup = BeautifulSoup(text, "html.parser")
    tab_body = soup.find("tbody")
    trs = tab_body.findAll("tr")
    for tr in trs2:
        link = tr.select('td.col-name')[0].find("a").get("href")
        p_url = base + link
        if (p_url not in p_urls):
            p_urls.append(format(p_url))


NameError: name 'requests' is not defined

In [32]:
tab_body = soup.find("tbody")
trs2 = tab_body.findAll("tr")
for tr in trs2:
    a = tr.select('td.col-name')[0].find("a").get("href")
base + a

'https://sofifa.com//player/246618/adam-hlozek/220051/'

In [None]:
#att_block = stats_block[0]
#skill_block = stats_block[1]
#movement_block = stats_block[2]
#power_block = stats_block[3]
#mentality_block = stats_block[4]
#defending_block = stats_block[5]
#goalkeeping_block = stats_block[6]

#tr = skill_block
#tr.findAll("li")[0]
#stats.append([extract_stats(tr) for tr in trs])

# Convert list of lists to single list
#flatten = lambda x: list(chain.from_iterable(x))

# Convert to df
# df = pd.DataFrame(flatten(stats))
# df.drop_duplicates(inplace=True, ignore_index=True)

# df.head()
