## This is a web scrapping for raw data using BeautifulSoup

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from itertools import chain

Set up global variables and Dataframe from pandas to store values that we are going to retrieve/scrap from and use for loop to repeat it

In [None]:
players_scraped = []
url = "https://sofifa.com/players?offset=1"
res = requests.get(url)
text = res.text
soup = BeautifulSoup(text, "html.parser")
tbody = soup.find("tbody", {"class": "list"})
trs = tbody.findAll("tr")
players_scraped.append([extract_info(tr) for tr in trs])

# Convert list of lists to single list
flatten = lambda x: list(chain.from_iterable(x))

# Convert to df
df = pd.DataFrame(flatten(players_scraped))
df['value'] = df['value'].apply(convert_into_val)
df.drop_duplicates(inplace=True, ignore_index=True)

df.head()

In [2]:
def extract_info(tr):
    return {
        "name": tr.select('td.col-name')[0].find("a").get("aria-label"),
        "country": tr.select('td.col-name')[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": tr.select("td.col-name")[1].find("a").text,
        # "height": tr.select('td.col.col-hi')[0].text.strip(),
        # "weight": tr.select('td.col.col-wi')[0].text.strip(),
        # "foot": tr.select('td.col.col-pf')[0].text.strip(),
        "best_position": tr.select('td.col-name')[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
        # "PAC": tr.select('td.col.col-pac')[0].text.strip(),
        # "SHO": tr.select('td.col.col-sho')[0].text.strip(),
        # "PAS": tr.select('td.col.col-pas')[0].text.strip(),
        # "DRI": tr.select('td.col.col-dir')[0].text.strip(),
        # "DEF": tr.select('td.col.col-def')[0].text.strip(),
        # "PHY": tr.select('td.col.col-phy')[0].text.strip()
    }

In [None]:
url = "https://sofifa.com/players?offset=1"
columns = ["ID", "Name", "Age", "Position", "Nationality", "Overall",
          "Potential", "Club", "Value", "Wage"]
data = pd.DataFrame(columns = columns)

src_code = requests.get(url)
plain_text = src_code.text
soup = BeautifulSoup(plain_text,"html.parser")
tbody = soup.find("tbody")
trs = tbody.findAll("tr")


In [None]:
base_url = "https://sofifa.com/players?offset="
columns = ["ID", "Name", "Age", "Position", "Nationality", "Overall",
          "Potential", "Club", "Value", "Wage"]

data = pd.DataFrame(columns = columns)
p_urls = []
for offset in range(0, 3):
    url = base_url + str(offset * 60)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    table_body = soup.find("tbody")
    base = "https://sofifa.com/"
    for row in table_body.findAll("tr"):
        p_link = tr.select('td.col-name')[0].find("a").get("href")
        p_url = base + p_link
        td = row.findAll("td")
        pid = td[0].find("img").get("id")
        nationality = td[1].find("img").get("title")
        name = td[1].find("a").get("aria-label")
        rel = td[1].findAll("a", {"rel" : "nofollow"})
        pos = rel[0].findAll("span")
        for span in pos:
            positions= (span.text.split) ## FIX HERE
        age = td[2].text
        overall = td[3].text.strip()
        potential = td[4].text.strip()
        club = td[5].find("a").text
        value = td[6].text.strip()
        wage = td[7].text.strip()
        # store the retrieved 
        player_data = pd.DataFrame([[pid, name, age, positions,nationality, overall,potential, club,value,wage]])
        player_data.columns = columns
        data = pd.concat([data, player_data])
    print("done for "+str(offset), end = "\r")
    p_urls.append(format(p_url))
data.drop_duplicates()
data.head()
data.to_csv('../data/player_data.csv', encoding='utf-8-sig')
    

done for 2

In [None]:
# Use this to deep scrape info (NOT DONE) check later)
# You need to first go surface to scrap and get new url within page 
# of the players and start scraping from there
# So might a different lists or dictionaries to store new urls
# And finish scarping it

In [44]:
base = "https://sofifa.com/"
base_url = "https://sofifa.com/players?offset="
p_urls = []
for offset in range(0, 5):
    url = base_url + str(offset * 60)
    res = requests.get(url)
    text = res.text
    soup = BeautifulSoup(text, "html.parser")
    tab_body = soup.find("tbody")
    trs = tab_body.findAll("tr")
    for tr in trs2:
        link = tr.select('td.col-name')[0].find("a").get("href")
        p_url = base + link
        if (p_url not in p_urls):
            p_urls.append(format(p_url))
p_urls

['https://sofifa.com//player/208574/filip-kostic/220051/',
 'https://sofifa.com//player/195479/james-tavernier/220051/',
 'https://sofifa.com//player/257540/ansgar-knauff/220051/',
 'https://sofifa.com//player/188943/kevin-trapp/220051/',
 'https://sofifa.com//player/224520/ryan-kent/220051/',
 'https://sofifa.com//player/231044/joe-aribo/220051/',
 'https://sofifa.com//player/236403/evan-ndicka/220051/',
 'https://sofifa.com//player/247090/enzo-fernandez/220051/',
 'https://sofifa.com//player/181098/makoto-hasebe/220051/',
 'https://sofifa.com//player/247182/lucas-silva-melo/220051/',
 'https://sofifa.com//player/245741/jesper-lindstrom/220051/',
 'https://sofifa.com//player/240663/miguel-angel-merentiel/220051/',
 'https://sofifa.com//player/247377/marcos-antonio-silva-santos/220051/',
 'https://sofifa.com//player/253072/darwin-nunez/220051/',
 'https://sofifa.com//player/241637/aurelien-tchouameni/220051/',
 'https://sofifa.com//player/264240/pablo-martin-paez-gavira/220051/',
 'htt

In [32]:
tab_body = soup.find("tbody")
trs2 = tab_body.findAll("tr")
for tr in trs2:
    a = tr.select('td.col-name')[0].find("a").get("href")
base + a

'https://sofifa.com//player/246618/adam-hlozek/220051/'