# Import packages

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import pymongo
from pymongo import MongoClient
 

# Scrape data

In [2]:
#scraping basic info

url0 = "https://sofifa.com/players?offset="
columns = ['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special']
data = pd.DataFrame(columns = columns)

#60 players displayed per page (355 pages)
for offset in range(0, 355):
    
    url_basic = url0 + str(offset * 60)
    source_code = requests.get(url_basic)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    table_body = soup.find('tbody')
    
    
    for row in table_body.findAll('tr'):
        td = row.findAll('td')
        picture = td[0].find('img').get('data-src')
        pid = td[0].find('img').get('id')
        nationality = td[1].find('img').get('title')
        flag_img = td[1].find('img').get('data-src')
        name = td[1].find("a").get("data-tooltip")
        age = td[2].text
        overall = td[3].text.strip()
        potential = td[4].text.strip()
        club = td[5].find('a').text
        club_logo = td[5].find('img').get('data-src')
        value = td[6].text.strip()
        wage = td[7].text.strip()
        special = td[8].text.strip()
        player_data = pd.DataFrame([[pid, name, age, picture, nationality, flag_img, overall, potential, club, club_logo, value, wage, special]])
        player_data.columns = columns
        data = data.append(player_data, ignore_index=True)
    print("done for "+str(offset),end="\r")

data = data.drop_duplicates()
data.head()


done for 354

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special
0,212198,Bruno Miguel Borges Fernandes,25,https://cdn.sofifa.com/players/212/198/21_60.png,Portugal,https://cdn.sofifa.com/flags/pt.png,88,91,Manchester United,https://cdn.sofifa.com/teams/11/30.png,€121M,€240K,2347
1,232363,Milan Škriniar,25,https://cdn.sofifa.com/players/232/363/21_60.png,Slovakia,https://cdn.sofifa.com/flags/sk.png,86,88,Inter,https://cdn.sofifa.com/teams/44/30.png,€75.5M,€140K,1839
2,261025,Dane Scarlett,16,https://cdn.sofifa.com/players/261/025/21_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,62,88,Tottenham Hotspur,https://cdn.sofifa.com/teams/18/30.png,€1.6M,€3K,1486
3,253473,Samuele Ricci,18,https://cdn.sofifa.com/players/253/473/21_60.png,Italy,https://cdn.sofifa.com/flags/it.png,67,85,Empoli,https://cdn.sofifa.com/teams/1746/30.png,€2.6M,€600,1769
4,225719,Kelechi Iheanacho,23,https://cdn.sofifa.com/players/225/719/21_60.png,Nigeria,https://cdn.sofifa.com/flags/ng.png,76,81,Leicester City,https://cdn.sofifa.com/teams/95/30.png,€11.5M,€63K,1928


In [6]:
#more detailed information requires the player specific page
detailed_columns = ['Preferred Foot','Weak Foot','Skill Moves','International Reputation','Work Rate','Body Type','Real Face','Release Clause','Position','Jersey Number','Joined','Contract Valid Until','Height','Weight','LS','ST','RS','LW','LF','CF','RF','RW','LAM','CAM','RAM','LM','LCM','CM','RCM','RM','LWB','LDM','CDM','RDM','RWB','LB','LCB','CB','RCB','RB','GK','Likes','Dislikes','Following','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys','Dribbling','Curve','FK Accuracy','Long Passing','Ball Control','Acceleration','Sprint Speed','Agility','Reactions','Balance','Shot Power','Jumping','Stamina','Strength','Long Shots','Aggression','Interceptions','Positioning','Vision','Penalties','Composure','Defensive Awareness','Standing Tackle','Sliding Tackle','GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes']
detailed_data = pd.DataFrame(index = range(0, data.count()[0]), columns = detailed_columns)
detailed_data["ID"] = data["ID"].values

player_url = 'https://sofifa.com/player/'
count = 0
for id in data["ID"]:
    url = player_url + str(id)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    skill_map = {}
    columns = soup.find("div", {"class":"columns"})
    columns12 = columns.find_all("div",{"class":"column col-12"})
    for column in columns12:
        skills = column.findAll('li')
        for skill in skills:
            if(skill.find('label') != None):
                label = skill.find('label').text
                value = skill.text.replace(label, '').strip()
                skill_map[label] = value
    meta_data = soup.find('div', {'class': 'meta'}).text.split(' ')
    length = len(meta_data)
    weight = meta_data[length - 1]
    height = meta_data[length - 2].split('\'')[0] + '\'' + meta_data[length - 2].split('\'')[1].split('\"')[0]
    skill_map["Height"] = height
    skill_map['Weight'] = weight
    if('Position' in skill_map.keys()):
        if skill_map['Position'] in ('', 'RES', 'SUB'):
            skill_map['Position'] = soup.find('div', {'class': 'meta bp3-text-overflow-ellipsis'}).find('span').text
        if(skill_map['Position'] != 'GK'):
            card_rows = soup.find("div",{"class":"lineup"}).find_all("div",{"class":"column col-sm-2"})
            for attribute in card_rows:
                if(attribute.find('div')):
                    name = ''.join(re.findall('[a-zA-Z]', attribute.text))
                    value = attribute.text.replace(name, '').strip()
                    skill_map[str(name)] = value
    skill_map["Likes"] = columns12[3].find("button",{"class":"bp3-button like-btn need-sign-in"}).find("span",{"class":"count"}).text
    skill_map["Dislikes"] = columns12[3].find("button",{"class":"bp3-button dislike-btn need-sign-in"}).find("span",{"class":"count"}).text
    skill_map["Following"] = columns12[3].find("button",{"rel":"nofollow"}).find("span",{"class":"count"}).text
    name = []
    value = []
    columns3 = columns.find_all("ul",{"class":"pl"})
    switch = 0
    for column in columns3[3:]:
        for li in column.find_all("li"):
            text = li.text
            name.append(text[2:].strip(" ").rstrip())
            value.append(text[:2].strip(" ").rstrip())
    for name, value in zip(name[:-2],value[:-2]):
        skill_map[name] = value
    count = count + 1
    print("Loaded so far: "+str(count)+"/"+str(data.shape[0]), end="\r")
    for key, value in skill_map.items():
        detailed_data.loc[detailed_data["ID"] == id, key] = value

Loaded so far: 68/18250

KeyboardInterrupt: 

In [None]:
full_data = pd.merge(data, detailed_data.iloc[:,:79], how = 'inner', on = 'ID')
full_data

# Send to MongoDB

In [5]:
#Send to MongoDB
client = pymongo.MongoClient("mongodb+srv://bryanodonohoe:mongomongo@cluster0.laxah.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
 

db = client.bryan_db1
print(db)
coll = db.fifa_player_ratings

coll.insert_many(full_data.apply(lambda x: x.to_dict(), axis=1).to_list())

Database(MongoClient(host=['cluster0-shard-00-00.laxah.mongodb.net:27017', 'cluster0-shard-00-01.laxah.mongodb.net:27017', 'cluster0-shard-00-02.laxah.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-i83ime-shard-0', ssl=True), 'bryan_db1')


<pymongo.results.InsertManyResult at 0x22b5aef49c0>