# Capology Player Web Scraping 

Notebook to scrape raw data from [Capology](https://www.capology.com) using Beatifulsoup and Selenium. 

In this version I will scrape only the EPL data. Check the original version [here](https://github.com/eddwebster/football_analytics/blob/master/notebooks/1_data_scraping/Capology%20Player%20Salary%20Web%20Scraping.ipynb) to find out data on more legues.

# 1. Notebook Setup

## Libraries and Modules

In [1]:
import platform
import sys, getopt
assert sys.version_info >= (3,5)
import csv

%matplotlib inline

#Math operation
import numpy as np
from math import pi

#Datetime
import datetime
from datetime import date 
import time

#Data Processing
import pandas as pd
import random
from io import BytesIO
from pathlib import Path

#Reading directories
import glob
import os

#Working with JSON
import json
from pandas.io.json import json_normalize

#Web Scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import re

#Data Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import missingno as msno

#Progresso Bar
import tqdm as tqdm

#Display in Jupyter
from IPython.display import Image,YouTubeVideo
from IPython.core.display import HTML

#Ignore warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

print('Setup Complete')



Setup Complete


## Defined Variables and Lists 

### Today's Date

In [44]:
#Define today's date
today_date = datetime.datetime.now().strftime('%d/%m/%Y').replace('/','')
today_date

'02062023'

### Season

In [3]:
season = 2020  # '2020' stands for 20/21 season

#'Full season' and 'short season' string
full_season_string = str(int(season))+'/'+str(int(season)+1)
short_season_string = str(str(int(season))[-2:])+'/'+str(str(int(season)+1)[-2:])
full_season_string,short_season_string

('2020/2021', '20/21')

### Scraping Variables

In [4]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

### Teams and Legues 

In [5]:
#Premier Legue teams by season

## 2016-2017 PL
lst_teams_pl_1617 = ['arsenal', 'bournemouth', 'burnley', 'chelsea', 'crystal-palace', 'everton',
             'hull-city', 'leicester', 'liverpool', 'manchester-city', 'manchester-united',
             'middlesbrough', 'southampton', 'stoke-city', 'sunderland', 'swansea', 'tottenham',
             'watford', 'west-bromwich', 'west-ham']

## 2017-2018 PL
lst_teams_pl_1718 = ['arsenal', 'bournemouth', 'brighton', 'burnley', 'chelsea', 'crystal-palace', 'everton',
             'huddersfield', 'leicester', 'liverpool', 'manchester-city', 'manchester-united',
             'newcastle', 'southampton', 'stoke-city', 'swansea', 'tottenham',
             'watford', 'west-bromwich', 'west-ham']

## 2018-2019 PL
lst_teams_pl_1819 = ['arsenal', 'bournemouth', 'brighton', 'burnley', 'cardiff', 'chelsea',
             'crystal-palace', 'everton', 'fulham', 'huddersfield', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'southampton', 'tottenham', 'watford', 'west-ham', 'wolverhampton']

## 2019-2020 PL
lst_teams_pl_1920 = ['arsenal', 'aston-villa', 'bournemouth', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'norwich', 'sheffield-united', 'southampton', 'tottenham', 'watford',
             'west-ham', 'wolverhampton']

## 2020-2021 PL
lst_teams_pl_2021 = ['arsenal', 'aston-villa', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'fulham', 'leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'sheffield-united', 'southampton', 'tottenham', 'west-bromwich',
             'west-ham', 'wolverhampton']

## 2021-2022 PL
lst_teams_pl_2122 = ['arsenal', 'aston-villa', 'brentford', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle', 'norwich',
             'southampton', 'tottenham', 'watford', 'west-ham', 'wolverhampton']

## 2022-2023 PL
lst_teams_pl_2223 = ['arsenal', 'aston-villa', 'bournemouth','brentford', 'brighton', 'chelsea',
             'crystal-palace', 'everton', 'fulham','leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle', 'nottingham-forest',
             'southampton', 'tottenham', 'west-ham', 'wolverhampton']

### Season

In [6]:
lst_seasons = ['2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

## Define filepaths

In [7]:
os.getcwd()

'/home/edoardo/Desktop/GitHub/Football-Analysis/eddwebster_notebooks'

In [8]:
base_dir = os.path.join('..')

data_dir = os.path.join(base_dir, 'data')
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

data_dir_capology = os.path.join(data_dir, 'capology')
if not os.path.exists(data_dir_capology):
    os.mkdir(data_dir_capology)

## Custom Functions (Scrapers)
Two different scrapers:

1. Previous seasons (```scrape_capology_season_prev```)
2. Current seasons (slightly different webpage structure, so needs to be different) (```scrape_capology_season_current```)

### Previous season scraper

In [38]:
def scrape_capology_seasons_prev(data_dir_capology, lst_teams, season, comp):

    #Print statement
    print(f'Scraping for {comp} for the {season} season has now started...')

    #Create empty list for DataFrame
    dfs_players = []

    #Create the output directory
    season_folder = os.path.join(data_dir_capology, 'raw', f'{comp}', f'{season}')
    Path(season_folder).mkdir(parents=True, exist_ok=True)


    for team in lst_teams:
        csv_file_path = os.path.join(season_folder, f'{team}_{comp}_{season}.csv')
        if not os.path.exists(csv_file_path):
            #Define the url
            url = f'https://www.capology.com/club/{team}/salaries/{season}/'
            #Print statement
            print(f'Scraping {team} for the {season} season')

            wd = webdriver.Chrome('chromedriver', options=options)
            wd.get(url)
            html = wd.page_source
            time.sleep(5)
            html = wd.page_source
            #Get the last df since not all the pages have the same lenght
            df = pd.read_html(html, header=0)[-1]
            
            ### Data Engineering
            #Rename the columns
            df = df.rename(columns = df.iloc[0])
            df.columns = list(
                                map(
                                    lambda col: col if not('Pos' in col) and not('Country' in col)\
                                        else ('Position' if 'Pos' in col else 'Country'), list(df.columns)
                                )
                            )

            #Remove the first row and the last row with totals
            df = df.iloc[1:-1,:]

            #Reset the index 
            df = df.reset_index()

            #Drop the index and the Rank columns
            df = df.drop(['index'], axis = 1)
            #Add custom columns
            df['Team'] = team
            df['Team'] = df['Team'].str.replace('-', ' ').str.title().str.replace('Fc', 'FC').str.replace('Ac', 'AC')
            df['League'] = comp
            df['League'] = df['League'].str.replace('-', ' ').str.title()
            df['Season'] = season
            print(f'Saving DataFrame of {team} for the {season} season')

            ### Save to csv
            df.to_csv(csv_file_path)

            ### Append to joint DataFrame
            dfs_players.append(df)
        else:
            df = pd.read_csv(csv_file_path, index_col=None, header=0)
            print(f'{team} already scraped and saved for the {season} season')

            ### Append to joint DataFrame
            dfs_players.append(df)

    ### Concatenate all the DFS
    df_players_all = pd.concat(dfs_players)

    ### Engineer unified data
    df_players_all['Team'] = df_players_all['Team'].str.replace('-',' ').str.title().str.replace('Fc','FC')
    df_players_all['Season'] = df_players_all['Season'].str.replace('-',' ').str.title()

    #Save to csv
    df_players_all.to_csv(os.path.join(season_folder,f'all_{comp}_{season}.csv'))

    ### Print statement 
    print(f'Scraping for {comp} for the {season} season is now complete')

    return df_players_all

In [None]:
scrape_capology_seasons_prev(data_dir_capology = data_dir_capology,
                             lst_teams = lst_teams_pl_2021,
                             season='2020-2021', 
                             comp='premier-league')


### Current season scraper

In [110]:
# Define function for scraping a defined season of Capology data
def scrape_capology_season_current(data_dir_capology,lst_teams, season, comp):


    ### Print statement
    print(f'Scraping for {comp} for the {season} season has now started...')
    
    ## Create empty list for DataFrame
    dfs_players = []
    
    #Create the output directory
    season_folder = os.path.join(data_dir_capology, 'raw', f'{comp}', f'{season}')
    Path(season_folder).mkdir(parents=True, exist_ok=True)

    for team in lst_teams:
        if not os.path.exists(os.path.join(data_dir_capology + f'/raw/{comp}/{season}/{team}_{comp}_{season}_last_updated_{today_date}.csv')):

            url = f'https://www.capology.com/club/{team}/salaries/{season}/'
            select_element_tag = 'select'
            print(f'Scraping {team} for the {season} season')
            wd = webdriver.Chrome('chromedriver', options=options)
            wd.get(url)
            html = wd.page_source
            time.sleep(4)
            html = wd.page_source
            df = pd.read_html(html, header=0, attrs = {'id': 'table'})[-1]
            
            #Data Engineering
            df = df.rename(columns = df.iloc[0])
            df = df.iloc[1:]

            new_columns_names = [
                'Player',
                'Verified',
                'Gross P/W(GBP)',
                'Gross P/W(GBP)',
                'Gross P/W(GBP)',
                'Signed',
                'Contract Expiration',
                'Years Remaining',
                'Gross Remaining(GBP)',
                'Release Clause(GBP)',
                'Position',
                'Detailed Position',
                'Age',
                'Status',
                'Country',
                'Active',
                'Loan'
            ]

            df.columns = new_columns_names
            df = df[:-1] 
            

            ### Create new columns
            df['Team'] = team
            df['Team'] = df['Team'].str.replace('-', ' ').str.title().str.replace('Fc', 'FC').str.replace('Ac', 'AC')
            df['League'] = comp
            df['League'] = df['League'].str.replace('-', ' ').str.title()
            df['Season'] = season
            print(f'Saving DataFrame of {team} for the {season} season')

            ### Save to CSV
            df.to_csv(data_dir_capology + f'/raw/{comp}/{season}/{team}_{comp}_{season}_last_updated_{today_date}.csv')

            ### Append to joint DataFrame
            dfs_players.append(df)
        else:
            df = pd.read_csv(data_dir_capology + f'/raw/{comp}/{season}/{team}_{comp}_{season}_last_updated_{today_date}.csv', index_col=None, header=0)
            print(f'{team} already scraped and saved for the {season} season')

            ### Append to joint DataFrame
            dfs_players.append(df)
        
    ## Concatenate DataFrames to one DataFrame
    df_players_all = pd.concat(dfs_players)

    ## Engineer unified data
    df_players_all['Team'] = df_players_all['Team'].str.replace('-', ' ').str.title().str.replace('Fc', 'FC')
    df_players_all['League'] = df_players_all['League'].str.replace('-', ' ').str.title()
    df_players_all = df_players_all.drop(df.columns[1], axis=1)

    ## Save to CSV
    df_players_all.to_csv(data_dir_capology + f'/raw/{comp}/{season}/all_{comp}_{season}_last_updated_{todays_date}.csv')
    
    ### Print statement
    print(f'Scraping for {comp} for the {season} season is now complete')
    
    ## Return unified season dataset
    return df_players_all

In [112]:
scrape_capology_season_current(data_dir_capology = data_dir_capology,
                             lst_teams = lst_teams_pl_2223,
                             season='2022-2023', 
                             comp='premier-league')


Scraping for premier-league for the 2022-2023 season has now started...
Scraping arsenal for the 2022-2023 season
Saving DataFrame of arsenal for the 2022-2023 season
Scraping aston-villa for the 2022-2023 season
Saving DataFrame of aston-villa for the 2022-2023 season
Scraping bournemouth for the 2022-2023 season
Saving DataFrame of bournemouth for the 2022-2023 season
Scraping brentford for the 2022-2023 season
Saving DataFrame of brentford for the 2022-2023 season
Scraping brighton for the 2022-2023 season
Saving DataFrame of brighton for the 2022-2023 season
Scraping chelsea for the 2022-2023 season
Saving DataFrame of chelsea for the 2022-2023 season
Scraping crystal-palace for the 2022-2023 season
Saving DataFrame of crystal-palace for the 2022-2023 season
Scraping everton for the 2022-2023 season
Saving DataFrame of everton for the 2022-2023 season
Scraping fulham for the 2022-2023 season
Saving DataFrame of fulham for the 2022-2023 season
Scraping leeds for the 2022-2023 season

NameError: name 'todays_date' is not defined

In [105]:
prova = df[0].copy()
prova = prova.rename(columns = prova.iloc[0])
prova = prova.iloc[1:]
#prova = prova.dropna(axis=1, how='all')

new_columns_names = [
    'Player',
    'Verified',
    'Gross P/W(GBP)',
    'Gross P/W(GBP)',
    'Gross P/W(GBP)',
    'Signed',
    'Contract Expiration',
    'Years Remaining',
    'Gross Remaining(GBP)',
    'Release Clause(GBP)',
    'Position',
    'Detailed Position',
    'Age',
    'Status',
    'Country',
    'Active',
    'Loan'
]

prova.columns = new_columns_names
prova = prova[:-1] 
prova.tail()

[('Player', 'Player'), (nan, 'Verified'), ('Gross P/W(GBP)', 'Gross P/W(GBP)'), ('Gross P/Y(GBP)', 'Gross P/W(GBP)'), ('Gross P/Y(GBP)', 'Gross P/W(GBP)'), ('Signed', 'Signed'), ('Contract Expiration\xa0Jun 30, 2023Jun 30, 2024Jun 30, 2025Jun 30, 2026Jun 30, 2027Jun 30, 2028', 'Contract Expiration'), ('Years Remaining\xa0123456', 'Years Remaining'), ('Gross Remaining(GBP)', 'Gross Remaining(GBP)'), ('Release Clause(GBP)', 'Release Clause(GBP)'), ('Pos.\xa0DFKM', 'Position'), ('Pos.\xa0AMCBCFDMGKLBLWRBRMRW', 'Detailed Position'), ('Age', 'Age'), ('Status\xa0ReserveStarter', 'Status'), ('Country', 'Country'), ('Active', 'Active'), ('Loan', 'Loan')]


Unnamed: 0,Player,Verified,Gross P/W(GBP),Gross P/W(GBP).1,Gross P/W(GBP).2,Signed,Contract Expiration,Years Remaining,Gross Remaining(GBP),Release Clause(GBP),Position,Detailed Position,Age,Status,Country,Active,Loan
19,Emile Smith Rowe,,"£ 40,000","£ 2,080,000",,"Jul 22, 2021","Jun 30, 2026",4,"£ 8,320,000",,F,AM,22,Reserve,England,,
20,Rob Holding,,"£ 40,000","£ 2,080,000",,"Jan 12, 2021","Jun 30, 2024",2,"£ 4,160,000",,D,CB,27,Reserve,England,,
21,William Saliba,,"£ 40,000","£ 2,080,000",,"Jul 25, 2019","Jun 30, 2024",2,"£ 4,160,000",,D,CB,21,Starter,France,,
22,Matt Turner,,"£ 35,000","£ 1,820,000","£ 660,000","Jul 1, 2022","Jun 30, 2025",3,"£ 5,460,000",,K,GK,28,Reserve,United States,,
23,Reiss Nelson,,"£ 15,000","£ 780,000",,"Aug 31, 2018","Jun 30, 2023",1,"£ 780,000",,F,RW,23,Reserve,England,,
