# Capology Player Web Scraping 

Notebook to scrape raw data from [Capology](https://www.capology.com) using Beatifulsoup and Selenium. 

In this version I will scrape only the EPL data. Check the original version [here](https://github.com/eddwebster/football_analytics/blob/master/notebooks/1_data_scraping/Capology%20Player%20Salary%20Web%20Scraping.ipynb) to find out data on more legues.

# 1. Notebook Setup

## Libraries and Modules

In [3]:
import platform
import sys, getopt
assert sys.version_info >= (3,5)
import csv

%matplotlib inline

#Math operation
import numpy as np
from math import pi

#Datetime
import datetime
from datetime import date 
import time

#Data Processing
import pandas as pd
import random
from io import BytesIO
from pathlib import Path

#Reading directories
import glob
import os

#Working with JSON
import json
from pandas.io.json import json_normalize

#Web Scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import re

#Data Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import missingno as msno

#Progresso Bar
import tqdm as tqdm

#Display in Jupyter
from IPython.display import Image,YouTubeVideo
from IPython.core.display import HTML

#Ignore warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

print('Setup Complete')



Setup Complete


## Defined Variables and Lists 

### Today's Date

In [5]:
#Define today's date
today_date = datetime.datetime.now().strftime('%d/%m/%Y').replace('/','')
today_date

'01052023'

### Season

In [9]:
season = 2020  # '2020' stands for 20/21 season

#'Full season' and 'short season' string
full_season_string = str(int(season))+'/'+str(int(season)+1)
short_season_string = str(str(int(season))[-2:])+'/'+str(str(int(season)+1)[-2:])
full_season_string,short_season_string

('2020/2021', '20/21')

### Scraping Variables

In [10]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

### Teams and Legues 

In [11]:
#Premier Legue teams by season

## 2016-2017 PL
lst_teams_pl_1617 = ['arsenal', 'bournemouth', 'burnley', 'chelsea', 'crystal-palace', 'everton',
             'hull-city', 'leicester', 'liverpool', 'manchester-city', 'manchester-united',
             'middlesbrough', 'southampton', 'stoke-city', 'sunderland', 'swansea', 'tottenham',
             'watford', 'west-bromwich', 'west-ham']

## 2017-2018 PL
lst_teams_pl_1718 = ['arsenal', 'bournemouth', 'brighton', 'burnley', 'chelsea', 'crystal-palace', 'everton',
             'huddersfield', 'leicester', 'liverpool', 'manchester-city', 'manchester-united',
             'newcastle', 'southampton', 'stoke-city', 'swansea', 'tottenham',
             'watford', 'west-bromwich', 'west-ham']

## 2018-2019 PL
lst_teams_pl_1819 = ['arsenal', 'bournemouth', 'brighton', 'burnley', 'cardiff', 'chelsea',
             'crystal-palace', 'everton', 'fulham', 'huddersfield', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'southampton', 'tottenham', 'watford', 'west-ham', 'wolverhampton']

## 2019-2020 PL
lst_teams_pl_1920 = ['arsenal', 'aston-villa', 'bournemouth', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'norwich', 'sheffield-united', 'southampton', 'tottenham', 'watford',
             'west-ham', 'wolverhampton']

## 2020-2021 PL
lst_teams_pl_2021 = ['arsenal', 'aston-villa', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'fulham', 'leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle',
             'sheffield-united', 'southampton', 'tottenham', 'west-bromwich',
             'west-ham', 'wolverhampton']

## 2021-2022 PL
lst_teams_pl_2122 = ['arsenal', 'aston-villa', 'brentford', 'brighton', 'burnley', 'chelsea',
             'crystal-palace', 'everton', 'leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle', 'norwich',
             'southampton', 'tottenham', 'watford', 'west-ham', 'wolverhampton']

## 2022-2023 PL
lst_teams_pl_2223 = ['arsenal', 'aston-villa', 'bournemouth','brentford', 'brighton', 'chelsea',
             'crystal-palace', 'everton', 'fulham','leeds', 'leicester',
             'liverpool', 'manchester-city', 'manchester-united', 'newcastle', 'nottingham-forest',
             'southampton', 'tottenham', 'west-ham', 'wolverhampton']

### Season

In [13]:
lst_seasons = ['2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

## Define filepaths

In [14]:
os.getcwd()

'/home/edoardo/Desktop/IoT_2022/Football-Analysis/eddwebster_notebooks'

In [15]:
base_dir = os.path.join('..')

data_dir = os.path.join(base_dir, 'data')
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

data_dir_capology = os.path.join(data_dir, 'capology')
if not os.path.exists(data_dir_capology):
    os.mkdir(data_dir_capology)

## Custom Functions (Scrapers)
Two different scrapers:

1. Previous seasons (```scrape_capology_season_prev```)
2. Current seasons (slightly different webpage structure, so needs to be different) (```scrape_capology_season_current```)

### Previous season scraper

In [39]:
def scrape_capology_seasons_prev(lst_teams, season, comp):

    #Print statement
    print(f'Scraping for {comp} for the {season} season has now started...')

    #Create empty list for DataFrame
    dfs_players = []

    #Check if the output folder exists
    raw_folder = os.path.join(data_dir_capology, 'raw')
    if not os.path.join(raw_folder):
        os.mkdir(raw_folder)
    
    comp_folder = os.path.join(raw_folder, f'{comp}')
    if not os.path.join(comp_folder):
        os.mkdir(comp_folder)

    season_folder = os.path.join(raw_folder, f'{season}')
    if not os.path.join(season_folder):
        os.mkdir(season_folder)


    for team in lst_teams:
        csv_file_path = os.path.join(season_folder, f'{team}_{comp}_{season}.csv')
        if not os.path.exists(csv_file_path):
            #Define the url
            url = f'https://www.capology.com/club/{team}/salaries/{season}/'
            #Print statement
            print(f'Scraping {team} for the {season} season')

            wd = webdriver.Chrome('chromedriver', options=options)
            wd.get(url)
            html = wd.page_source
            time.sleep(5)
            html = wd.page_source
            df = pd.read_html(html, header=0)[1]
            print(df.head())
            ### Data Engineering
            #Remove the first row
            #df = df.iloc[1:,:]   
            
            #Rename the columns
            df = df.rename(columns = df.iloc[0])
            df.columns = list(
                                map(
                                    lambda col: col if not('Pos' in col) and not('Country' in col)\
                                        else ('Position' if 'Pos' in col else 'Country'), list(df.columns)
                                )
                            )
            print(df.columns)
            #Remove the last column
            df = df[:-1]
            #print(df.head())
            #Remove the first row
            df = df.iloc[1:,:]
            #print(df.head())
            #reset the index 
            df = df.reset_index()
            #print(df.head())
            #Drop the index and the Rank columns
            df = df.drop(['index', 'Rank'], axis = 1)
            #Add custom columns
            df['Team'] = team
            df['Team'] = df['Team'].str.replace('-', ' ').str.title().str.replace('Fc', 'FC').str.replace('Ac', 'AC')
            df['League'] = comp
            df['League'] = df['League'].str.reaplce('-', ' ').str.title()
            df['Season'] = season
            print(f'Saving DataFrame of {team} for the {season} season')

            ### Save to csv
            df.to_csv(csv_file_path)

            ### Append to joint DataFrame
            dfs_players.append(df)
        else:
            df = pd.read_csv(csv_file_path, index_col=None, header=0)
            print(f'{team} already scraped and saved for the {season} season')

            ### Append to joint DataFrame
            dfs_players.append(df)

    ### Concatenate all the DFS
    df_players_all = pd.concat(dfs_players)

    ### Engineer unified data
    df_players_all['Team'] = df_players_all['Team'].str.replace('-',' ').str.title().str.replace('Fc','FC')
    df_players_all['Season'] = df_players_all['Season'].str.replace('-',' ').str.title()

    #Save to csv
    df_players_all.to_csv(os.path.join(season_folder,f'all_{comp}_{season}.csv'))

    ### Print statement 
    print(f'Scraping for {comp} for the {season} season is now complete')

    return df_players_all

In [40]:
cols = ['Player', 'Gross P/W(GBP)', 'Gross P/Y(GBP)', 'Adj. Gross(GBP)',
       'Pos. DFKM', 'Age',
       "Country BelgiumBosnia-HerzegovinaBrazilCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireCote d'IvoireEgyptEnglandFranceGabonGermanyGhanaJapanNorwayPortugalScotlandSpainSwitzerland"]

new_cols = list(
    map(
        lambda col: col if not('Pos' in col) and not('Country' in col)\
            else ('Position' if 'Pos' in col else 'Country'), cols
    )
)
new_cols

['Player',
 'Gross P/W(GBP)',
 'Gross P/Y(GBP)',
 'Adj. Gross(GBP)',
 'Position',
 'Age',
 'Country']

In [41]:
scrape_capology_seasons_prev(lst_teams_pl_1617, '2021-2022', 'premier-league')


Scraping for premier-league for the 2021-2022 season has now started...
Scraping arsenal for the 2021-2022 season
                  Unnamed: 0  ...                                              Bio.2
0                     Player  ...  Country BelgiumBosnia-HerzegovinaBrazilCote d'...
1  Pierre-Emerick Aubameyang  ...                                              Gabon
2              Thomas Partey  ...                                              Ghana
3        Alexandre Lacazette  ...                                             France
4               Nicolas Pépé  ...                                      Cote d'Ivoire

[5 rows x 7 columns]
Index(['Player', 'Gross P/W(GBP)', 'Gross P/Y(GBP)', 'Adj. Gross(GBP)',
       'Position', 'Age', 'Country'],
      dtype='object')


KeyError: "['Rank'] not found in axis"

### Current season scraper