# Import the local `main` module and invoke its main() entry point.
# NOTE(review): what main.main() does is not visible from this notebook — confirm in main.py.
import main
main.main()

In [1]:
import pickle

# Location of the pickled data, relative to the notebook's working directory.
file_path = '../Data/data.pickle'

# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
with open(file_path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [2]:
# Show the top-level keys of the loaded object (the output below lists six position codes).
data.keys()

dict_keys(['MF', 'FW', 'FB', 'GK', 'AM', 'CB'])

In [3]:
import pandas as pd

In [4]:
# Build a DataFrame from the loaded dict; each key ('MF', 'FW', ...) becomes a column.
df = pd.DataFrame(data)

In [5]:
from modules import data_collection

In [6]:
# Keep the first element of whatever get_current_fpl() returns. Later cells select
# columns such as 'web_name' and 'element_type' from it, so it is presumably a
# DataFrame of current FPL players — TODO confirm against modules/data_collection.
current = data_collection.get_current_fpl()[0]

In [7]:
# Rows whose web_name is exactly 'Ward', restricted to the name and element_type columns.
current.loc[current['web_name'] == 'Ward', ['web_name', 'element_type']]

Unnamed: 0,web_name,element_type
251,Ward,2


In [8]:
# Number of non-null entries in the 'AM' column.
df['AM'].count()

96

In [9]:
# Number of non-null entries in the 'MF' column.
df['MF'].count()

114

In [10]:
# Prefer the 'MF' entry for each row, fall back to the 'AM' entry where 'MF' is missing,
# then discard rows that have neither.
MF = (
    df['MF']
    .fillna(df['AM'])
    .dropna()
)

In [11]:
# Prefer the 'FB' entry for each row, fall back to the 'CB' entry where 'FB' is missing,
# then discard rows that have neither.
FB = (
    df['FB']
    .fillna(df['CB'])
    .dropna()
)

In [12]:
# Forwards and goalkeepers have no fallback column: just drop the missing entries.
FW, GK = (df[position].dropna() for position in ('FW', 'GK'))

In [13]:
def clean_df(df):
    """Expand a Series of ``(player_url, stats)`` pairs into a table indexed by player name.

    Parameters
    ----------
    df : pandas.Series
        Each element is a two-item sequence: a URL-like string followed by a
        list of stat values.

    Returns
    -------
    pandas.DataFrame
        One row per element and one column per stat value. The index is the
        last ``/``-separated segment of the URL with hyphens turned into spaces.
    """
    def _name_from_url(url):
        # '/en/players/abc/Aaron-Cresswell' -> 'Aaron Cresswell'
        return url.split('/')[-1].replace('-', ' ')

    # Column 0 holds the URLs, column 1 holds the per-player stat lists.
    pairs = df.apply(pd.Series)
    player_names = pairs[0].apply(_name_from_url)
    stats = pairs[1].apply(pd.Series)
    return stats.set_index(player_names)

In [14]:
# Clean every position group and keep the results, keyed by position code.
# Assigning to a loop variable would only rebind that variable and throw each
# cleaned frame away, so the frames are collected in a dict instead. The raw
# FW / MF / FB / GK series are left untouched for the cells that follow.
cleaned = {
    position: clean_df(raw)
    for position, raw in (('FW', FW), ('MF', MF), ('FB', FB), ('GK', GK))
}

In [15]:
# Stack the cleaned outfield groups (forwards, midfielders, full/centre backs) into one
# table; goalkeepers are kept in their own table.
# NOTE(review): a player present in more than one group yields repeated index labels.
players = pd.concat([clean_df(group) for group in (FW, MF, FB)])
keepers = clean_df(GK)

In [16]:
# Subset of the FPL columns, re-indexed by "<first_name> <second_name>" so it can be
# matched against the name-indexed scouting tables.
curr_test = (
    current[['total_points', 'element_type', 'first_name', 'second_name', 'web_name']]
    .set_index(current['first_name'] + ' ' + current['second_name'])
)

In [17]:
# Attach FPL points and element type to each scouting table by matching on the name index.
# The default inner join keeps only names present on both sides.
# NOTE(review): this cell overwrites its own inputs, so re-running it without re-running
# the cell that builds `players`/`keepers` merges the FPL columns a second time.
players = players.merge(
    curr_test[['total_points', 'element_type']],
    left_index=True,
    right_index=True,
)
keepers = keepers.merge(
    curr_test[['total_points', 'element_type']],
    left_index=True,
    right_index=True,
)

In [18]:
# Display the merged outfield-player table.
# NOTE(review): the rendered output contains repeated index labels (e.g. 'Yoane Wissa'
# appears twice with different values), so a lookup by name can return several rows.
players

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,total_points,element_type
Aaron Cresswell,0.00,20,0.01,9,0.37,26,0.04,33,0.15,81,...,1.12,61,0.83,16,2.29,65,1.12,66,74,2
Aaron Hickey,0.00,20,0.02,21,0.54,43,0.05,38,0.02,1,...,0.83,32,0.88,20,1.46,23,0.63,28,63,2
Adam Lallana,0.25,58,0.11,8,1.00,4,0.12,28,0.12,19,...,0.25,15,0.75,27,0.12,8,1.00,89,47,3
Adam Smith,0.00,20,0.02,15,0.16,5,0.03,25,0.04,9,...,0.79,27,0.94,25,2.52,76,0.85,45,74,2
Adam Webster,0.00,21,0.08,88,0.84,90,0.00,28,0.01,24,...,0.89,18,1.41,74,2.53,9,2.76,82,75,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Willy Boly,0.00,21,0.00,1,0.15,6,0.15,99,0.03,69,...,1.76,84,1.76,92,5.72,98,2.05,50,37,2
Yoane Wissa,0.42,64,0.33,45,1.90,12,0.18,69,0.15,69,...,0.36,85,0.59,40,0.83,58,1.30,31,111,4
Yoane Wissa,0.42,91,0.33,89,1.90,37,0.18,47,0.15,31,...,0.36,31,0.59,12,0.83,86,1.30,93,111,4
Youri Tielemans,0.08,58,0.04,22,1.12,59,0.04,28,0.17,87,...,1.08,52,1.40,73,0.92,43,1.04,62,86,3


KEEPERS
* PSxG-GA
* Goals Against
* Save Percentage
* PSxG/SoT
* Clean Sheet Percentage
* Touches
* Launch %
* Goal Kicks
* Avg. Length of Goal Kicks
* Crosses Stopped %
* Def. Actions Outside Pen. Area
* Avg. Distance of Def. Actions

In [27]:
import pickle
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import re


def _retry_after_seconds(response, default=60):
    """Return the wait, in seconds, requested by a response's Retry-After header.

    Falls back to ``default`` when the header is missing or is not a plain
    integer (the header may also carry an HTTP date, which is not parsed here).
    """
    try:
        return max(0, int(response.headers.get('Retry-After', default)))
    except (TypeError, ValueError):
        return default


def scrape_fbref_scouting_reports(retry=None):
    """Scrape FBref scouting-summary tables for every player on the EPL wages page.

    The player links are read from the ``player_wages`` table, each player page
    is fetched, and for every position code the ``scout_summary_<pos>`` table's
    cell texts are stored as ``(player_url, [cell_text, ...])``; ``None`` is
    stored when the player has no table for that position. The accumulated dict
    is pickled to ``FBREF_SCOUT_<YYYYMMDD>.pickle`` in the working directory
    after every player, so an interrupted run still leaves a usable file.

    Parameters
    ----------
    retry : int or float, optional
        Seconds to sleep before starting. ``None`` (the default) starts
        immediately.

    Returns
    -------
    None
        Results are written to disk; progress is reported with ``print``.
    """
    if retry is not None:
        print(f'waiting for {retry} seconds')
        sleep(retry)
    print('start')

    # URL for the EPL page on FBref.com
    url = 'https://fbref.com/en/comps/9/wages/Premier-League-Wages'
    response = requests.get(url, headers={'User-agent': 'fpl_test'})
    if response.status_code != 200:
        print("Failed to retrieve data from FBref.com")
        print(response.status_code)
        # Only retry when the server says how long to wait; otherwise give up
        # instead of failing on a missing header.
        if 'Retry-After' in response.headers:
            wait = _retry_after_seconds(response)
            print(wait)
            scrape_fbref_scouting_reports(wait)
        return

    print('Finding Players')
    # Strip the HTML comment markers so markup wrapped in comments is parsed too.
    # Use the decoded text: str() of the raw bytes would parse the b'...' repr,
    # with escaped newlines and non-ASCII characters.
    soup = BeautifulSoup(re.sub("<!--|-->", "", response.text), "lxml")
    table = soup.find("table", {"id": "player_wages"})
    if table is None:
        print("Failed to find the player_wages table on the page")
        return

    player_urls = [a['href'] for a in table.select('tbody tr td[data-stat="player"] a')]

    positions = ('MF', 'FW', 'FB', 'GK', 'AM', 'CB')
    scouting_report = {pos: [] for pos in positions}

    # Fix the checkpoint name once so a run that crosses midnight keeps writing
    # to the same file.
    file_path = f"FBREF_SCOUT_{datetime.now().strftime('%Y%m%d')}.pickle"

    batch_size = 25   # requests between pauses
    batch_pause = 30  # seconds to pause after each batch
    request_count = 0
    print('Loop Started')
    for player_url in player_urls:
        full_url = 'https://fbref.com' + player_url
        if request_count >= batch_size:
            request_count = 0
            print('Sleeping to avoid 429 status code')
            sleep(batch_pause)

        player_response = requests.get(full_url)
        if player_response.status_code == 429:
            # Rate limited: wait as instructed, then retry this player once.
            wait = _retry_after_seconds(player_response)
            print(f"Sleeping for: {wait}")
            sleep(wait)
            player_response = requests.get(full_url)

        request_count += 1
        if player_response.status_code == 200:
            player_soup = BeautifulSoup(player_response.content, 'html.parser')

            # Add check against FPL Data
            for pos in positions:
                scouting_table = player_soup.find('table', {'id': f'scout_summary_{pos}'})
                if scouting_table is not None:
                    scouting_report[pos].append(
                        (player_url, [cell.text for cell in scouting_table.select('td')])
                    )
                else:
                    # Keep every position list the same length.
                    scouting_report[pos].append(None)

            print(f'Success: {full_url}')
        else:
            print(f"Failed to retrieve data for player: {full_url} | {player_response.status_code}")

        # Checkpoint after every player.
        with open(file_path, 'wb') as f:
            pickle.dump(scouting_report, f)

    print("Scraping completed. Scouting reports saved")
