In [1]:
from dotenv import load_dotenv
import requests
import os
from bs4 import BeautifulSoup
from bs2json import bs2json
import pandas as pd 
from selenium import webdriver

# Field Player scraper

In [83]:
# Empty per-category frames that scraped field-player rows are appended to.
# Every frame starts with the player's "sofifa_id" followed by that category's stats.
def _player_stats_frame(*stat_columns):
    """Return an empty DataFrame whose columns are "sofifa_id" plus `stat_columns`."""
    return pd.DataFrame(columns=["sofifa_id", *stat_columns])


shooting_df = _player_stats_frame(
    "shots_on_target_pct",
    "goals",
    "shots_on_target",
    "shots_total",
    "average_shot_distance",
    "shots_free_kicks",
    "pens_made",
    "pens_att",
)

passing_df = _player_stats_frame(
    "passes_pct",
    "passes_completed",
    "passes_total_distance",
    "passes_progressive_distance",
    "passes_pct_short",
    "passes_pct_medium",
    "passes_pct_long",
    "assists",
    "assisted_shots",
    "passes_into_final_third",
    "passes_into_penalty_area",
    "crosses_into_penalty_area",
    "progressive_passes",
)

defending_df = _player_stats_frame(
    "tackles",
    "tackles_won",
    "dribble_tackles_pct",
    "pressure_regain_pct",
    "blocks",
    "interceptions",
    "clearances",
    "errors",
)

dribbling_df = _player_stats_frame(
    "dribbles_completed_pct",
    "players_dribbled_past",
    "carries",
    "carry_distance",
    "carry_progressive_distance",
    "miscontrols",
    "dispossessed",
)

others_df = _player_stats_frame(
    "points_per_match",
    "plus_minus",
    "plus_minus_wowy",
)

miscel_df = _player_stats_frame(
    "cards_yellow",
    "cards_red",
    "cards_yellow_red",
    "fouls",
    "fouled",
    "offsides",
    "crosses",
    "pens_won",
    "pens_conceded",
    "own_goals",
    "ball_recoveries",
    "aerials_won_pct",
)

In [76]:
# Maps a table index on the player page to the `data-stat` columns wanted from that table.
# The indices match the ones field_player_scrap assigns to each target dataframe.
tables_cols = {}

# 1 -> shooting
tables_cols[1] = [
    "shots_on_target_pct", "goals", "shots_on_target", "shots_total",
    "average_shot_distance", "shots_free_kicks", "pens_made", "pens_att",
]

# 2 -> passing
tables_cols[2] = [
    "passes_pct", "passes_completed", "passes_total_distance",
    "passes_progressive_distance", "passes_pct_short", "passes_pct_medium",
    "passes_pct_long", "assists", "assisted_shots", "passes_into_final_third",
    "passes_into_penalty_area", "crosses_into_penalty_area", "progressive_passes",
]

# 5 -> defending
tables_cols[5] = [
    "tackles", "tackles_won", "dribble_tackles_pct", "pressure_regain_pct",
    "blocks", "interceptions", "clearances", "errors",
]

# 6 -> dribbling
tables_cols[6] = [
    "dribbles_completed_pct", "players_dribbled_past", "carries",
    "carry_distance", "carry_progressive_distance", "miscontrols", "dispossessed",
]

# 7 -> others
tables_cols[7] = ["points_per_match", "plus_minus", "plus_minus_wowy"]

# 8 -> miscellaneous
tables_cols[8] = [
    "cards_yellow", "cards_red", "cards_yellow_red", "fouls", "fouled",
    "offsides", "crosses", "pens_won", "pens_conceded", "own_goals",
    "ball_recoveries", "aerials_won_pct",
]

In [77]:
def field_player_scrap(player_id, link, df_to_fill, scrap_cols, season='2018-2019', timeout=30):
    """Scrape one stats table of a field player's page for a single season.

    Parameters
    ----------
    player_id
        Identifier placed first in the returned list.
    link : str
        URL of the player's stats page.
    df_to_fill : str
        Name of the dataframe the row is meant for; one of 'shooting_df',
        'passing_df', 'defending_df', 'dribbling_df', 'others_df' or
        'miscel_df'. It selects which page section is read.
    scrap_cols : dict
        Maps the section index to the list of `data-stat` column names to read.
    season : str, optional
        Text of the season row to read. Defaults to '2018-2019'.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list
        ``[player_id, v1, v2, ...]`` with one float per requested column, in
        the order given by ``scrap_cols``. A column that is missing from the
        row, empty, or not numeric is reported as ``0.0``.

    Raises
    ------
    KeyError
        If `df_to_fill` is not a known dataframe name.
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page lacks the expected section/table, or the table has no row
        for `season`.
    """
    tab_index = {'shooting_df': 1, 'passing_df': 2, 'defending_df': 5, 'dribbling_df': 6, 'others_df': 7, 'miscel_df': 8}
    index = tab_index[df_to_fill]

    data = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    data.raise_for_status()
    player = BeautifulSoup(data.text, 'html.parser')

    # All table sections of the page
    tables = player.find_all('div', {'class': 'section_wrapper'})

    # Round-trip the selected section through bs2json and parse its text again
    # as HTML. NOTE(review): presumably needed because the table markup is not
    # directly reachable in the first parse - confirm before simplifying.
    converter = bs2json()
    table_s = converter.convert(tables[index])
    table = BeautifulSoup(table_s['div']['text'], 'html.parser')

    # Select the table in mode 'Club Collapsed'
    t_collapsed = table.find_all('div', {'class': 'table_wrapper'})[1]

    # Locate the season row once; rows without a season header cell are skipped.
    row_tag = None
    for season_row in t_collapsed.find('tbody').find_all('tr'):
        season_cell = season_row.find('th', {'data-stat': 'season'})
        if season_cell is not None and season_cell.text == season:
            row_tag = season_row
            break
    if row_tag is None:
        # IndexError is what the previous `[...][0]` lookup raised.
        raise IndexError("no row for season %r in table %r of %s" % (season, df_to_fill, link))

    values = [player_id]
    for col in scrap_cols[index]:
        try:
            cell_text = row_tag.find('td', {'data-stat': col}).text
            # Drop thousands separators so e.g. "1,044" is read as 1044.0.
            values.append(float(cell_text.replace(',', '')))
        except (AttributeError, ValueError):
            # Cell absent (find returned None) or not numeric.
            values.append(0.0)

    return values
    


In [91]:
# Scrape the passing stats of one player and add them to passing_df as a new row.
link = 'https://fbref.com/en/players/56b5d64e/all_comps/Yacine-Brahimi-Stats---All-Competitions'
row = field_player_scrap(184267, link, 'passing_df', tables_cols)

# DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate.
# Going through a Series keeps the row numeric, as the previous append did.
new_row = pd.Series(row, index=passing_df.columns).to_frame().T
# Empty frames are left out of the concat so they cannot influence the result dtypes.
passing_df = pd.concat(
    [frame for frame in (passing_df, new_row) if not frame.empty],
    ignore_index=True,
)

In [92]:
# Display the passing frame as the cell output.
passing_df

Unnamed: 0,sofifa_id,passes_pct,passes_completed,passes_total_distance,passes_progressive_distance,passes_pct_short,passes_pct_medium,passes_pct_long,assists,assisted_shots,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,progressive_passes
0,184267.0,81.0,201.0,3414.0,1044.0,83.8,86.6,81.8,4.0,7.0,34.0,9.0,1.0,35.0


# Goalkeeper scraper

In [67]:
# Start a headless Chrome session and load the goalkeeper's stats page.
op = webdriver.ChromeOptions()
op.add_argument('headless')

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this on another machine; the original local path is the fallback.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/eduardooportoalonso/Documents/Cursos/Ironhack/datamad1020/ironhack-projects/pipelines-project/chromedriver',
)

# The session is left open because the following cells read from `browser`;
# call browser.quit() once scraping is finished to release the Chrome process.
browser = webdriver.Chrome(chromedriver_path, options=op)

browser.get('https://fbref.com/en/players/7ba6d84e/all_comps/David-de-Gea-Stats---All-Competitions')

In [68]:
# Goalkeeper version of the table -> columns mapping.
# Keys are the element ids of the tables on the page; values are the `data-stat`
# columns to read from each one.
# NOTE: this rebinds `tables_cols`, replacing the field-player mapping defined earlier.
tables_cols = {
    'stats_keeper_ks_collapsed': [
        "save_pct", "clean_sheets_pct", "pens_att_gk", "pens_saved",
    ],
    'stats_keeper_adv_ks_collapsed': [
        "goals_against_gk", "pens_allowed", "free_kick_goals_against_gk",
        "corner_kick_goals_against_gk", "own_goals_against_gk",
        "passes_pct_launched_gk", "pct_passes_launched_gk", "passes_length_avg_gk",
        "pct_goal_kicks_launched", "goal_kick_length_avg", "crosses_stopped_pct_gk",
        "def_actions_outside_pen_area_gk", "def_actions_outside_pen_area_per90_gk",
        "avg_distance_def_actions_gk",
    ],
    'all_stats_passing_ks_collapsed': [
        "passes_pct", "passes_total_distance", "passes_progressive_distance",
        "passes_pct_short", "passes_pct_medium", "passes_pct_long", "assists",
        "assisted_shots", "passes_into_final_third",
    ],
    'stats_gca_ks_collapsed': [
        "sca", "sca_passes_live", "sca_passes_dead",
        "gca", "gca_passes_live", "gca_passes_dead",
    ],
    'stats_defense_ks_collapsed': [
        "tackles", "tackles_won", "dribble_tackles_pct", "pressure_regain_pct",
        "blocks", "blocked_shots", "blocked_shots_saves", "blocked_passes",
        "interceptions", "tackles_interceptions", "clearances", "errors",
    ],
    'stats_possession_ks_collapsed': [
        "touches", "touches_live_ball", "dribbles_completed_pct", "carries",
        "carry_distance", "passes_received_pct", "miscontrols", "dispossessed",
    ],
    'stats_playing_time_ks_collapsed': [
        "points_per_match", "plus_minus",
    ],
    'stats_misc_ks_collapsed': [
        "cards_yellow", "cards_red", "cards_yellow_red", "fouls", "fouled",
        "pens_conceded", "own_goals", "ball_recoveries", "aerials_won_pct",
    ],
}

In [69]:
# Dataframes for appending the values
goalkeeping_df = pd.DataFrame(columns=["save_pct", "clean_sheets_pct", "pens_att_gk", "pens_saved"])

adv_gkeeping_df = pd.DataFrame(columns=["goals_against_gk", "pens_allowed", "free_kick_goals_against_gk", "corner_kick_goals_against_gk", "own_goals_against_gk", "passes_pct_launched_gk", "pct_passes_launched_gk", "passes_length_avg_gk", "pct_goal_kicks_launched", "goal_kick_length_avg", "crosses_stopped_pct_gk", "def_actions_outside_pen_area_gk", "def_actions_outside_pen_area_per90_gk", "avg_distance_def_actions_gk"])

passing_df = pd.DataFrame(columns=["passes_pct", "passes_total_distance", "passes_progressive_distance", "passes_pct_short", "passes_pct_medium", "passes_pct_long", "assists", "assisted_shots", "passes_into_final_third"])

goal_shoot_c_df = pd.DataFrame(columns=["sca", "sca_passes_live", "sca_passes_dead", "gca", "gca_passes_live", "gca_passes_dead"])

defensive_df = pd.DataFrame(columns=["tackles", "tackles_won", "dribble_tackles_pct", "pressure_regain_pct", "blocks", "blocked_shots", "blocked_shots_saves", "blocked_passes", "interceptions", "tackles_interceptions", "clearances", "errors"])

posession_df = pd.DataFrame(columns=["touches", "touches_live_ball", "dribbles_completed_pct", "carries", "carry_distance", "passes_received_pct", "miscontrols", "dispossessed"])

playtime_df = pd.DataFrame(columns=["points_per_match", "plus_minus"])

misc_df = pd.DataFrame(columns=["cards_yellow", "cards_red", "cards_yellow_red", "fouls", "fouled", "pens_conceded", "own_goals", "ball_recoveries", "aerials_won_pct"])

In [70]:
# One independent, initially empty list of scraped rows per goalkeeper table id.
cols_scraped = {
    table_id: []
    for table_id in (
        'stats_keeper_ks_collapsed',
        'stats_keeper_adv_ks_collapsed',
        'all_stats_passing_ks_collapsed',
        'stats_gca_ks_collapsed',
        'stats_defense_ks_collapsed',
        'stats_possession_ks_collapsed',
        'stats_playing_time_ks_collapsed',
        'stats_misc_ks_collapsed',
    )
}

In [71]:
# For every goalkeeper table: read the chosen season's row from the live page
# and store the wanted stats as one list of floats in cols_scraped[table id].
season = '2018-2019'

for t in tables_cols:
    # Searching the soup. find_element('id', ...) replaces find_element_by_id,
    # which was removed in Selenium 4.3; this form works on Selenium 3 and 4.
    table = browser.find_element('id', t)
    selector = browser.execute_script("return arguments[0].innerHTML;", table)
    # Named table_soup so the page URL kept in `link` is not overwritten.
    table_soup = BeautifulSoup(selector, 'html.parser')

    # Locate the season row once per table; rows without a season header cell are skipped.
    row_tag = None
    for tr_tag in table_soup.find_all('tr'):
        season_cell = tr_tag.find('th', {'data-stat': 'season'})
        if season_cell is not None and season_cell.text == season:
            row_tag = tr_tag
            break
    if row_tag is None:
        # IndexError is what the previous `[...][0]` lookup raised.
        raise IndexError("no row for season %r in table %r" % (season, t))

    # Values to store for this table, in the order listed in tables_cols[t]
    values = []
    for col in tables_cols[t]:
        try:
            cell_text = row_tag.find('td', {'data-stat': col}).text
            # Drop thousands separators so e.g. "1,044" is read as 1044.0.
            values.append(float(cell_text.replace(',', '')))
        except (AttributeError, ValueError):
            # Cell absent (find returned None) or not numeric.
            values.append(0.0)

    cols_scraped[t].append(values)
