In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
from splinter import Browser
import time

In [2]:
teams_df = pd.read_csv('../Resources/all_premier_league_teams.csv')
teams_df.head()

# Extracting the ID from the 'link' column in the DataFrame
teams_df['link'] = teams_df['link'].apply(lambda x: x.split('/')[-3])

# Display the updated DataFrame
print(teams_df)


               team      link
0   Manchester City  b8fd03ef
1    Manchester Utd  19538871
2         Tottenham  361ca564
3         Liverpool  822bd0ba
4           Chelsea  cff3d9bb
5           Arsenal  18bb7c10
6           Burnley  943e8050
7           Everton  d3fd31cc
8    Leicester City  a2d435b3
9     Newcastle Utd  b2b47a98
10   Crystal Palace  47c64c55
11      Bournemouth  4ba7cbea
12         West Ham  7c21e445
13          Watford  2abfe087
14         Brighton  d07537b9
15     Huddersfield  f5922ca5
16      Southampton  33c895d4
17     Swansea City  fb10988f
18       Stoke City  17892952
19        West Brom  60c6b05f
20           Wolves  8cec06e1
21     Cardiff City  75fae011
22           Fulham  fd962109
23    Sheffield Utd  1df6b87e
24      Aston Villa  8602292d
25     Norwich City  1c781004
26     Leeds United  5bfb9659
27        Brentford  cd051869
28  Nott'ham Forest  e4a775cb
29       Luton Town  e297cd13


In [4]:
# Prompt user for the common year range
year_i = input('Enter the starting year (YYYY format): ')
year_f = input('Enter the ending year (type in same year for single year analysis): ')

# Convert user inputs to integers
year_i = int(year_i)
year_f = int(year_f)

# Create a Chrome browser instance
browser = Browser('chrome')

# Iterate through each team in the DataFrame using the common year range
for index, row in teams_df.iterrows():
    team = row['team']
    link = row['link']

    base_url = 'https://fbref.com'
    team_url = team.replace(' ', '-')

    # Generate the range of years based on the common year range
    years = range(year_i, year_f + 1)

    combined_data = []

    # Visit each year's URL for the team
    for year in years:
        url = f'{base_url}/en/squads/{link}/{year}-{year + 1}/{team_url}-Stats'
        
        # Print the generated URLs for each team
        print(f'Team: {team}, URLs: {url}')
        
        # Visit the page and create the soup object
        browser.visit(url)
        time.sleep(5)
        browser.is_element_present_by_id('stats_shooting_9', wait_time=5)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        
        # Scrape the offensive table from the page
        try:
            table = soup.find('table', {'class': 'stats_table sortable min_width now_sortable sticky_table eq1 re1 le1', 
                                        'id': 'stats_shooting_9'})

            # Execute JavaScript to stop further page loading
            browser.execute_script("window.stop();")

            # Extract data from the table
            data_list = []

            # Iterate through the rows in the table body
            for row in table.find('tbody').find_all('tr'):
                # Extract the 'Date' from the <th> with scope 'row' and class 'left'
                player_elem = row.find('th', {'scope': 'row', 'class': 'left'})
                player = player_elem.get_text() if player_elem else None

                # Other columns extracted from <td> elements
                columns = [cell.get_text() for cell in row.find_all('td')]

                # Insert 'date' into the beginning of the columns list if it exists
                if player_elem:
                    columns.insert(0, player)

                # Append the row data to the list
                data_list.append(columns)

            # Append the scraped data to combined_data
            combined_data.extend(data_list)
            
        except AttributeError:
            # If 'stats_standard_9' ID isn't found, print a message and continue
            print(f"No 'stats_shooting_9' table found for {team} in the specified year range.")
            continue

    # Define columns including 'Notes'
    columns = ['Player', 'Nation', 'Pos', 'Age', '90s', 'Gls', 'Sh', 'SoT', 'Sot%', 'sh/90', 'SoT/90',
              'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK', 'PKat', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG','Matches'
]

    # Create a Pandas DataFrame
    team_df = pd.DataFrame(combined_data, columns=columns)

    # Save the data for each team to a separate CSV file
    # Assuming 'team_df' contains the scraped and processed data for each team
    team_df.to_csv(f'../Resources/Shooting_Stats/{team}_shooting_stats.csv', index=False)

Enter the starting year (YYYY format): 2017
Enter the ending year (type in same year for single year analysis): 2023
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2017-2018/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2018-2019/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2019-2020/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2020-2021/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2022-2023/Manchester-City-Stats
Team: Manchester City, URLs: https://fbref.com/en/squads/b8fd03ef/2023-2024/Manchester-City-Stats
Team: Manchester Utd, URLs: https://fbref.com/en/squads/19538871/2017-2018/Manchester-Utd-Stats
Team: Manchester Utd, URLs: https://fbref.com/en/squads/19538871/2018-2019/Manchester-Utd-Stats
Team:

Team: West Ham, URLs: https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-Stats
Team: West Ham, URLs: https://fbref.com/en/squads/7c21e445/2022-2023/West-Ham-Stats
Team: West Ham, URLs: https://fbref.com/en/squads/7c21e445/2023-2024/West-Ham-Stats
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2017-2018/Watford-Stats
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2018-2019/Watford-Stats
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2019-2020/Watford-Stats
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2020-2021/Watford-Stats
No 'stats_shooting_9' table found for Watford in the specified year range.
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2021-2022/Watford-Stats
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2022-2023/Watford-Stats
No 'stats_shooting_9' table found for Watford in the specified year range.
Team: Watford, URLs: https://fbref.com/en/squads/2abfe087/2023-2024/Watford-Stats
No 'stats_shooting_9' ta

OSError: Cannot save file into a non-existent directory: '..\Shooting_Stats'

In [None]:
# Quit the browser session
browser.quit()