In [9]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Specify the URL of the website
url = "https://fbref.com/en/comps/9/stats/Premier-League-Stats"  # Replace with your URL

# Start a headless browser session
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run the browser in headless mode
driver = webdriver.Chrome(options=options)

# Navigate to the webpage
driver.get(url)

# Wait for the table to load (adjust the time as needed)
time.sleep(5)  # You may need to increase or decrease this wait time

# Get the page source after the table has loaded
page_source = driver.page_source

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Locate the specific table you want to scrape
table = soup.find('table', id = 'stats_standard', class_ = 'min_width sortable stats_table shade_zero now_sortable sticky_table eq2 re2 le2')  # Adjust the identifier as needed

# Extract both headers and data within a single loop
data = []
data_rows = table.find_all('tr', {'data-row': True})
for row in data_rows:
    cells = row.find_all(['th', 'td'], {'data-stat': True})
    ind_data = [cell.text.strip() for cell in cells]
    data.append(ind_data)
    headers = [cell['data-stat'] for cell in cells]

# Create dataframe and drop last 'Matches' column
player_stats = pd.DataFrame(data, columns=headers)
player_stats = player_stats.drop(player_stats.columns[-1], axis=1)

# Remove rows containing headers 'Rk'
player_stats = player_stats[player_stats['ranker'] != 'Rk']

# Reset the index to consecutive integers
player_stats.reset_index(drop=True, inplace=True)

# Convert position to string and split multiples positions
player_stats['position'] = player_stats['position'].astype('string').str.split(",")

# Convert age string to float
player_stats['age'] = player_stats['age'].apply(lambda x: float(x.split('-')[0]) + round(float(x.split('-')[1]) / 365, 3))

# Replace empty strings with float 0
player_stats = player_stats.applymap(lambda x: 0 if x == '' else x)

# # Convert data strings to float
player_stats.iloc[:, 6:] = player_stats.iloc[:, 6:].astype(float)

# Close the browser
driver.quit()

# # Save the DataFrame to an Excel file using XlsxWriter as the engine
# player_stats.to_excel('FBREF prem stats.xlsx', engine='xlsxwriter', index=False)

In [10]:
# Correlation matrix to detect correlation between features
def cross_corr_mean(df_input, corr_coeff=0.80, plot=0):
    """ The function retuns a list of features to be dropped from the input features. """
    # Generating correlation matrix of input features
    corr_matrix = df_input.corr(method = 'pearson') # For more info on the methods please refer to https://www.kaggle.com/kiyoung1027/correlation-pearson-spearman-and-kendall

    # Plotting cross correlation matrix
    if plot == 1:
        # Create a heatmap of the correlation matrix
        plt.figure(figsize=(36, 24))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
        plt.title('Correlation Matrix Heatmap (PCA)')
        plt.show()

    # Generating mean correlation with all features
    corr_mean = abs(corr_matrix).mean()

    # Preparing data
    features_drop_list = [] # This will contain the list of features to be dropped
    features_index_drop_list = [] # This will contain the index of features to be dropped as per df_input
    corr_matrix = abs(corr_matrix)

    # Selecting features to be dropped (Using two for loops that runs on one triangle of the corr_matrix to avoid checking the correlation of a variable with itself)
    for i in range(corr_matrix.shape[0]):
        for j in range(i+1,corr_matrix.shape[0]):

            # The following if statement checks if each correlation value is higher than threshold (or equal) and also ensures the two columns have NOT been dropped already.  
            if corr_matrix.iloc[i,j]>=corr_coeff and i not in features_index_drop_list and j not in features_index_drop_list:

                # The following if statement checks which of the 2 variables with high correlation has a lower correlation with all features and then drops it. If equal we can drop any and it drops the first one (This is arbitrary)
                if corr_mean[corr_matrix.columns[i]] >= corr_mean[corr_matrix.columns[j]]:
                    features_drop_list.append(corr_matrix.columns[i])	# Name of variable that needs to be dropped appended to list
                    features_index_drop_list.append(i)	# Index of variable that needs to be dropped appended to list. This is used to not check for the same variables repeatedly
                else:
                    features_drop_list.append(corr_matrix.columns[j])
                    features_index_drop_list.append(j)

    return features_drop_list

In [16]:
player_stats.columns

Index(['ranker', 'player', 'nationality', 'position', 'team', 'age',
       'birth_year', 'games', 'games_starts', 'minutes', 'minutes_90s',
       'goals', 'assists', 'goals_assists', 'goals_pens', 'pens_made',
       'pens_att', 'cards_yellow', 'cards_red', 'xg', 'npxg', 'xg_assist',
       'npxg_xg_assist', 'progressive_carries', 'progressive_passes',
       'progressive_passes_received', 'goals_per90', 'assists_per90',
       'goals_assists_per90', 'goals_pens_per90', 'goals_assists_pens_per90',
       'xg_per90', 'xg_assist_per90', 'xg_xg_assist_per90', 'npxg_per90',
       'npxg_xg_assist_per90'],
      dtype='object')

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px
from sklearn.preprocessing import StandardScaler

# Cluster analysis - midfielders:
mid = player_stats[player_stats['position'].apply(lambda x: 'MF' in x)]

# Step 1: Extract numeric columns
num_cols = mid.select_dtypes(include='float64').columns
data_num = mid[num_cols]

# Standardize using scaler
scaler = StandardScaler()
data_scal = pd.DataFrame(scaler.fit_transform(data_num), columns = data_num.columns)

# # Create a heatmap of the correlation matrix
# plt.figure(figsize=(36, 24))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
# plt.title('Correlation Matrix Heatmap (PCA)')
# plt.show()

# Step 2: Identify groups of highly correlated features
# Drop columns with large correlation
# You can define a threshold (e.g., 0.80) to consider features as highly correlated
# threshold = 0.80
# corr_groups = {}  # Dictionary to store groups of correlated features

# for feature in corr_matrix.columns:
#     corr_feat = corr_matrix.index[corr_matrix[feature] > threshold].tolist()
#     if corr_feat:
#         corr_feat.append(feature)  # Include the original feature
#         corr_groups[feature] = corr_feat

# # Step 3: Choose one representative feature from each group
# # You can select a feature based on domain knowledge or a specific criterion
# repr_feat = []
# for group in corr_groups.values():
#     # Here, we choose the first feature in each group as the representative
#     repr_feat.append(group[0])

# # Step 4: Create a new DataFrame with representative features
# filter_data = data_scal[repr_feat]
# print(filter_data)

# data

In [19]:
drop_list = cross_corr_mean(mid, 0.8, 0)
print(drop_list)
mid_pca = mid.drop(columns=drop_list)
mid_pca
mid

['age', 'games', 'minutes', 'minutes_90s', 'goals_assists', 'goals', 'pens_att', 'xg', 'npxg_xg_assist', 'progressive_passes_received', 'goals_per90', 'goals_assists_per90', 'xg_xg_assist_per90', 'xg_per90', 'npxg_xg_assist_per90']


Unnamed: 0,ranker,player,nationality,position,team,age,birth_year,games,games_starts,minutes,...,goals_per90,assists_per90,goals_assists_per90,goals_pens_per90,goals_assists_pens_per90,xg_per90,xg_assist_per90,xg_xg_assist_per90,npxg_per90,npxg_xg_assist_per90
1,2,Bénie Adama Traore,ci CIV,"[FW, MF]",Sheffield Utd,20.940,2002.0,6.0,3.0,325.0,...,0.00,0.00,0.00,0.00,0.00,0.07,0.13,0.20,0.07,0.20
4,5,Simon Adingra,ci CIV,"[FW, MF]",Brighton,21.852,2002.0,10.0,6.0,576.0,...,0.31,0.16,0.47,0.31,0.47,0.15,0.20,0.35,0.15,0.35
6,7,Naouirou Ahamada,fr FRA,"[MF, FW]",Crystal Palace,21.614,2002.0,5.0,0.0,80.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
8,9,Ola Aina,ng NGA,"[DF, MF]",Nott'ham Forest,27.085,1996.0,8.0,7.0,553.0,...,0.16,0.00,0.16,0.16,0.16,0.02,0.04,0.06,0.02,0.06
9,10,Rayan Aït Nouri,dz ALG,"[MF, DF]",Wolves,22.425,2001.0,11.0,10.0,784.0,...,0.00,0.00,0.00,0.00,0.00,0.13,0.05,0.18,0.13,0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,459,Cauley Woodrow,eng ENG,"[FW, MF]",Luton Town,28.934,1994.0,7.0,0.0,81.0,...,0.00,0.00,0.00,0.00,0.00,0.68,0.39,1.07,0.68,1.07
460,461,Yehor Yarmoliuk,ua UKR,[MF],Brentford,19.690,2004.0,3.0,0.0,42.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
461,462,Ryan Yates,eng ENG,[MF],Nott'ham Forest,25.964,1997.0,9.0,3.0,366.0,...,0.00,0.00,0.00,0.00,0.00,0.02,0.03,0.04,0.02,0.04
464,465,Nicolò Zaniolo,it ITA,"[FW, MF]",Aston Villa,24.353,1999.0,9.0,6.0,424.0,...,0.00,0.00,0.00,0.00,0.00,0.31,0.16,0.46,0.31,0.46


In [15]:
import plotly.express as px

px.scatter(player_stats[player_stats['minutes'] >= 500], x='xg_per90', y='xg_assist_per90', hover_name='player', title='Goal contributions')

In [13]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

# Standardize the features (mean=0, std=1)
# Step 1: Extract numeric columns
numeric_columns = mid.select_dtypes(include='float64').columns
data_num = mid[numeric_columns]

# Variance threshold to remove any features that have little to no variation in their values
selector = VarianceThreshold(threshold=0)
selected_features = selector.fit_transform(data_num)

# Specify `indices=True` to get indices of selected features
# Use indices to get the corresponding column names of selected features
num_cols = list(data_num.columns[selector.get_support(indices=True)])
# Subset `data` to retain only selected features
data = data_num[num_cols]

# Standardize using scaler
scaler = StandardScaler()
data_scal = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

# Calculate the correlation matrix for the principal components
corr_matrix = data_scal.corr(method='pearson')

# Step 2: Identify groups of highly correlated features
# You can define a threshold (e.g., 0.80) to consider features as highly correlated
threshold = 0.80
corr_groups = {}  # Dictionary to store groups of correlated features

for feature in corr_matrix.columns:
    corr_feat = corr_matrix.index[corr_matrix[feature] > threshold].tolist()
    if corr_feat:
        corr_feat.append(feature)  # Include the original feature
        corr_groups[feature] = corr_feat

# Step 3: Choose one representative feature from each group
# You can select a feature based on domain knowledge or a specific criterion
repr_feat = []
for group in corr_groups.values():
    # Here, we choose the first feature in each group as the representative
    repr_feat.append(group[0])

# Step 4: Create a new DataFrame with representative features
filter_data = data_scal[repr_feat]
print(filter_data)

scaler = StandardScaler()
data_scal = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

# Step 5: Apply PCA to the filtered data to reduce dimensionality
pca = PCA(n_components='mle')
pca_result = pca.fit_transform(data_scal)
pca_df = pd.DataFrame(pca.components_,columns=data_scal.columns)

# Perform Affinity Propagation clustering
affinity_propagation = AffinityPropagation(damping=0.9)
cluster_labels = affinity_propagation.fit_predict(pca_df)

# Add the cluster labels to the DataFrame
pca_df['Cluster'] = cluster_labels

# Visualize the clustered data (in reduced dimension) with Plotly Express
fig = px.scatter(data, x=pca_df[:, 0], y=pca_df[:, 1], color=cluster_labels,
                 title='Affinity Propagation Clustering')
fig.update_xaxes(title_text='Principal Component 1')
fig.update_yaxes(title_text='Principal Component 2')
fig.show()

InvalidIndexError: (slice(None, None, None), 0)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

# Standardize the features (mean=0, std=1)
# Step 1: Extract numeric columns
numeric_columns = mid.select_dtypes(include='float64').columns
data_num = mid[numeric_columns]

# Variance threshold to remove any features that have little to no variation in their values
selector = VarianceThreshold(threshold=0)
selected_features = selector.fit_transform(data_num)

# Specify `indices=True` to get indices of selected features
# Use indices to get the corresponding column names of selected features

num_cols = list(data_num.columns[selector.get_support(indices=True)])

# Subset `data` to retain only selected features
data = data_num[num_cols]

# Standardize using scaler
scaler = StandardScaler()
data_scal = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

# Calculate the correlation matrix for the principal components
corr_matrix_scal = data_scal.corr(method='pearson')

# Step 2: Identify groups of highly correlated features
# You can define a threshold (e.g., 0.80) to consider features as highly correlated
threshold = 0.80
corr_groups = {}  # Dictionary to store groups of correlated features

for feature in corr_matrix_scal.columns:
    correlated_features = corr_matrix_scal.index[corr_matrix_scal[feature] > threshold].tolist()
    if correlated_features:
        correlated_features.append(feature)  # Include the original feature
        corr_groups[feature] = correlated_features

# Step 3: Choose one representative feature from each group
# You can select a feature based on domain knowledge or a specific criterion
representative_features = []
for group in corr_groups.values():
    # Here, we choose the first feature in each group as the representative
    representative_features.append(group[0])

# Step 4: Create a new DataFrame with representative features
filter_data = data_scal[representative_features]
print(filter_data)

# # Create a DataFrame from the correlation matrix with feature names
# feature_names = data.columns
# correlation_df_scal = pd.DataFrame(corr_matrix_scal, columns=feature_names, index=feature_names)

# # Create a heatmap of the correlation matrix
# plt.figure(figsize=(36, 24))
# sns.heatmap(correlation_df_scal, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
# plt.title('Correlation Matrix Heatmap (PCA)')
# plt.show()



          age  birth_year     games  games_starts     games     games  \
0   -1.275369    1.153924 -0.118688     -0.152130 -0.118688 -0.118688   
1   -1.033737    1.153924  0.722186     -0.152130  0.722186  0.722186   
2   -1.096990    1.153924 -0.959562     -1.156190 -0.959562 -0.959562   
3    0.351214   -0.424070 -0.118688      0.517243 -0.118688 -0.118688   
4   -0.882353    0.890925  1.142624      1.186617  1.142624  1.142624   
..        ...         ...       ...           ...       ...       ...   
204  0.840565   -0.950068  0.301749     -1.156190  0.301749  0.301749   
205  0.054533   -0.161071  0.301749     -0.152130  0.301749  0.301749   
206 -0.371829    0.364927  0.301749     -0.152130  0.301749  0.301749   
207 -0.729381    0.627926 -0.959562     -1.156190 -0.959562 -0.959562   
208 -0.228915    0.101928  1.142624      1.521303  1.142624  1.142624   

        goals   assists     goals     goals  ...  goals_per90  assists_per90  \
0   -0.540978 -0.469521 -0.540978 -0.540978