In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

dfn = pd.read_csv('Football_Scouts_Database_Raw_Stats_Normalised_Floats.csv')

dfn = dfn.fillna(0) #replacing na with 0

Centre_forward_qualities=['Goals', 'Goals/90']
Winger_qualities=['Progressive Carries','Successful Take Ons','Touches in Attacking 3rd','Successful Take Ons/90','Touches in Attacking 3rd/90','Progressive Carries/90']
Attacking_mid_qualities=['Key Passes','Key Passes/90']
Central_mid_qualities=['Progressive Passes','Passes Completed','Progressive Passes/90','Passes Completed/90']
Defensive_mid_qualities=['Tackles Won','Interceptions','Ball Recoveries','Tackles Won/90','Interceptions/90','Ball Recoveries/90']
Wingback_qualities=['Tackles Won','Crosses into penalty area','Tackles Won/90','Crosses into penalty area/90']
Ballplaying_def_qualities=['% of Aerial Duels won','Shots Blocked','Clearances','Passes Completed','Shots Blocked/90','Clearances/90','Passes Completed/90']
Defensive_cb_qualities=['Shots Blocked','Clearances','% of Aerial Duels won','Shots Blocked/90','Clearances/90']

y = dfn[Attacking_mid_qualities] # target variable (needs to be maximised)
X = dfn.drop(['Player','Position'], axis=1) # removes non-integer values from dataframe as we cant put non-integer values into a regression model
X = X.drop(columns=y)

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
}

r2_values=[]

# fitting the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred) #evaluating the models
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: MSE={mse:.2f}, RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.2f}")
    r2_values.append(r2)


Linear Regression: MSE=0.00, RMSE=0.03, MAE=0.02, R2=0.78
Random Forest: MSE=0.00, RMSE=0.02, MAE=0.01, R2=0.86
XGBoost: MSE=0.00, RMSE=0.02, MAE=0.01, R2=0.89


In [3]:
# Linear regression model looks to be working the best with our dataset (for strikers). So we use it for the implementation
#Step 2: Finding the players who have the best stats leading to Goals/90

# find index of model with highest r2 value
best_model_index = r2_values.index(max(r2_values))

# fit best model on all data
if best_model_index == 0:
    lr = LinearRegression()
    lr.fit(X, y) # use transformed data
    coefficients = lr.coef_
    weighted_sum = np.dot(X, coefficients.T).sum(axis=1) # use transformed data
elif best_model_index == 1:
    rfr = RandomForestRegressor()
    rfr.fit(X, y) # use transformed data
    weighted_sum = rfr.predict(X) # use transformed data
    weighted_sum = np.sum(weighted_sum, axis=1)
else:
    gbr = XGBRegressor()
    gbr.fit(X, y) # use transformed data
    weighted_sum = gbr.predict(X) # use transformed data
    weighted_sum = np.sum(weighted_sum, axis=1)

df = pd.read_csv('Football_Scouts_Database_Raw_Stats.csv')
dfn['weighted_sum'] = weighted_sum
dfn_combined = dfn.groupby('Player').agg({'weighted_sum': 'mean'})
df['weighted_sum'] = df['Player'].map(dfn_combined['weighted_sum'])

if set(y.columns) == set(Centre_forward_qualities) or set(y.columns) == set(Winger_qualities):
  df_filtered=df[df['Position'].str.contains("FW")]

elif set(y.columns) == set(Attacking_mid_qualities) or set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Defensive_mid_qualities):
  df_filtered=df[df['Position'].str.contains("MF")]

elif set(y.columns) == set(Wingback_qualities) or set(y.columns) == set(Ballplaying_def_qualities) or set(y.columns) == set(Defensive_cb_qualities):
  df_filtered=df[df['Position'].str.contains("DF")]

df.drop_duplicates(subset=['Player'])

df_sorted = df_filtered.sort_values(by='weighted_sum', ascending=False)
df_sorted.to_csv('Top_Wingback.csv', index=False)
df_sorted

Unnamed: 0.1,Unnamed: 0,Player,Position,Age,Matches Played,Starts,90 mins played,Progressive Carries,Progressive Passes,Progressive Passes Recvd,...,Shots Blocked/90,Passes Blocked/90,Interceptions/90,Clearances/90,Touches/90,Touches in Attacking 3rd/90,Touches in Penalty Box/90,Penalty Kicks won/90,Ball Recoveries/90,weighted_sum
268,273,Bruno Fernandes,"MF,FW",27.0,37,37,36.8,89,263,231,...,0.135870,0.869565,0.679348,1.086957,65.407609,29.266304,4.048913,0.027174,6.032609,1.077514
573,584,Rémy Cabella,"MF,FW",32.0,32,28,28.1,113,230,242,...,0.177936,1.281139,0.604982,0.249110,72.241993,34.377224,4.839858,0.000000,4.875445,0.927590
1495,1528,Jonas Hofmann,"FW,MF",30.0,31,30,29.8,73,119,197,...,0.067114,1.140940,0.536913,0.704698,59.697987,24.597315,3.187919,0.067114,4.731544,0.915399
30,30,Kevin De Bruyne,MF,31.0,32,28,26.9,100,246,245,...,0.074349,0.929368,0.334572,0.371747,69.144981,38.698885,3.717472,0.074349,3.605948,0.897733
506,517,Benjamin Bourigeaud,"MF,FW",28.0,37,37,34.1,40,264,250,...,0.087977,1.026393,0.909091,0.938416,67.595308,26.510264,2.346041,0.000000,5.161290,0.896355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,2812,Marlos Moreno,"MF,DF",25.0,2,0,0.1,0,1,0,...,10.000000,0.000000,0.000000,0.000000,80.000000,10.000000,0.000000,0.000000,10.000000,-0.003119
2010,2050,Nicolai Rapp,"MF,DF",25.0,7,0,0.8,1,5,0,...,0.000000,0.000000,0.000000,1.250000,85.000000,10.000000,3.750000,0.000000,7.500000,-0.003891
264,269,Carlos Martín,"FW,MF",20.0,4,0,0.3,0,0,3,...,0.000000,0.000000,3.333333,0.000000,46.666667,16.666667,3.333333,0.000000,10.000000,-0.003961
2496,2541,Vladimír Darida,MF,31.0,3,0,0.3,1,0,1,...,0.000000,3.333333,0.000000,0.000000,66.666667,30.000000,0.000000,0.000000,3.333333,-0.004136


In [4]:
xg_df = pd.read_csv('Football_Scouts_Database_xG.csv')

if set(y.columns) == set(Centre_forward_qualities) or set(y.columns) == set(Winger_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("FW")]

elif set(y.columns) == set(Attacking_mid_qualities) or set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Defensive_mid_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("MF")]

elif set(y.columns) == set(Wingback_qualities) or set(y.columns) == set(Ballplaying_def_qualities) or set(y.columns) == set(Defensive_cb_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("DF")]

#mapping weighted sum values from df_sorted to xg_df_filtered
xg_df_filtered['weighted_sum']= xg_df_filtered['Unnamed: 0'].map(df_filtered.set_index('Unnamed: 0')['weighted_sum'])
xg_df_sorted = xg_df_filtered.sort_values(by='weighted_sum', ascending=False)
top_10_players= xg_df_sorted.head(10)
top_10_players

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xg_df_filtered['weighted_sum']= xg_df_filtered['Unnamed: 0'].map(df_filtered.set_index('Unnamed: 0')['weighted_sum'])


Unnamed: 0.1,Unnamed: 0,Player,Position,Age,90 mins played,Non Penalty xG,Non-penalty goals,Non-Penalty(xG-Goals),Expected Assists,Assists,xAG-Assists,npxGi-npGi,Non Penalty xG + Assisted Goals,xG/90,Expected Assisted Goals/90,Expected Goals+Assisted Goals/90,Non Penalty xG/90,Non Penalty xG+xAG/90,weighted_sum
268,273,Bruno Fernandes,"MF,FW",27.0,36.8,7.7,6,1.7,16.7,8,8.7,10.4,24.4,0.25,0.45,0.71,0.21,0.66,1.077514
573,584,Rémy Cabella,"MF,FW",32.0,28.1,8.3,7,1.3,10.7,10,0.7,2.0,19.0,0.3,0.38,0.68,0.3,0.68,0.92759
1495,1528,Jonas Hofmann,"FW,MF",30.0,29.8,6.6,12,-5.4,11.1,9,2.1,-3.3,17.6,0.24,0.37,0.61,0.22,0.59,0.915399
30,30,Kevin De Bruyne,MF,31.0,26.9,5.6,7,-1.4,13.7,16,-2.3,-3.7,19.3,0.21,0.51,0.72,0.21,0.72,0.897733
506,517,Benjamin Bourigeaud,"MF,FW",28.0,34.1,7.1,5,2.1,11.6,10,1.6,3.7,18.7,0.26,0.34,0.6,0.21,0.55,0.896355
80,81,Lionel Messi,"FW,MF",35.0,31.5,15.5,16,-0.5,13.4,16,-2.6,-3.1,28.9,0.49,0.43,0.92,0.49,0.92,0.857905
1698,1733,Franck Honorat,"FW,MF",25.0,28.2,4.8,6,-1.2,9.4,5,4.4,3.2,14.2,0.17,0.33,0.5,0.17,0.5,0.817888
1524,1558,Branco van den Boomen,MF,27.0,32.1,2.9,1,1.9,8.1,8,0.1,2.0,10.9,0.19,0.25,0.44,0.09,0.34,0.808928
8,8,Piotr Zieliński,MF,29.0,25.4,3.5,3,0.5,7.4,8,-0.6,-0.1,10.9,0.14,0.29,0.43,0.14,0.43,0.786023
157,161,Joshua Kimmich,MF,27.0,31.2,1.4,4,-2.6,7.0,6,1.0,-1.6,8.4,0.07,0.22,0.3,0.05,0.27,0.773085


In [5]:
from scipy.spatial.distance import cdist

if set(y.columns) == set(Centre_forward_qualities):
  target_variable='Non-Penalty(xG-Goals)'
elif set(y.columns) == set(Winger_qualities) or set(y.columns) == set(Attacking_mid_qualities):
  target_variable='npxGi-npGi'
elif set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Wingback_qualities):
  target_variable='xAG-Assists'
else:
  print('End')

xg_df_sorted_numeric = xg_df_sorted.select_dtypes(include='number')
top_10_avg = xg_df_sorted_numeric.head(10).mean()

# Calculate the Euclidean and Manhattan distances for each player to the average
euclidean_distances = cdist(xg_df_sorted_numeric, top_10_avg.to_frame().T, metric='minkowski', p=2)
manhattan_distances = cdist(xg_df_sorted_numeric, top_10_avg.to_frame().T, metric='minkowski', p=1)
xg_df_sorted['Euclidean Distance'] = euclidean_distances[:,0]
xg_df_sorted['Manhattan Distance'] = manhattan_distances[:,0]

# Normalize the Euclidean and Manhattan distances and the 'xAG- Assists' values
xg_df_sorted['Euclidean Distance (Normalized)'] = 1- (xg_df_sorted['Euclidean Distance'] - xg_df_sorted['Euclidean Distance'].min()) / (xg_df_sorted['Euclidean Distance'].max() - xg_df_sorted['Euclidean Distance'].min())
xg_df_sorted['Manhattan Distance (Normalized)'] = 1- (xg_df_sorted['Manhattan Distance'] - xg_df_sorted['Manhattan Distance'].min()) / (xg_df_sorted['Manhattan Distance'].max() - xg_df_sorted['Manhattan Distance'].min())
xg_df_sorted['Target Variable (Normalized)'] = (xg_df_sorted[target_variable] - xg_df_sorted[target_variable].min()) / (xg_df_sorted[target_variable].max() - xg_df_sorted[target_variable].min())

# Calculate the weighted average of the normalized values
weights = [0.25, 0.25, 0.5]
xg_df_sorted['Combined Score'] = xg_df_sorted[['Euclidean Distance (Normalized)', 'Manhattan Distance (Normalized)', 'Target Variable (Normalized)']].dot(weights)

# Sort rows by combined score in ascending order
sorted_df_xg= xg_df_sorted.sort_values(by='Combined Score', ascending=False)

#sorted_df_xg.to_csv('Top_Wingback_xg.csv', index=False)

sorted_df_xg

Unnamed: 0.1,Unnamed: 0,Player,Position,Age,90 mins played,Non Penalty xG,Non-penalty goals,Non-Penalty(xG-Goals),Expected Assists,Assists,...,Expected Goals+Assisted Goals/90,Non Penalty xG/90,Non Penalty xG+xAG/90,weighted_sum,Euclidean Distance,Manhattan Distance,Euclidean Distance (Normalized),Manhattan Distance (Normalized),Target Variable (Normalized),Combined Score
268,273,Bruno Fernandes,"MF,FW",27.0,36.8,7.7,6,1.7,16.7,8,...,0.71,0.21,0.66,1.077514,374.678256,418.836672,0.930934,0.928782,1.000000,0.964929
346,354,Cengiz Ünder,"MF,DF",25.0,27.7,7.5,4,3.5,7.9,4,...,0.59,0.27,0.56,0.518257,293.537806,327.770585,0.946624,0.946166,0.852217,0.899306
705,720,Lorenzo Pellegrini,MF,26.0,28.3,5.7,2,3.7,6.9,5,...,0.53,0.20,0.44,0.653884,73.620675,106.472958,0.989150,0.988411,0.763547,0.876164
761,777,Alexis Mac Allister,MF,23.0,32.1,7.3,4,3.3,4.3,2,...,0.51,0.23,0.36,0.433610,130.536713,171.415232,0.978144,0.976014,0.763547,0.870313
821,839,Emi Buendía,"MF,FW",25.0,26.8,6.3,5,1.3,5.7,2,...,0.45,0.24,0.45,0.383964,192.143303,228.224878,0.966231,0.965169,0.733990,0.849845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5168,5258,Alfonso González,"MF,FW",27.0,25.3,7.9,10,-2.1,4.9,8,...,0.50,0.31,0.50,0.582505,4610.717632,4647.906337,0.111796,0.121456,0.231527,0.174077
5397,5493,Leonardo Suárez,MF,26.0,8.3,2.1,6,-3.9,1.6,2,...,0.44,0.25,0.44,0.203475,4845.791623,4917.965367,0.066339,0.069902,0.275862,0.171991
5412,5508,Uriel Antuna,"FW,MF",24.0,28.9,4.4,8,-3.6,3.3,4,...,0.27,0.15,0.27,0.352385,4860.726382,4905.318457,0.063451,0.072317,0.275862,0.171873
5421,5517,Rodrigo Huescas,"DF,MF",18.0,15.4,2.3,4,-1.7,2.2,5,...,0.29,0.15,0.29,0.132686,4869.769856,4941.168156,0.061702,0.065473,0.266010,0.164799
