In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [51]:
vague_position_model_data = pd.read_csv('vague_position_model_data.csv')

In [52]:
vague_position_model_data.head(20)

Unnamed: 0,Age,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,...,TOTY_Nominee,League_bundesliga,League_laliga,League_ligue-1,League_premier-league,League_serie-a,Position_Attacker,Position_Defender,Position_Goalkeeper,Position_Midfielder
0,26,52,49,0,0,3,0,0,0,0,...,0,False,True,False,False,False,False,False,True,False
1,32,51,11,0,0,1,0,0,0,0,...,0,False,True,False,False,False,False,False,True,False
2,21,51,23,0,0,3,0,1,7,0,...,0,False,True,False,False,False,False,True,False,False
3,21,50,45,3,1,7,2,0,1,1,...,0,False,True,False,False,False,False,True,False,False
4,29,41,37,3,3,7,1,0,1,8,...,0,False,True,False,False,False,False,True,False,False
5,30,49,45,1,3,8,0,0,4,6,...,1,False,True,False,False,False,False,True,False,False
6,34,49,30,1,2,0,0,0,8,4,...,0,False,True,False,False,False,False,True,False,False
7,25,54,54,5,14,13,0,0,2,4,...,0,False,True,False,False,False,False,True,False,False
8,28,39,32,0,1,6,0,2,1,2,...,0,False,True,False,False,False,False,True,False,False
9,19,53,41,3,1,12,0,0,10,8,...,0,False,True,False,False,False,False,False,False,True


In [53]:
#Probability Classifier
# Define the cutoff for training and test data
train_cutoff = season_mapping['2019/2020']
test_season = season_mapping['2023/2024']

# Separate the data into training and test sets
train_data = vague_position_model_data[vague_position_model_data['Season'] <= train_cutoff]
test_data = vague_position_model_data[vague_position_model_data['Season'] == test_season]

# Drop the TOTY and TOTY_Nominee columns, as instructed
X_train = train_data.drop(columns=['TOTY', 'TOTY_Nominee'])
y_train = train_data['TOTY_Nominee']
X_test = test_data.drop(columns=['TOTY', 'TOTY_Nominee'])
y_test = test_data['TOTY_Nominee']

print("Training data and test data separated successfully.")


# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

print("Model trained successfully.")

# Evaluate the model
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
train_precision = precision_score(y_train, y_train_pred)
test_precision = precision_score(y_test, y_test_pred)
train_recall = recall_score(y_train, y_train_pred)
test_recall = recall_score(y_test, y_test_pred)
train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f'Train Accuracy: {train_accuracy:.2f}, Precision: {train_precision:.2f}, Recall: {train_recall:.2f}, F1 Score: {train_f1:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}, Precision: {test_precision:.2f}, Recall: {test_recall:.2f}, F1 Score: {test_f1:.2f}')

# Predict probabilities on the test set
probabilities = model.predict_proba(X_test_scaled)[:, 1]  # Get the probabilities of class 1

# Assuming one-hot encoded position columns
position_columns = ['Position_Goalkeeper', 'Position_Defender', 'Position_Midfielder', 'Position_Attacker']

# Combine probabilities with the original test data to find the positions
test_data['Probability'] = probabilities

# Define the top N for each position category
top_n = {
    'Goalkeeper': 1,
    'Defender': 4,
    'Midfielder': 3,
    'Attacker': 3
}

# Function to print top N players for a given one-hot encoded position
def print_top_n_for_position(position_prefix, n):
    filtered_data = test_data[test_data[position_prefix] == 1]
    top_n_players = filtered_data.nlargest(n, 'Probability')
    print(f"\nTop {n} {position_prefix.replace('Position_', '')}s:")
    print(top_n_players[['Probability', 'Appearances', 'Goals', 'Assists', 'Placement']])

# Print the top probabilities for each position
for position, n in top_n.items():
    position_prefix = f'Position_{position}'
    print_top_n_for_position(position_prefix, n)


Training data and test data separated successfully.
Model trained successfully.
Train Accuracy: 0.98, Precision: 0.65, Recall: 0.24, F1 Score: 0.36
Test Accuracy: 0.97, Precision: 0.50, Recall: 0.07, F1 Score: 0.12

Top 1 Goalkeepers:
      Probability  Appearances  Goals  Assists  Placement
6466     0.444931           36      0        0          2

Top 4 Defenders:
       Probability  Appearances  Goals  Assists  Placement
6447      0.662148           48      2        3          1
19032     0.653132           51     12       20          1
6451      0.576436           41      6        5          1
6453      0.437400           45      0        1          1

Top 3 Midfielders:
       Probability  Appearances  Goals  Assists  Placement
6454      0.787075           42     23       13          1
13215     0.721335           50      9       14          1
6455      0.460035           54      3        8          1

Top 3 Attackers:
       Probability  Appearances  Goals  Assists  Placement
132

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Probability'] = probabilities


In [54]:
comprehensive_vague_positions_df = pd.read_csv('comprehensive_vague_positions_df.csv')

In [57]:
# Function to print top N players for a given one-hot encoded position
def print_top_n_for_position(position_prefix, n):
    filtered_data = test_data[test_data[position_prefix] == 1]
    top_n_players = filtered_data.nlargest(n, 'Probability')
    print(f"\nTop {n} {position_prefix.replace('Position_', '')}s:")
    for _, row in top_n_players.iterrows():
        players = find_players(row['Appearances'], row['Goals'], row['Assists'], row['Placement'], '2023/2024', comprehensive_vague_positions_df)
        if not players.empty:
            for _, player_row in players.iterrows():
                print(f"Player: {player_row['Player']}, Probability: {row['Probability']:.2f}, Appearances: {row['Appearances']}, Goals: {row['Goals']}, Assists: {row['Assists']}, Placement: {row['Placement']}")

# Print the top probabilities for each position
for position, n in top_n.items():
    position_prefix = f'Position_{position}'
    print_top_n_for_position(position_prefix, n)


Top 1 Goalkeepers:
Player: marc-andre ter stegen, Probability: 0.44, Appearances: 36, Goals: 0, Assists: 0, Placement: 2

Top 4 Defenders:
Player: antonio rudiger, Probability: 0.66, Appearances: 48, Goals: 2, Assists: 3, Placement: 1
Player: alejandro grimaldo, Probability: 0.65, Appearances: 51, Goals: 12, Assists: 20, Placement: 1
Player: daniel carvajal, Probability: 0.58, Appearances: 41, Goals: 6, Assists: 5, Placement: 1
Player: nacho fernandez, Probability: 0.44, Appearances: 45, Goals: 0, Assists: 1, Placement: 1

Top 3 Midfielders:
Player: jude bellingham, Probability: 0.79, Appearances: 42, Goals: 23, Assists: 13, Placement: 1
Player: rodri, Probability: 0.72, Appearances: 50, Goals: 9, Assists: 14, Placement: 1
Player: federico valverde, Probability: 0.46, Appearances: 54, Goals: 3, Assists: 8, Placement: 1

Top 3 Attackers:
Player: erling haaland, Probability: 0.70, Appearances: 45, Goals: 38, Assists: 6, Placement: 1
Player: vinicius junior, Probability: 0.67, Appearance

In [41]:
# Function to find players based on input criteria
def find_players(appearances, goals, assists, placement, season, df):
    # Filter the DataFrame based on the input criteria
    filtered_df = df[
        (df['Appearances'] == appearances) & 
        (df['Goals'] == goals) & 
        (df['Assists'] == assists) & 
        (df['Placement'] == placement) & 
        (df['Season'] == season)
    ]
    # Return the filtered DataFrame with all original columns
    return filtered_df

# Example usage
example_appearances = 54
example_goals = 3
example_assists = 8
example_placement = 1
example_season = '2023/2024'

result = find_players(example_appearances, example_goals, example_assists, example_placement, example_season, comprehensive_vague_positions_df)
result

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
10140,federico valverde,24,Midfielder,15,Uruguay,55,54,3,8,2,...,4280.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,1


In [None]:
#Add a function that will go over your results and search them so you automatically get the names