In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Raw inputs. The club and player tables are narrowed to the columns used further down.
club_player_data = pd.read_csv("club_players.csv")
club_data = pd.read_csv("club.csv")[["Season", "ClubID", "avgMarketValue", "League", "avgAge"]]
player_data = pd.read_csv("players_data.csv")[
    ["player_id", "given_name", "date_of_birth", "caps", "height", "goals"]
]

# Plain-text preview of the first rows of each table, in load order.
for preview in (club_player_data, club_data, player_data):
    print(preview.head())

   Unnamed: 0          League  Season  ClubID       Player_name  PlayerID  \
0           0  Premier League    2015     631  Thibaut Courtois    108390   
1           1  Premier League    2015     631     Asmir Begovic     33873   
2           2  Premier League    2015     631    Jamal Blackman    128898   
3           3  Premier League    2015     631      Marco Amelia     16810   
4           4  Premier League    2015     631        Kurt Zouma    157509   

  Player_MarketValue Player_possition  
0           €35.00m        Goalkeeper  
1           €12.00m        Goalkeeper  
2             €250k        Goalkeeper  
3             €250k        Goalkeeper  
4           €17.00m       Centre-Back  
   Season  ClubID avgMarketValue          League  avgAge
0    2015     631        €13.84m  Premier League    24.7
1    2015     281        €13.14m  Premier League    25.4
2    2015      11        €11.17m  Premier League    24.7
3    2015     985        €10.00m  Premier League    23.5
4    2015   

In [3]:
# Integer id for every league / position label.
# NOTE: enumerate() walks every row, so each label ends up with the index of the
# LAST row it appears in, not a dense 0..n-1 code. The ids are only used as opaque
# category codes downstream, so the values are intentionally left as they are.
league_to_id = {league: row_number for row_number, league in enumerate(club_player_data["League"])}
position_to_id = {position: row_number for row_number, position in enumerate(club_player_data["Player_possition"])}

# Birth year per player, taken from the text after the first comma of date_of_birth.
# Players whose date is missing or does not parse are left out, so .get() lookups
# on this dict fall back to their default.
player_birth = {}
for player_id, date_of_birth in zip(player_data["player_id"], player_data["date_of_birth"]):
    try:
        player_birth[player_id] = int(date_of_birth.split(",")[1].replace(" ", ""))
    except (AttributeError, IndexError, ValueError):
        # AttributeError: value is not a string (e.g. NaN); IndexError: no comma;
        # ValueError: the text after the comma is not an integer.
        continue

# Height per player (missing heights are kept as NaN values).
player_height = dict(zip(player_data["player_id"], player_data["height"]))

# Average squad age per club, truncated to whole years. Rows without an age are
# skipped because int(NaN) raises.
# NOTE(review): keyed by ClubID only, so the LAST row of a club wins here while the
# market-value table below keeps the FIRST row of a club - confirm which season is intended.
club_average_age = {
    club_id: int(average_age)
    for club_id, average_age in zip(club_data["ClubID"], club_data["avgAge"])
    if not pd.isnull(average_age)
}

# Average market value per club in euros. Strings such as "€13.84m" or "€250k" are
# split into a numeric amount and a unit suffix; text without digits (or a missing
# value) becomes NaN, and an amount without a suffix is taken at face value.
club_value_parts = club_data["avgMarketValue"].astype(str).str.extract(r"(\d+(?:\.\d+)?)\s*(bn|m|k)?")
club_value_amount = pd.to_numeric(club_value_parts[0], errors="coerce")
club_value_multiplier = club_value_parts[1].map({"bn": 10 ** 9, "m": 10 ** 6, "k": 10 ** 3}).fillna(1)

clubs_average_market_value = {}
for club_id, market_value in zip(club_data["ClubID"], club_value_amount * club_value_multiplier):
    # setdefault keeps the first value seen for a club.
    clubs_average_market_value.setdefault(club_id, market_value)

# Goal totals per player; players without a recorded value are omitted entirely.
players_with_goals = player_data.dropna(subset=["goals"])
player_goals = dict(zip(players_with_goals["player_id"], players_with_goals["goals"]))

In [4]:
# Running tallies per position, plus the set of seasons each player appears in.
rows_per_position = {}
goals_per_position = {}
seasons_seen_per_player = {}

row_fields = zip(
    club_player_data["Player_possition"],
    club_player_data["PlayerID"],
    club_player_data["Season"],
)
for position, player_id, season in row_fields:
    rows_per_position[position] = rows_per_position.get(position, 0) + 1
    # Players without a known goal total contribute zero.
    goals_per_position[position] = goals_per_position.get(position, 0) + player_goals.get(player_id, 0)
    seasons_seen_per_player.setdefault(player_id, set()).add(season)

# Mean goals per squad row for every position; the "Attack" label is discarded if present.
average_goal_per_position = {
    position: goals_per_position[position] / row_count
    for position, row_count in rows_per_position.items()
}
average_goal_per_position.pop("Attack", None)

# Positions averaging below one goal count as defensive, above one as attacking
# (an average of exactly one falls into neither group).
defensive_positions = {}
attacking_positions = {}
for position, average in average_goal_per_position.items():
    if average < 1:
        defensive_positions[position] = position_to_id[position]
    elif average > 1:
        attacking_positions[position] = position_to_id[position]

# Number of distinct seasons per player.
total_season_per_player = {
    player_id: len(seasons) for player_id, seasons in seasons_seen_per_player.items()
}

In [5]:
# Display the per-position goal averages as a JSON string.
json.dumps(average_goal_per_position)

'{"Goalkeeper": 0.0, "Centre-Back": 0.8371434792074897, "Left-Back": 0.5801566579634465, "Right-Back": 0.5931873479318734, "Defensive Midfield": 0.8880382775119617, "Central Midfield": 1.9294593813240821, "Right Midfield": 1.899441340782123, "Attacking Midfield": 3.348051948051948, "Left Winger": 3.905009759271308, "Right Winger": 3.710526315789474, "Centre-Forward": 6.5186230248307, "midfield": 0.46153846153846156, "Left Midfield": 2.9791666666666665, "Second Striker": 5.968503937007874, "Defender": 0.0}'

In [6]:
# Display the positions classified as defensive, with their ids, as a JSON string.
json.dumps(defensive_positions)

'{"Goalkeeper": 25863, "Centre-Back": 25867, "Left-Back": 25870, "Right-Back": 25874, "Defensive Midfield": 25878, "midfield": 25498, "Defender": 24963}'

In [7]:
# Display the positions classified as attacking, with their ids, as a JSON string.
json.dumps(attacking_positions)

'{"Central Midfield": 25883, "Right Midfield": 21010, "Attacking Midfield": 25885, "Left Winger": 25887, "Right Winger": 25891, "Centre-Forward": 25900, "Left Midfield": 25509, "Second Striker": 25403}'

In [None]:
# Build the modelling table with one row per (player, season) squad entry.
# Every feature is looked up with Series.map, which yields NaN for unknown keys,
# and the frame is created in one shot from the resulting columns.

player_ids = club_player_data["PlayerID"]
club_ids = club_player_data["ClubID"]
seasons = club_player_data["Season"]

# Market value in euros. Strings such as "€35.00m" or "€250k" are split into a
# numeric amount and a unit suffix; text without digits (or a missing value)
# becomes NaN, and an amount without a suffix is taken at face value.
player_value_parts = (
    club_player_data["Player_MarketValue"]
    .astype(str)
    .str.extract(r"(\d+(?:\.\d+)?)\s*(bn|m|k)?")
)
player_value_amount = pd.to_numeric(player_value_parts[0], errors="coerce")
player_value_multiplier = player_value_parts[1].map({"bn": 10 ** 9, "m": 10 ** 6, "k": 10 ** 3}).fillna(1)

# Age is season minus birth year. When the birth year is unknown the club's
# average age is used directly as the age (it is already an age, not a year,
# so it must not be subtracted from the season).
birth_year = player_ids.map(player_birth)
age = (seasons - birth_year).fillna(club_ids.map(club_average_age))

cleaned_data = pd.DataFrame({
    "Player": player_ids,
    "Season": seasons,
    "Player_Marketvalue": player_value_amount * player_value_multiplier,
    "Position": club_player_data["Player_possition"].map(position_to_id),
    "League": club_player_data["League"].map(league_to_id),
    "Age": age,
    "Height": player_ids.map(player_height),
    # Goal total spread evenly over the seasons the player appears in.
    "Goals": player_ids.map(player_goals) / player_ids.map(total_season_per_player),
    "Club_Average_Marketvalue": club_ids.map(clubs_average_market_value),
})

# Rows whose position has no id are discarded; the remaining ids are whole numbers again.
cleaned_data = (
    cleaned_data
    .dropna(subset=["Position"])
    .astype({"Position": int})
    .reset_index(drop=True)
)

In [None]:
# Mean of the per-season goal figure, separately for attacking and defensive position ids.
plays_attack = cleaned_data["Position"].isin(attacking_positions.values())
plays_defense = cleaned_data["Position"].isin(defensive_positions.values())

attacker_goals_mean = cleaned_data.loc[plays_attack, "Goals"].mean()
defender_goals_mean = cleaned_data.loc[plays_defense, "Goals"].mean()

print(f"Attack Position Mean Goals Per Season: {attacker_goals_mean}\nDefense Position Mean Goals Per Season: {defender_goals_mean}")

In [None]:
# Replace missing values with a per-column mean.
height_mean = cleaned_data["Height"].mean()
player_market_value_mean = cleaned_data["Player_Marketvalue"].mean()
age_mean = cleaned_data["Age"].mean()
# The goals fill value is the mean over rows with more than one goal per season only.
goals_mean = cleaned_data.loc[cleaned_data["Goals"] > 1, "Goals"].mean()
club_average_marketvalue_mean = cleaned_data["Club_Average_Marketvalue"].mean()

# A single frame-level fillna: calling fillna(inplace=True) on a selected column
# operates on a temporary object and leaves the frame untouched under pandas
# Copy-on-Write.
cleaned_data = cleaned_data.fillna(value={
    "Height": height_mean,
    "Player_Marketvalue": player_market_value_mean,
    "Age": age_mean,
    "Goals": goals_mean,
    "Club_Average_Marketvalue": club_average_marketvalue_mean,
})
cleaned_data.head()

In [None]:
# Identifier columns are not model features. errors="ignore" keeps the cell
# re-runnable: a second execution finds the columns already gone instead of raising KeyError.
cleaned_data = cleaned_data.drop(columns=["Player", "Season"], errors="ignore")
cleaned_data.head()

In [None]:
# Summary statistics with one row per column.
cleaned_data.describe().T

In [None]:
def train_and_predict_with_random_forest(data_set_name, data, label, n_estimators = 1000, train_porpotion=0.9, seed=42):
    """Fit a random forest on a random split of ``data`` and report its test error.

    Each row is assigned to the training set with probability ``train_porpotion``.
    Feature columns are standardized with the mean and standard deviation of the
    training rows only. The label column is left in its original units, so the
    mean absolute percentage error is measured against real label values rather
    than against z-scores that lie close to zero.

    Parameters
    ----------
    data_set_name : str
        Name used only in the printed progress messages.
    data : pandas.DataFrame
        Feature columns plus the ``label`` column. The frame is not modified.
    label : str
        Name of the column to predict.
    n_estimators : int
        Number of trees in the forest.
    train_porpotion : float
        Probability that a row lands in the training set.
    seed : int or None
        Seed for both the train/test split and the forest. ``None`` gives a
        different split and forest on every call.

    Returns
    -------
    float
        Mean absolute percentage error on the test rows.

    Raises
    ------
    ValueError
        If the random split leaves the training or the test set empty.
    """
    print(f"head of {data_set_name}:\n{data.head()}")
    print("\n" * 5)

    # A local generator makes the split reproducible without touching NumPy's global RNG state.
    rng = np.random.default_rng(seed)
    mask = rng.random(len(data)) < train_porpotion
    train = data[mask]
    test = data[~mask]
    if train.empty or test.empty:
        raise ValueError(
            f"Random split of {len(data)} rows produced {len(train)} train and "
            f"{len(test)} test rows; both sets must be non-empty."
        )

    print(f"Total Data Consecutively: Total: {len(data)}, Train: {len(train)}, Test: {len(test)}")
    print("\n" * 5)

    # Separate the label before scaling so that only the features are standardized.
    train_labels = train[label]
    test_labels = test[label]
    train_features = train.drop([label], axis=1)
    test_features = test.drop([label], axis=1)

    print("Normalizing Train and Test...")
    feature_mean = train_features.mean()
    # A constant column has zero spread; dividing by 1 instead maps it to 0 rather than NaN.
    feature_std = train_features.std().replace(0, 1)

    train_features = (train_features - feature_mean) / feature_std
    test_features = (test_features - feature_mean) / feature_std

    print(f"Train:\n{train_features.head()}")
    print("\n" * 3)
    print(f"Test:\n{test_features.head()}")
    print("\n" * 5)

    random_forest = RandomForestRegressor(n_estimators=n_estimators, random_state=seed)

    print("Fitting labels based on features...")
    random_forest.fit(train_features, train_labels)

    print("Predicting test features...")
    predictions = random_forest.predict(test_features)
    print(f"Predictions: {predictions}")
    print("\n" * 5)

    error = mean_absolute_percentage_error(test_labels, predictions)
    print(f"Prediction Accuracy based on mean absolute percentage error: {error}")
    return error

In [None]:
# One modelling frame per role; the defensive frame does not keep the Goals column.
position_ids = cleaned_data["Position"]
attacker_positions_data = cleaned_data.loc[position_ids.isin(attacking_positions.values())]
defensive_positions_data = (
    cleaned_data
    .loc[position_ids.isin(defensive_positions.values())]
    .drop(columns=["Goals"])
)

In [None]:
# Predict market value for the attacking positions.
train_and_predict_with_random_forest(
    data_set_name="Attacker Data Set",
    data=attacker_positions_data,
    label="Player_Marketvalue",
)

In [None]:
# Predict market value for the defensive positions.
train_and_predict_with_random_forest(
    data_set_name="Defenders Data Set",
    data=defensive_positions_data,
    label="Player_Marketvalue",
)