# Plotting Tree Depth with Extreme Gradient Boosting (XGBoost)


### Initialization

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error


# --- Data loading and feature selection ------------------------------------
filename = "OneHotEncodedData"

target_value = "Logarithmic Market Value"
# Columns removed from the feature matrix. The target column is listed too so
# it never ends up among the model inputs.
ignored_features = ["Club Goals", "Goal Difference", "Goals/Game", "Yellow Cards", "Assists/Game", "Red Cards", "Market Value", "Name", target_value]

df = pd.read_csv(f"../DataSets/EncodedData/{filename}.csv", sep=",", encoding="UTF-8")

X = df.drop(columns=ignored_features)
y = df[target_value]
# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y, test_size=0.10, random_state=42)

# --- Model fit ---------------------------------------------------------------
model = XGBRegressor(n_estimators=2000, max_depth=7, learning_rate=0.25, random_state=42)
model.fit(X_train, y_train_log)

# The model is fit on the log-scaled target; 10**x maps targets and
# predictions back so the error metrics below are on the un-logged scale.
y_train = np.power(10, y_train_log)
y_test = np.power(10, y_test_log)

# --- Training-set scores -----------------------------------------------------
y_train_pred_log = model.predict(X_train)
y_train_pred = np.power(10, y_train_pred_log)

mae_train = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
mape_train = mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred)

print("Training Set Scores:")
print("Mean Absolute Error (MAE):", round(mae_train / 1000000, 2), "M")
print("Mean Absolute Percentage Error (MAPE):", round(mape_train*100, 2), "%")

# --- Test-set scores ---------------------------------------------------------
y_pred_log = model.predict(X_test)
y_pred = np.power(10, y_pred_log)

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

print("\nTest Set Scores:")
print("Mean Absolute Error (MAE):", round(mae / 1000000, 2), "M")
print("Mean Absolute Percentage Error (MAPE):", round(mape*100, 2), "%")

# One-line summary with "&" separators and a literal backslash before "%"
# (looks like a LaTeX table row). The backslash is escaped explicitly: a bare
# "\%" is an invalid escape sequence and triggers a SyntaxWarning on recent
# Python versions. The printed text is unchanged.
print(round(mae / 1000000, 2), "M &", round(mape*100, 2), "\\% &", round(mae_train / 1000000, 2), "M &", round(mape_train*100, 2), "\\%")

In [None]:
# Sweep the maximum tree depth over 1..14 and collect, for every depth, the
# two values returned by `train` (stored as test MAE and training MAE).
# NOTE(review): `train` is not defined in the visible part of this notebook.
# It is handed y_train / y_test (the un-logged targets), not the *_log
# series the first model was fit on -- confirm that is what it expects.
# NOTE: `model`, `mae` and `mae_train` are rebound on every iteration, so after
# this cell they refer to the last depth tried.
mae_values = []
mae_train_values = []

for depth in np.arange(1, 15):
    model = XGBRegressor(n_estimators=1000, max_depth=depth, random_state=42, learning_rate=0.25)
    mae, mae_train = train(model, X_train, X_test, y_train, y_test)
    print(mae, mae_train)
    mae_values.append(mae)
    mae_train_values.append(mae_train)

In [None]:
# No earlier cell imports matplotlib, so import it here to keep the notebook
# runnable from top to bottom.
import matplotlib.pyplot as plt

# Plot test-set and training-set MAE against the swept maximum tree depth.
# The x values mirror the range used by the sweep loop (1..14).
depths = np.arange(1, 15)
plt.plot(depths, mae_values, label='MAE on Test Set')
plt.plot(depths, mae_train_values, label='MAE on Training Set')
# The swept hyper-parameter is max_depth (learning_rate is fixed at 0.25),
# so the axis label and title refer to tree depth.
plt.xlabel('Max tree depth')
plt.ylabel('Mean Absolute Error')
plt.title('MAE vs Max Tree Depth')
plt.legend()
plt.show()

In [None]:
# Build the result tables from the full frame, the test features and the
# test predictions.
# NOTE(review): `display` must return a two-element result here; it is not
# defined in the visible part of this notebook (presumably a project helper
# shadowing IPython's `display`) -- confirm where it comes from.
# Later cells read the columns 'name', 'market_value_in_eur' and
# 'Predicted_Value' from `z`.
results, z = display(df, X_test, y_pred)
# Bare expression: rendered as the cell output by the notebook.
results


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Absolute error of every test prediction (distance from the perfect line).
# y_test / y_pred already hold un-logged values (they were built with 10**log
# in the training cell), so they are used directly; the previously referenced
# y_test_original / y_pred_original are never defined in this notebook.
# np.asarray drops any pandas index so argsort yields plain row positions.
distances = np.abs(np.asarray(y_pred) - np.asarray(y_test))

# Positions of the 10 largest errors (adjust the slice to label more or fewer)
max_distance_indices = np.argsort(distances)[-10:]

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, s=20, color='blue', alpha=0.5, marker='o', label='Predicted Values')
plt.xlabel("Real Value (EUR)")
plt.ylabel("Predicted Value (EUR)")
plt.title("Real Value vs Predicted Value")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', label='Perfect Prediction Line')
plt.text(min(y_test), max(y_pred), "Overestimation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(max(y_test), min(y_pred), "Underestimation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')

# Annotate only the furthest points, selected by position so that players who
# share a name with an outlier are not labelled by accident.
# NOTE(review): assumes the rows of `z` are in the same order as y_test -- confirm.
for _, row in z.iloc[max_distance_indices].iterrows():
    plt.annotate(row['name'], (row['market_value_in_eur'], row['Predicted_Value']), fontsize=6, color='black')

# Set formatter for x-axis and y-axis
def millions_formatter(x, pos):
    """Return *x* expressed in whole millions followed by ``M``.

    *pos* is accepted because the function is handed to ``FuncFormatter``;
    it is not used.
    """
    millions = x / 1000000
    return format(millions, '.0f') + 'M'

# Show both axes in millions
formatter = FuncFormatter(millions_formatter)
ax = plt.gca()
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(formatter)

# Light dashed grid, legend, then render the figure
plt.grid(alpha=0.2, linestyle='--', zorder=1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Sanity check: true and predicted training targets should have equal shapes.
# y_train already holds un-logged values (it was built as 10**y_train_log in
# the training cell), so it must not be raised to a power of 10 a second time.
# The matching predictions are stored as y_train_pred.
print(y_train.shape, y_train_pred.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Training-set counterpart of the real-vs-predicted scatter plot.
# y_train / y_train_pred already hold un-logged values (built with 10**log in
# the training cell). Applying 10** again would double-apply the inverse
# transform, so they are plotted as they are.

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(y_train, y_train_pred, s=20, color='blue', alpha=0.5, marker='o', label='Predicted Values')
plt.xlabel("Real Value (EUR)")
plt.ylabel("Predicted Value (EUR)")
plt.title("Real Value vs Predicted Value")
# The diagonal spans the range of the true values (the x-axis), as in the
# test-set plots.
plt.plot([min(y_train), max(y_train)], [min(y_train), max(y_train)], color='red', label='Perfect Prediction Line')
plt.text(min(y_train), max(y_train_pred), "Overestimation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(max(y_train), min(y_train_pred), "Underestimation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')

# Player names are not annotated here: `z` is built from X_test, so it
# appears to describe the test split only.

# Set formatter for x-axis and y-axis
def millions_formatter(x, pos):
    """Render tick value *x* as whole millions with an ``M`` suffix.

    *pos* is part of the ``FuncFormatter`` callback signature and is ignored.
    """
    return '%.0fM' % (x / 1000000)

# Show both axes in millions
formatter = FuncFormatter(millions_formatter)
ax = plt.gca()
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(formatter)

# Light dashed grid, legend, then render the figure
plt.grid(alpha=0.2, linestyle='--', zorder=1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# True vs. predicted values on the test split: small, semi-transparent blue dots
plt.scatter(y_test, y_pred, s=20, color='blue', alpha=0.5)
plt.title("Value vs Predicted value")
plt.xlabel("Real value")
plt.ylabel("Predicted value")

# Reference diagonal (prediction == real value) across the range of real values
lo_true, hi_true = min(y_test), max(y_test)
plt.plot([lo_true, hi_true], [lo_true, hi_true], color='red')

# Corner labels: above the diagonal the model over-evaluates, below it under-evaluates
plt.text(lo_true, max(y_pred), "Over evaluation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(hi_true, min(y_pred), "Under evaluation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')
plt.grid(alpha=0.2, zorder=1)

plt.show()


In [None]:
import numpy as np

# Absolute error of every test prediction (distance from the perfect line).
# y_test / y_pred already hold un-logged values (they were built with 10**log
# in the training cell), so they are used directly; the previously referenced
# y_test_original / y_pred_original are never defined in this notebook.
# np.asarray drops any pandas index so argsort yields plain row positions.
distances = np.abs(np.asarray(y_pred) - np.asarray(y_test))

# Positions of the 30 largest errors (adjust the slice to label more or fewer)
max_distance_indices = np.argsort(distances)[-30:]

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, s=20, color='blue', alpha=0.5, marker='o', label='Predicted Values')
plt.xlabel("True Value (EUR)")
plt.ylabel("Predicted Value (EUR)")
plt.title("True Value vs Predicted Value (XGBoost)")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', label='Perfect Prediction Line')
plt.text(min(y_test), max(y_pred), "Overestimation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(max(y_test), min(y_pred), "Underestimation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')

# Annotate only the furthest points.
# NOTE(review): assumes the rows of `z` are in the same order as y_test -- confirm.
for index in max_distance_indices:
    row = z.iloc[index]
    plt.annotate(row['name'], (row['market_value_in_eur'], row['Predicted_Value']), fontsize=6, color='black')

# Set formatter for x-axis and y-axis
def millions_formatter(x, pos):
    """Convert tick value *x* to a label in whole millions, e.g. ``5M``.

    *pos* is accepted for the ``FuncFormatter`` callback signature only.
    """
    return '{:.0f}M'.format(x / 1000000)

# Identify the 5% most valuable players. At least one row is always taken so
# that a frame with fewer than 20 rows does not produce an empty selection.
top_count = max(1, int(len(z) * 0.05))
top_5_percent = z.nlargest(top_count, 'market_value_in_eur')
# The smallest market value inside that group is the threshold of the top 5%.
lowest_value_player = top_5_percent['market_value_in_eur'].min()
# Add a vertical line at that threshold
print(lowest_value_player)
plt.axvline(x=lowest_value_player, color='gray', linestyle='--', label='5% Most Valuable Players', alpha=0.5)

# Show both axes in millions
formatter = FuncFormatter(millions_formatter)
plt.gca().xaxis.set_major_formatter(formatter)
plt.gca().yaxis.set_major_formatter(formatter)

plt.grid(alpha=0.2, linestyle='--', zorder=1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# plot_importance is a module-level function of the xgboost package, not an
# attribute of XGBRegressor, so it is imported and called directly.
from xgboost import plot_importance

# NOTE(review): `model` is whichever regressor was assigned last; the depth
# sweep cell rebinds it on every iteration -- confirm this is the intended model.
plot_importance(model)