# Extreme Gradient Boosting (XGBoost)


### Initialization

In [None]:

from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from utility import start_xgb
from utility import train
from utility import display


# Load "EncodedData.csv" through the project's start_xgb helper. It yields five
# values, unpacked below (presumably the train/test features, the train/test
# targets and the source frame -- confirm against utility.start_xgb).
_splits = start_xgb("EncodedData.csv")
X_train, X_test, y_train, y_test, df = _splits
# Trailing expression so the notebook renders the first rows of the test features.
X_test.head()


In [None]:
# Hyperparameters for the scikit-learn style XGBRegressor wrapper.
# The L1/L2 regularisation terms use the wrapper's documented names
# (`reg_alpha`, `reg_lambda`) instead of the native-booster aliases
# (`alpha`, `lambda`). The aliases only reach the booster through **kwargs,
# a path the XGBoost docs flag as unsupported by scikit-learn tooling
# (get_params / clone / search utilities).
params = {
    'n_estimators': 3392,
    'max_depth': 10,
    'learning_rate': 0.02,
    'subsample': 0.89,
    'colsample_bytree': 0.88,
    'gamma': 0.00,
    'reg_alpha': 0.59,
    'reg_lambda': 0.25,
    'min_child_weight': 1,
}
# Fixed seed so the row/column subsampling configured above is reproducible.
model = XGBRegressor(**params, random_state=42)
# `train` is the project's helper from utility; it returns three values,
# unpacked here (the "_original" pair is presumably on the un-transformed
# target scale -- confirm against utility.train).
y_pred, y_pred_original, y_test_original = train(model, X_train, X_test, y_train, y_test)


In [None]:
# `display` here is the helper imported from utility (the notebook's own
# built-in of the same name is shadowed by that import). It yields two values:
# `results`, shown below, and `z`, which the plotting cells index by position.
_display_out = display(df, X_test, y_pred)
results, z = _display_out
# Trailing expression so the notebook renders `results`.
results

In [None]:
# Absolute error between predicted and real values (plotted below in EUR).
distances = np.abs(y_pred_original - y_test_original)
# Positions of the 10 largest errors (argsort is ascending, so take the tail).
max_distance_indices = np.argsort(distances)[-10:]
# Rows of `z` for those positions.
# NOTE(review): assumes `z` is in the same positional order as the prediction
# arrays -- confirm against utility.display.
furthest_rows = z.iloc[max_distance_indices]
furthest_names = furthest_rows['Name'].tolist()

# Axis extremes, computed once and reused for the diagonal and the text anchors.
real_min, real_max = np.min(y_test_original), np.max(y_test_original)
pred_min, pred_max = np.min(y_pred_original), np.max(y_pred_original)

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(y_test_original, y_pred_original, s=20, color='blue', alpha=0.5, marker='o', label='Predicted Values')
plt.xlabel("Real Value (EUR)")
plt.ylabel("Predicted Value (EUR)")
plt.title("Real Value vs Predicted Value")
# Points on this diagonal are predicted exactly; above it the model overestimates.
plt.plot([real_min, real_max], [real_min, real_max], color='red', label='Perfect Prediction Line')
plt.text(real_min, pred_max, "Overestimation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(real_max, pred_min, "Underestimation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')

# Annotate exactly the selected rows. Matching on 'Name' across all of `z`
# would also label any other player who happens to share a name with one of
# them, and would scan the whole frame row by row.
for name, real_value, predicted_value in zip(
    furthest_rows['Name'],
    furthest_rows['Market Value'],
    furthest_rows['Predicted Value'],
):
    plt.annotate(name, (real_value, predicted_value), fontsize=6, color='black')

def millions_formatter(x, pos):
    """Format tick value *x* as whole millions (e.g. 25000000 -> '25M').

    *pos* is the tick position supplied by FuncFormatter; it is not used.
    """
    return f'{x / 1000000:.0f}M'

formatter = FuncFormatter(millions_formatter)
ax = plt.gca()
ax.xaxis.set_major_formatter(formatter)
ax.yaxis.set_major_formatter(formatter)

plt.grid(alpha=0.2, linestyle='--', zorder=1)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np

# Absolute error of each prediction, i.e. its distance from the perfect-prediction line.
distances = np.abs(y_pred_original - y_test_original)

# Positions of the 30 largest errors (argsort is ascending, so take the tail).
# Change the slice size to label more or fewer players.
max_distance_indices = np.argsort(distances)[-30:]

# Rows of `z` for those positions.
# NOTE(review): assumes `z` is in the same positional order as the prediction
# arrays -- confirm against utility.display.
furthest_rows = z.iloc[max_distance_indices]
furthest_names = furthest_rows['Name'].tolist()

# Axis extremes, computed once and reused for the diagonal and the text anchors.
true_min, true_max = np.min(y_test_original), np.max(y_test_original)
pred_min, pred_max = np.min(y_pred_original), np.max(y_pred_original)

# Plot
plt.figure(figsize=(12, 6))
plt.scatter(y_test_original, y_pred_original, s=20, color='blue', alpha=0.5, marker='o', label='Predicted Values')
plt.xlabel("True Value (EUR)")
plt.ylabel("Predicted Value (EUR)")
plt.title("True Value vs Predicted Value (XGBoost)")
# Points on this diagonal are predicted exactly; above it the model overestimates.
plt.plot([true_min, true_max], [true_min, true_max], color='red', label='Perfect Prediction Line')
plt.text(true_min, pred_max, "Overestimation", fontsize=10, color='green', verticalalignment='top', horizontalalignment='left')
plt.text(true_max, pred_min, "Underestimation", fontsize=10, color='orange', verticalalignment='bottom', horizontalalignment='right')

# Annotate only the furthest points
for name, true_value, predicted_value in zip(
    furthest_rows['Name'],
    furthest_rows['Market Value'],
    furthest_rows['Predicted Value'],
):
    plt.annotate(name, (true_value, predicted_value), fontsize=6, color='black')

# Tick formatter shared by the x-axis and y-axis
def millions_formatter(x, pos):
    """Format tick value *x* as whole millions (e.g. 25000000 -> '25M').

    *pos* is the tick position supplied by FuncFormatter; it is not used.
    """
    return f'{x / 1000000:.0f}M'

# Identify the 5% most valuable players. The fraction lives in one place so the
# selection and the legend label cannot drift apart.
top_fraction = 0.05
# With fewer than 20 rows int(len(z) * 0.05) is 0, which would select nothing
# and make the .iloc[0] lookup below fail; always keep at least one row.
top_count = max(1, int(len(z) * top_fraction))
top_5_percent = z.nlargest(top_count, 'Market Value')
top_5_percent = top_5_percent.sort_values(by='Market Value')
# After the ascending sort the first row is the cheapest player in the top group.
lowest_value_player = top_5_percent.iloc[0]['Market Value']
# Add vertical line at the market value where the top 5% begins
print(lowest_value_player)
plt.axvline(x=lowest_value_player, color='gray', linestyle='--', label=f'{top_fraction:.0%} Most Valuable Players', alpha=0.5)

formatter = FuncFormatter(millions_formatter)
ax = plt.gca()
ax.xaxis.set_major_formatter(formatter)
ax.yaxis.set_major_formatter(formatter)

plt.grid(alpha=0.2, linestyle='--', zorder=1)
plt.legend()
plt.tight_layout()
plt.show()
