In [13]:
data = pd.read_excel("Cleaned_Merged_PCA_KM_with_Impact_Score_Till_2021.xlsx")

In [14]:
data.isnull().sum()

PLAYER           0
PCA_1            0
PCA_2            0
PCA_3            0
PCA_4            0
PCA_5            0
PCA_6            0
PCA_7            0
PCA_8            0
PCA_9            0
PCA_10           0
PCA_11           0
PCA_12           0
PCA_13           0
clusters         0
Impact_Score    19
dtype: int64

In [15]:
# Replace null values with 0
data = data.fillna(0)

# Verify that there are no null values left
print(data.isnull().sum())


PLAYER          0
PCA_1           0
PCA_2           0
PCA_3           0
PCA_4           0
PCA_5           0
PCA_6           0
PCA_7           0
PCA_8           0
PCA_9           0
PCA_10          0
PCA_11          0
PCA_12          0
PCA_13          0
clusters        0
Impact_Score    0
dtype: int64


In [None]:
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from itertools import combinations
import numpy as np

warnings.filterwarnings("ignore")
# Separate features and target
features = [f"PCA_{i}" for i in range(1, 14)] + ["clusters"]  # PCA features + cluster
target = "Impact_Score"

X = data[features]
y = data[target]

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=20)
rf.fit(X_train, y_train)

# Validation performance
y_val_pred = rf.predict(X_val)
print(f"Validation MSE: {mean_squared_error(y_val, y_val_pred):.4f}")
print(f"Validation R²: {r2_score(y_val, y_val_pred):.4f}")

# Feature Importance
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

# Generate all possible 5-player combinations
player_ids = data['PLAYER']  # Keep PLAYER names for reference
player_features = X.values  # Features for all players
combinations_scores = []

# Evaluate all 5-player combinations
for team in combinations(range(len(player_ids)), 5):
    team_features = np.sum(player_features[list(team)], axis=0).reshape(1, -1)  # Sum features for team
    team_score = rf.predict(team_features)[0]  # Predict Impact Score for team
    combinations_scores.append((team, team_score))

# Find the best team
best_team = max(combinations_scores, key=lambda x: x[1])
best_team_ids = [player_ids.iloc[i] for i in best_team[0]]
best_team_score = best_team[1]

print("\nBest Team of 5 Players:")
print(best_team_ids)
print(f"Predicted Impact Score: {best_team_score:.2f}")


Validation MSE: 33.4351
Validation R²: 0.4446

Feature Importances:
     Feature  Importance
0      PCA_1    0.140303
1      PCA_2    0.111842
5      PCA_6    0.107797
12    PCA_13    0.104338
7      PCA_8    0.084455
11    PCA_12    0.076690
9     PCA_10    0.068811
6      PCA_7    0.065712
4      PCA_5    0.056915
3      PCA_4    0.053465
2      PCA_3    0.049523
10    PCA_11    0.040336
8      PCA_9    0.035424
13  clusters    0.004388
