In [58]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [59]:


# Load the dataset
df = pd.read_csv("updated_player_data.csv")

In [60]:
# Clean and convert Wage column
df['Wage'] = df['Wage'].replace('[\€,K]', '', regex=True)  
df['Wage'] = pd.to_numeric(df['Wage'], errors='coerce')   

In [61]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 ID                      0
Name                    0
Age                     0
Nationality             0
Overall                 0
                     ... 
Goalkeeping             0
AllStats                0
Days Joined          1264
AttackingWorkRate       0
DefensiveWorkRate       0
Length: 64, dtype: int64


In [62]:
df.drop(columns=['Days Joined'], inplace=True)
df.drop(columns=['ID'], inplace=True)


In [63]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 Name                 0
Age                  0
Nationality          0
Overall              0
Potential            0
                    ..
Defending            0
Goalkeeping          0
AllStats             0
AttackingWorkRate    0
DefensiveWorkRate    0
Length: 62, dtype: int64


In [64]:
# Select only numeric columns
df = df.select_dtypes(include=['number'])

In [65]:
# Check dataset 
print("Dataset shape after preprocessing:", df.shape)
print("First few rows:\n", df.head())

Dataset shape after preprocessing: (17918, 55)
First few rows:
    Age  Overall  Potential        Value      Wage  Preferred Foot  \
0   31       94         94  110500000.0  565000.0               0   
1   33       94         94   77000000.0  405000.0               1   
2   26       92         93  118500000.0  290000.0               1   
3   27       91         93   72000000.0  260000.0               1   
4   27       91         92  102000000.0  355000.0               1   

   International Reputation  Weak Foot  Skill Moves  Height  ...  Attacking  \
0                       5.0        4.0          4.0    67.0  ...      425.0   
1                       5.0        4.0          5.0    74.0  ...      435.0   
2                       5.0        5.0          5.0    69.0  ...      396.0   
3                       4.0        3.0          1.0    76.0  ...      114.0   
4                       4.0        5.0          4.0    71.0  ...      404.0   

   Skill  Movement  Power  Mentality  Defendin

In [66]:
# Define target and features
target_column = 'Wage'
features_columns = [col for col in df.columns if col != target_column]

x = df[features_columns]
y = df[target_column]

In [67]:
# Split data: 80% training, 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=38)

In [68]:
# Initialize and train the model
model = LinearRegression()
model.fit(x_train, y_train)





In [69]:
# Print model coefficients
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Model Coefficients: [ 2.83502083e+02 -4.60186350e+01  7.18785014e+01  2.89036227e-03
  6.85103057e+01  9.75420042e+03 -9.33796526e+01 -7.66639027e+02
  1.41526878e+02  1.59831712e+01  1.94093537e+01 -1.14236680e+01
  2.24966839e+01 -1.14842230e+01 -1.62294385e+01  2.49913574e+01
  1.24272614e+01 -4.75843484e+01 -4.05603501e+01  4.92757414e+01
 -1.91991675e+01  2.76263770e+01  3.01638990e+00 -3.68939769e+01
  2.18410333e+01  2.95014052e+01  1.20777759e+01 -4.22470803e+01
 -2.33751736e+01  1.39736284e+01 -4.50461221e+00 -1.54990877e+00
  2.68376294e+00 -6.24068541e+00  2.30070217e+01 -1.23019927e+01
 -2.31840564e+01 -3.58862346e+01  6.83709161e+01 -1.21154383e-01
  4.43252512e+01  5.00974749e+00 -6.55788562e+01  2.11677317e+01
  2.76870808e+00 -1.45033829e+00 -3.60934410e+00 -1.00694445e+01
  1.09358554e+00  9.30062515e+00  4.80271971e+00  2.83651160e+00
  1.16587069e+02  6.74345082e+02]
Model Intercept: -31708.797172604405


In [70]:
# Predict on the test data
y_pred = model.predict(x_test)

In [71]:
# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

# Compare a few predictions vs actual values
comparison_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print(comparison_df.head())

Mean Squared Error (MSE): 136263713.95430303
R-squared (R2): 0.7432095486658985
    Actual    Predicted
0   1000.0   725.179586
1  22000.0  4576.723875
2   3000.0  3295.420983
3   1000.0   508.632077
4   1000.0  5991.088650


In [72]:
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17918 entries, 0 to 17917
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       17918 non-null  int64  
 1   Overall                   17918 non-null  int64  
 2   Potential                 17918 non-null  int64  
 3   Value                     17918 non-null  float64
 4   Wage                      17918 non-null  float64
 5   Preferred Foot            17918 non-null  int64  
 6   International Reputation  17918 non-null  float64
 7   Weak Foot                 17918 non-null  float64
 8   Skill Moves               17918 non-null  float64
 9   Height                    17918 non-null  float64
 10  Weight                    17918 non-null  float64
 11  Crossing                  17918 non-null  float64
 12  Finishing                 17918 non-null  float64
 13  HeadingAccuracy           17918 non-null  float64
 14  ShortP

In [73]:
# Load and preprocess data
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x) 

# Split data
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


In [None]:
# Initialize and train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=500)
rf_model.fit(x_train, y_train)

# Predict and evaluate
rf_pred = rf_model.predict(x_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest Results")
print("MSE:", rf_mse)
print("R² Score:", rf_r2)


Random Forest Results
MSE: 117226268.57366072
R² Score: 0.8050285259770501


In [75]:
# Define a simple Neural Network
nn_model = Sequential([
    Dense(64, activation='relu', input_dim=x_train.shape[1]),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = nn_model.fit(x_train, y_train, epochs=500, batch_size=32, validation_data=(x_test, y_test), verbose=1)

# Predict and evaluate
nn_pred = nn_model.predict(x_test).flatten()
nn_mse = mean_squared_error(y_test, nn_pred)
nn_r2 = r2_score(y_test, nn_pred)

print("Neural Network Results")
print("MSE:", nn_mse)
print("R² Score:", nn_r2)


Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 552795456.0000 - mae: 9672.7432 - val_loss: 566245632.0000 - val_mae: 7615.5010
Epoch 2/500
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 363498272.0000 - mae: 6934.8066 - val_loss: 409417952.0000 - val_mae: 7860.0991
Epoch 3/500
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 296358464.0000 - mae: 7719.1929 - val_loss: 375143360.0000 - val_mae: 7776.9214
Epoch 4/500
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 281757216.0000 - mae: 7547.3965 - val_loss: 344496320.0000 - val_mae: 7329.8843
Epoch 5/500
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 215325712.0000 - mae: 6852.3242 - val_loss: 311997984.0000 - val_mae: 7042.5088
Epoch 6/500
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 204224752.0000 - mae: 6552.4087 - val_loss

In [76]:
# Initialize and train XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=500)
xgb_model.fit(x_train, y_train)

# Predict and evaluate
xgb_pred = xgb_model.predict(x_test)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

print("XGBoost Results")
print("MSE:", xgb_mse)
print("R² Score:", xgb_r2)

XGBoost Results
MSE: 111348308.00296648
R² Score: 0.8148047873104637


In [77]:
results = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost', 'Neural Network'],
    'MSE': [rf_mse, xgb_mse, nn_mse],
    'R² Score': [rf_r2, xgb_r2, nn_r2]
})

print(results)

            Model           MSE  R² Score
0   Random Forest  1.172263e+08  0.805029
1         XGBoost  1.113483e+08  0.814805
2  Neural Network  1.066936e+08  0.822546
