In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:


# Load the dataset
df = pd.read_csv("updated_player_data.csv")

In [3]:
# Clean and convert Wage column
df['Wage'] = df['Wage'].replace('[\€,K]', '', regex=True)  
df['Wage'] = pd.to_numeric(df['Wage'], errors='coerce')   

In [4]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 ID                      0
Name                    0
Age                     0
Nationality             0
Overall                 0
                     ... 
Goalkeeping             0
AllStats                0
Days Joined          1264
AttackingWorkRate       0
DefensiveWorkRate       0
Length: 64, dtype: int64


In [5]:
df.drop(columns=['Days Joined'], inplace=True)


In [6]:
# Select only numeric columns
df = df.select_dtypes(include=['number'])

In [7]:
# Check dataset 
print("Dataset shape after preprocessing:", df.shape)
print("First few rows:\n", df.head())

Dataset shape after preprocessing: (17918, 56)
First few rows:
        ID  Age  Overall  Potential        Value      Wage  Preferred Foot  \
0  158023   31       94         94  110500000.0  565000.0               0   
1   20801   33       94         94   77000000.0  405000.0               1   
2  190871   26       92         93  118500000.0  290000.0               1   
3  193080   27       91         93   72000000.0  260000.0               1   
4  192985   27       91         92  102000000.0  355000.0               1   

   International Reputation  Weak Foot  Skill Moves  ...  Attacking  Skill  \
0                       5.0        4.0          4.0  ...      425.0  467.0   
1                       5.0        4.0          5.0  ...      435.0  416.0   
2                       5.0        5.0          5.0  ...      396.0  444.0   
3                       4.0        3.0          1.0  ...      114.0  151.0   
4                       4.0        5.0          4.0  ...      404.0  436.0   

   M

In [8]:
# Define target and features
target_column = 'Wage'
features_columns = [col for col in df.columns if col != target_column]

x = df[features_columns]
y = df[target_column]

In [9]:
# Split data: 80% training, 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=38)

In [10]:
# Initialize and train the model
model = LinearRegression()
model.fit(x_train, y_train)





In [11]:
# Print model coefficients
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Model Coefficients: [-1.99825435e-02  1.82525445e+02 -1.78257079e+01  4.59427568e+01
  2.89990541e-03  6.67569274e+01  9.48222840e+03 -9.73889105e+01
 -7.69331765e+02  1.20036325e+02  1.60692654e+01  1.70781597e+01
 -1.12311953e+01  2.33700065e+01 -8.78053581e+00 -1.74279987e+01
  2.98467347e+01  1.12739472e+01 -4.68068950e+01 -4.15229121e+01
  4.54599635e+01 -1.72759107e+01  2.96558419e+01 -4.25124928e-01
 -3.52614665e+01  1.97023483e+01  2.88188507e+01  1.04598781e+01
 -4.00678031e+01 -2.39681409e+01  1.47361309e+01 -5.62757896e+00
 -3.25660684e+00  4.14490771e+00 -4.28181571e+00  2.45293261e+01
 -1.53476759e+01 -2.47808675e+01 -3.60867381e+01  7.12693380e+01
  2.79464459e-01  4.26785380e+01  5.80526881e+00 -6.72698473e+01
  2.26633099e+01  3.00843631e+00 -1.74916165e+00 -3.60431181e+00
 -1.00210843e+01  1.60556446e-01  1.04017324e+01  4.15673392e+00
  2.35290131e+00  1.25127054e+02  6.74891443e+02]
Model Intercept: -22089.95942706528


In [12]:
# Predict on the test data
y_pred = model.predict(x_test)

In [13]:
# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

# Compare a few predictions vs actual values
comparison_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print(comparison_df.head())

Mean Squared Error (MSE): 136435571.02454534
R-squared (R2): 0.7428856821474263
    Actual    Predicted
0   1000.0   856.920490
1  22000.0  4822.655886
2   3000.0  3011.175792
3   1000.0   483.505411
4   1000.0  5904.255539


In [14]:
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17918 entries, 0 to 17917
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        17918 non-null  int64  
 1   Age                       17918 non-null  int64  
 2   Overall                   17918 non-null  int64  
 3   Potential                 17918 non-null  int64  
 4   Value                     17918 non-null  float64
 5   Wage                      17918 non-null  float64
 6   Preferred Foot            17918 non-null  int64  
 7   International Reputation  17918 non-null  float64
 8   Weak Foot                 17918 non-null  float64
 9   Skill Moves               17918 non-null  float64
 10  Height                    17918 non-null  float64
 11  Weight                    17918 non-null  float64
 12  Crossing                  17918 non-null  float64
 13  Finishing                 17918 non-null  float64
 14  Headin

In [15]:
# Load and preprocess data
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x) 

# Split data
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


In [16]:
# Initialize and train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Predict and evaluate
rf_pred = rf_model.predict(x_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest Results")
print("MSE:", rf_mse)
print("R² Score:", rf_r2)


Random Forest Results
MSE: 119116479.52008928
R² Score: 0.8018847151322315


In [17]:
# Define a simple Neural Network
nn_model = Sequential([
    Dense(64, activation='relu', input_dim=x_train.shape[1]),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = nn_model.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test), verbose=1)

# Predict and evaluate
nn_pred = nn_model.predict(x_test).flatten()
nn_mse = mean_squared_error(y_test, nn_pred)
nn_r2 = r2_score(y_test, nn_pred)

print("Neural Network Results")
print("MSE:", nn_mse)
print("R² Score:", nn_r2)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 549607168.0000 - mae: 9729.0889 - val_loss: 540859456.0000 - val_mae: 7393.6045
Epoch 2/50
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 356854528.0000 - mae: 7008.0371 - val_loss: 406474048.0000 - val_mae: 7871.7803
Epoch 3/50
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 259196720.0000 - mae: 7568.7705 - val_loss: 373940576.0000 - val_mae: 7823.6021
Epoch 4/50
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 276538336.0000 - mae: 7629.6162 - val_loss: 344480064.0000 - val_mae: 7276.6055
Epoch 5/50
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 236882048.0000 - mae: 6862.9683 - val_loss: 313046080.0000 - val_mae: 6866.3501
Epoch 6/50
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 202679760.0000 - mae: 6419.3247 - val_loss: 279

In [18]:
results = pd.DataFrame({
    'Model': ['Random Forest', 'Neural Network'],
    'MSE': [rf_mse, nn_mse],
    'R² Score': [rf_r2, nn_r2]
})

print(results)


            Model           MSE  R² Score
0   Random Forest  1.191165e+08  0.801885
1  Neural Network  1.192541e+08  0.801656


In [None]:
from xgboost import XGBRegressor
# Initialize and train XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
xgb_pred = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

print("XGBoost Results")
print("MSE:", xgb_mse)
print("R² Score:", xgb_r2)

Missing values per column:
 ID                      0
Name                    0
Age                     0
Nationality             0
Overall                 0
                     ... 
Goalkeeping             0
AllStats                0
Days Joined          1264
AttackingWorkRate       0
DefensiveWorkRate       0
Length: 64, dtype: int64
