In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("fifa21_raw_data_v2.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 77 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                18979 non-null  int64 
 1   Name              18979 non-null  object
 2   LongName          18979 non-null  object
 3   photoUrl          18979 non-null  object
 4   playerUrl         18979 non-null  object
 5   Nationality       18979 non-null  object
 6   Age               18979 non-null  int64 
 7   ↓OVA              18979 non-null  int64 
 8   POT               18979 non-null  int64 
 9   Club              18979 non-null  object
 10  Contract          18979 non-null  object
 11  Positions         18979 non-null  object
 12  Height            18979 non-null  object
 13  Weight            18979 non-null  object
 14  Preferred Foot    18979 non-null  object
 15  BOV               18979 non-null  int64 
 16  Best Position     18979 non-null  object
 17  Joined      

In [3]:
df.describe()

Unnamed: 0,ID,Age,↓OVA,POT,BOV,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,...,GK Positioning,GK Reflexes,Total Stats,Base Stats,PAC,SHO,PAS,DRI,DEF,PHY
count,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,...,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0,18979.0
mean,226403.384794,25.194109,65.718636,71.136414,66.751726,248.938142,49.688392,45.842405,51.942726,58.768112,...,16.217187,16.519627,1595.286949,355.702197,67.453975,53.457031,57.681016,62.87502,49.866221,64.368934
std,27141.054157,4.71052,6.968999,6.114635,6.747193,74.299428,18.131153,19.567081,17.294409,14.519106,...,17.002239,17.854079,269.874789,40.761117,10.677859,13.827425,10.081857,9.927415,16.443213,9.601883
min,41.0,16.0,47.0,47.0,48.0,42.0,6.0,3.0,5.0,7.0,...,2.0,2.0,747.0,232.0,25.0,16.0,25.0,25.0,12.0,28.0
25%,210135.0,21.0,61.0,67.0,62.0,222.0,38.0,30.0,44.0,54.0,...,8.0,8.0,1452.0,327.0,61.0,44.0,51.0,57.0,35.0,58.0
50%,232418.0,25.0,66.0,71.0,67.0,263.0,54.0,49.0,55.0,62.0,...,11.0,11.0,1627.0,356.0,68.0,56.0,58.0,64.0,53.0,65.0
75%,246922.5,29.0,70.0,75.0,71.0,297.0,63.0,62.0,64.0,68.0,...,14.0,14.0,1781.0,384.0,75.0,64.0,64.0,69.0,63.0,71.0
max,259216.0,53.0,93.0,95.0,93.0,437.0,94.0,95.0,93.0,94.0,...,91.0,90.0,2316.0,498.0,96.0,93.0,93.0,95.0,91.0,91.0


In [4]:
df.head()
list(df.isna().sum())

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 17966,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2595]

In [5]:
# Fixing Loan Date End
df["Loan Date End"].fillna("Not Present",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Loan Date End"].fillna("Not Present",inplace=True)


In [6]:

def convertHeight(height):
    # Handle missing values
    if pd.isna(height):
        return height
    # Ensure the value is a string for processing
    if not isinstance(height, str):
        height = str(height)
    # If height is provided in centimeters (e.g., "180cm")
    if 'cm' in height:
        try:
            return float(height.replace('cm', '').strip())
        except Exception as e:
            print(f"Error converting height '{height}': {e}")
            return np.nan
    # If height is provided in feet and inches (e.g., "5'11\"")
    elif "'" in height and '"' in height:
        try:
            parts = height.split("'")
            feet = float(parts[0].strip())
            inches = float(parts[1].replace('"', '').strip())
            # Convert feet and inches to centimeters:
            # 1 foot = 30.48 cm, 1 inch = 2.54 cm
            return feet * 30.48 + inches * 2.54
        except Exception as e:
            print(f"Error parsing height '{height}': {e}")
            return np.nan
    else:
        # Attempt to convert directly if no expected unit is found
        try:
            return float(height)
        except Exception as e:
            print(f"Unable to convert height '{height}': {e}")
            return np.nan

def convertWeight(weight):
    # Handle missing values
    if pd.isna(weight):
        return weight
    # Ensure the value is a string for processing
    if not isinstance(weight, str):
        weight = str(weight)
    # If weight is provided in kilograms (e.g., "70kg")
    if 'kg' in weight:
        try:
            return float(weight.replace('kg', '').strip())
        except Exception as e:
            print(f"Error converting weight '{weight}': {e}")
            return np.nan
    # If weight is provided in pounds (e.g., "154lbs"), convert to kg
    elif 'lbs' in weight:
        try:
            lbs = float(weight.replace('lbs', '').strip())
            # 1 lb = 0.453592 kg
            return lbs * 0.453592
        except Exception as e:
            print(f"Error converting weight from lbs '{weight}': {e}")
            return np.nan
    else:
        # Attempt to convert directly if no expected unit is found
        try:
            return float(weight)
        except Exception as e:
            print(f"Unable to convert weight '{weight}': {e}")
            return np.nan

In [7]:
df['Height'].unique()

array(['170cm', '187cm', '188cm', '181cm', '175cm', '184cm', '191cm',
       '178cm', '193cm', '185cm', '199cm', '173cm', '168cm', '176cm',
       '177cm', '183cm', '180cm', '189cm', '179cm', '195cm', '172cm',
       '182cm', '186cm', '192cm', '165cm', '194cm', '167cm', '196cm',
       '163cm', '190cm', '174cm', '169cm', '171cm', '197cm', '200cm',
       '166cm', '6\'2"', '164cm', '198cm', '6\'3"', '6\'5"', '5\'11"',
       '6\'4"', '6\'1"', '6\'0"', '5\'10"', '5\'9"', '5\'6"', '5\'7"',
       '5\'4"', '201cm', '158cm', '162cm', '161cm', '160cm', '203cm',
       '157cm', '156cm', '202cm', '159cm', '206cm', '155cm'], dtype=object)

In [8]:
df['Weight'].unique()

array(['72kg', '83kg', '87kg', '70kg', '68kg', '80kg', '71kg', '91kg',
       '73kg', '85kg', '92kg', '69kg', '84kg', '96kg', '81kg', '82kg',
       '75kg', '86kg', '89kg', '74kg', '76kg', '64kg', '78kg', '90kg',
       '66kg', '60kg', '94kg', '79kg', '67kg', '65kg', '59kg', '61kg',
       '93kg', '88kg', '97kg', '77kg', '62kg', '63kg', '95kg', '100kg',
       '58kg', '183lbs', '179lbs', '172lbs', '196lbs', '176lbs', '185lbs',
       '170lbs', '203lbs', '168lbs', '161lbs', '146lbs', '130lbs',
       '190lbs', '174lbs', '148lbs', '165lbs', '159lbs', '192lbs',
       '181lbs', '139lbs', '154lbs', '157lbs', '163lbs', '98kg', '103kg',
       '99kg', '102kg', '56kg', '101kg', '57kg', '55kg', '104kg', '107kg',
       '110kg', '53kg', '50kg', '54kg', '52kg'], dtype=object)

In [9]:
df['Height'] = df['Height'].apply(convertHeight)
df['Weight'] = df['Weight'].apply(convertWeight)

In [10]:
print(df.head())

       ID               Name                      LongName  \
0  158023           L. Messi                  Lionel Messi   
1   20801  Cristiano Ronaldo  C. Ronaldo dos Santos Aveiro   
2  200389           J. Oblak                     Jan Oblak   
3  192985       K. De Bruyne               Kevin De Bruyne   
4  190871          Neymar Jr    Neymar da Silva Santos Jr.   

                                           photoUrl  \
0  https://cdn.sofifa.com/players/158/023/21_60.png   
1  https://cdn.sofifa.com/players/020/801/21_60.png   
2  https://cdn.sofifa.com/players/200/389/21_60.png   
3  https://cdn.sofifa.com/players/192/985/21_60.png   
4  https://cdn.sofifa.com/players/190/871/21_60.png   

                                           playerUrl Nationality  Age  ↓OVA  \
0  http://sofifa.com/player/158023/lionel-messi/2...   Argentina   33    93   
1  http://sofifa.com/player/20801/c-ronaldo-dos-s...    Portugal   35    92   
2  http://sofifa.com/player/200389/jan-oblak/210006/    Slo

In [11]:
df['Height'].unique()

array([170.  , 187.  , 188.  , 181.  , 175.  , 184.  , 191.  , 178.  ,
       193.  , 185.  , 199.  , 173.  , 168.  , 176.  , 177.  , 183.  ,
       180.  , 189.  , 179.  , 195.  , 172.  , 182.  , 186.  , 192.  ,
       165.  , 194.  , 167.  , 196.  , 163.  , 190.  , 174.  , 169.  ,
       171.  , 197.  , 200.  , 166.  , 187.96, 164.  , 198.  , 190.5 ,
       195.58, 180.34, 193.04, 185.42, 182.88, 177.8 , 175.26, 167.64,
       170.18, 162.56, 201.  , 158.  , 162.  , 161.  , 160.  , 203.  ,
       157.  , 156.  , 202.  , 159.  , 206.  , 155.  ])

In [12]:
df['Weight'].unique()

array([ 72.      ,  83.      ,  87.      ,  70.      ,  68.      ,
        80.      ,  71.      ,  91.      ,  73.      ,  85.      ,
        92.      ,  69.      ,  84.      ,  96.      ,  81.      ,
        82.      ,  75.      ,  86.      ,  89.      ,  74.      ,
        76.      ,  64.      ,  78.      ,  90.      ,  66.      ,
        60.      ,  94.      ,  79.      ,  67.      ,  65.      ,
        59.      ,  61.      ,  93.      ,  88.      ,  97.      ,
        77.      ,  62.      ,  63.      ,  95.      , 100.      ,
        58.      ,  83.007336,  81.192968,  78.017824,  88.904032,
        79.832192,  83.91452 ,  77.11064 ,  92.079176,  76.203456,
        73.028312,  66.224432,  58.96696 ,  86.18248 ,  78.925008,
        67.131616,  74.84268 ,  72.121128,  87.089664,  82.100152,
        63.049288,  69.853168,  71.213944,  73.935496,  98.      ,
       103.      ,  99.      , 102.      ,  56.      , 101.      ,
        57.      ,  55.      , 104.      , 107.      , 110.   

In [13]:
# Fixing the star in W/F,SM,IR
df["W/F"] = df["W/F"].astype(str).str.replace("★", "").astype(int)
df["SM"] = df["SM"].astype(str).str.replace("★", "").astype(int)
df["IR"] = df["IR"].astype(str).str.replace("★", "").astype(int)

In [14]:
## Fixing the club data that contain new line returns
df["Club"] = df["Club"].astype(str).str.replace("\n", "")

In [15]:
def convertValue(value):
    if isinstance(value, str):  
        value = value.strip('€') 
        if 'M' in value:
            value = float(value.replace('M', '').strip()) * 1000000  
        elif 'K' in value:
            value = float(value.replace('K', '').strip()) * 1000  
        else:
            value = float(value)  
        return float(value)  


def contractLengthCalc():
    length = int(len(df.index))
    for i in range(0,length ):
        value = str(df.loc[i, 'Contract'].strip())
        condtionalCheck = str(df.loc[i,'Contract'][0]).strip().isdigit()
        if condtionalCheck:
            dateList = value.split("~")
            startDate = int(dateList[0])
            endDate = int(dateList[1])
            df.loc[i, 'Contract_Length'] = endDate - startDate
        else:
            df.loc[i, 'Contract_Length'] = -1
            
contractLengthCalc()


# Convert prefered foot to 1 or 2, left or right
# Left foot is a zero....right foot will be zero
df['Preferred Foot'].replace({'Left': 0, 'Right': 1})

  df['Preferred Foot'].replace({'Left': 0, 'Right': 1})


0        0
1        1
2        1
3        1
4        1
        ..
18974    1
18975    1
18976    1
18977    1
18978    0
Name: Preferred Foot, Length: 18979, dtype: int64

In [16]:
df['Value'] = df['Value'].apply(convertValue)
df['Wage'] = df['Wage'].apply(convertValue)
# df['Release Clause'] = df['Release Clause'].apply(convertValue)
df['Hits'] = df['Hits'].apply(convertValue)

df['Value'] = pd.to_numeric(df['Value'])
df['Wage'] = pd.to_numeric(df['Wage'])
df['Hits'] = pd.to_numeric(df['Hits'])
df['Contract_Length'] = pd.to_numeric(df['Contract_Length'])
df['A/W'].replace({'Low': 0, 'Medium': 1, 'High':2})
df['D/W'].replace({'Low': 0, 'Medium': 1, 'High':2})



  df['A/W'].replace({'Low': 0, 'Medium': 1, 'High':2})
  df['D/W'].replace({'Low': 0, 'Medium': 1, 'High':2})


0        0
1        0
2        1
3        2
4        1
        ..
18974    1
18975    1
18976    1
18977    1
18978    1
Name: D/W, Length: 18979, dtype: int64

In [17]:
df["W/F"] = df["W/F"].astype(str).str.replace("★", "").astype(int)
df["SM"] = df["SM"].astype(str).str.replace("★", "").astype(int)
df["IR"] = df["IR"].astype(str).str.replace("★", "").astype(int)

In [18]:
#ENCODING 

from sklearn.preprocessing import LabelEncoder

# Copy dataset to avoid modifying the original
df_processed = df.copy()

# Initialize dictionary to store label encoders for future inverse transformation
label_encoders = {}

# Apply Label Encoding to categorical features
for col in ['Club', 'Nationality', 'Preferred Foot', 'Best Position', 'A/W', 'D/W']:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])  
    label_encoders[col] = le  # Store encoder for later use

# Display new dataframe structure
df_processed.info()

# Display unique values of "A/W" and "D/W" after encoding
print("Encoded 'A/W' unique values:", df_processed['A/W'].unique())
print("Encoded 'D/W' unique values:", df_processed['D/W'].unique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                18979 non-null  int64  
 1   Name              18979 non-null  object 
 2   LongName          18979 non-null  object 
 3   photoUrl          18979 non-null  object 
 4   playerUrl         18979 non-null  object 
 5   Nationality       18979 non-null  int64  
 6   Age               18979 non-null  int64  
 7   ↓OVA              18979 non-null  int64  
 8   POT               18979 non-null  int64  
 9   Club              18979 non-null  int64  
 10  Contract          18979 non-null  object 
 11  Positions         18979 non-null  object 
 12  Height            18979 non-null  float64
 13  Weight            18979 non-null  float64
 14  Preferred Foot    18979 non-null  int64  
 15  BOV               18979 non-null  int64  
 16  Best Position     18979 non-null  int64 

In [19]:
# ENCODING - positions 

# Convert "Positions" column into multiple binary (0/1) columns
df_processed.rename(columns=lambda x: x.strip(), inplace=True)

# Check if 'Positions' exists
if "Positions" in df_processed.columns:
    df_processed['Positions'] = df_processed['Positions'].astype(str)  # Ensure it's a string
    df_processed['Positions'] =   df_processed['Positions'].fillna("")# Fill missing values
    positions_split = df_processed['Positions'].str.get_dummies(sep=', ')
    df_processed = pd.concat([df_processed, positions_split], axis=1)
else:
    print("Column 'Positions' not found in DataFrame")

# Display new dataframe structure
df_processed.info()
print(df_processed.columns)

df_processed.to_csv('newdataidk.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 93 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                18979 non-null  int64  
 1   Name              18979 non-null  object 
 2   LongName          18979 non-null  object 
 3   photoUrl          18979 non-null  object 
 4   playerUrl         18979 non-null  object 
 5   Nationality       18979 non-null  int64  
 6   Age               18979 non-null  int64  
 7   ↓OVA              18979 non-null  int64  
 8   POT               18979 non-null  int64  
 9   Club              18979 non-null  int64  
 10  Contract          18979 non-null  object 
 11  Positions         18979 non-null  object 
 12  Height            18979 non-null  float64
 13  Weight            18979 non-null  float64
 14  Preferred Foot    18979 non-null  int64  
 15  BOV               18979 non-null  int64  
 16  Best Position     18979 non-null  int64 

In [76]:
common_features = ['Age', 'Reactions', 'Composure', 'Strength', 'Height', 'Weight',
    'Wage', 'Contract_Length', 'Club', 'Nationality', 'Hits', 'Preferred Foot', 'Jumping']

goalkeeper_features = ['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']

outfield_features = [ 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'Finishing', 'Dribbling', 'Short Passing',
    'Long Passing', 'Ball Control', 'Acceleration', 'Sprint Speed', 'Agility', 'Balance',
    'Vision', 'Stamina', 'Crossing', 'Curve', 'Shot Power', 'Long Shots',
    'Interceptions', 'Positioning', 'Marking', 'Standing Tackle', 'Sliding Tackle',
    'Volleys', 'FK Accuracy', 'Penalties'] 


# Drop only the columns that exist in the DataFrame
cols_to_drop = ['ID', 'Name', 'LongName', 'photoUrl', 'playerUrl', 'Positions', 'Contract', 'Joined', 'Loan Date End'] 


# Check which columns are actually in the DataFrame before dropping
existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
df = df_processed.drop(columns=existing_cols_to_drop)

# Proceed with defining features
features = common_features + goalkeeper_features + outfield_features

print(features)

# Ensure 'Value' is not included in features
features = list(dict.fromkeys(features))  # removes duplicates while preserving order
if 'Value' in features:
    features.remove('Value')
    

df.info()

#Scaling the features
scaler = MinMaxScaler()
df_scaled = df_processed.copy()
df_scaled[features] = scaler.fit_transform(df_scaled[features])
target = 'Value'
df_scaled = df_scaled[features + [target]].dropna() 

expected_cols = common_features + outfield_features + [target]
missing_cols = [col for col in expected_cols if col not in df_scaled.columns]
print("Missing columns:", missing_cols)

['Age', 'Reactions', 'Composure', 'Strength', 'Height', 'Weight', 'Wage', 'Contract_Length', 'Club', 'Nationality', 'Hits', 'Preferred Foot', 'Jumping', 'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'Finishing', 'Dribbling', 'Short Passing', 'Long Passing', 'Ball Control', 'Acceleration', 'Sprint Speed', 'Agility', 'Balance', 'Vision', 'Stamina', 'Crossing', 'Curve', 'Shot Power', 'Long Shots', 'Interceptions', 'Positioning', 'Marking', 'Standing Tackle', 'Sliding Tackle', 'Volleys', 'FK Accuracy', 'Penalties']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 93 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                18979 non-null  int64  
 1   Name              18979 non-null  object 
 2   LongName          18979 non-null  object 
 3   photoUrl          18979 non-null  object 
 4   playerUrl       

In [79]:
#goalkeeper model 

gk_df = df[df['GK'] == 1].copy()
gk_df = gk_df[common_features + goalkeeper_features + [target]].dropna()


X = gk_df[common_features + goalkeeper_features]
y = gk_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# classification maybe with ova??????
goalkeeper_feature_importance_df = pd.DataFrame({
    
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(goalkeeper_feature_importance_df)


Mean Squared Error: 2165614320475.83
Root Mean Squared Error: 1471602.64
R² Score: 0.92

Feature Importance:
            Feature  Importance
17      GK Reflexes    0.316347
13        GK Diving    0.159806
1         Reactions    0.110624
6              Wage    0.107059
14      GK Handling    0.085587
16   GK Positioning    0.066265
0               Age    0.054488
10             Hits    0.046278
4            Height    0.009497
2         Composure    0.008675
12          Jumping    0.006772
7   Contract_Length    0.006127
8              Club    0.004873
15       GK Kicking    0.004748
3          Strength    0.004523
5            Weight    0.004375
9       Nationality    0.003470
11   Preferred Foot    0.000486


In [81]:
# #outfield model 
# df.info()

# # df_scaled = df.copy()
# # df_scaled[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# df_scaled = df_scaled[common_features + outfield_features + [target]].dropna()

# X = df_scaled[common_features + outfield_features]
# y = df_scaled[target]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model = RandomForestRegressor(random_state=42)
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error: {mse:.2f}")
# print(f"Root Mean Squared Error: {rmse:.2f}")
# print(f"R² Score: {r2:.2f}")

# # classification maybe with ova??????
# importances = model.feature_importances_
# feature_importance_df = pd.DataFrame({
#     'Feature': features,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)

# print("\nFeature Importance:")
# print(feature_importance_df)
player_df = df[df['GK'] == 0].copy()
player_df = player_df[common_features + goalkeeper_features + [target]].dropna()


X = player_df[common_features + goalkeeper_features]
y = player_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

player_model = RandomForestRegressor(random_state=42)
player_model.fit(X_train, y_train)

y_pred = player_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# classification maybe with ova??????
player_feature_importance_df = pd.DataFrame({
    
    'Feature': X.columns,
    'Importance': player_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(player_feature_importance_df)

Mean Squared Error: 7729956605800.81
Root Mean Squared Error: 2780279.95
R² Score: 0.89

Feature Importance:
            Feature  Importance
6              Wage    0.521534
1         Reactions    0.221649
10             Hits    0.136703
0               Age    0.032752
2         Composure    0.012064
7   Contract_Length    0.008710
9       Nationality    0.008100
3          Strength    0.007957
14      GK Handling    0.006992
13        GK Diving    0.006489
8              Club    0.006254
5            Weight    0.005827
12          Jumping    0.005726
4            Height    0.005122
17      GK Reflexes    0.004502
16   GK Positioning    0.004197
15       GK Kicking    0.004076
11   Preferred Foot    0.001347
