### **ML Project- Position prediction for football players**

In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,id,short_name,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,club_name,...,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,position
0,216302,E. García,71,71,1400000.0,10000,1989-12-28,176,73,Club Atlético de San Luis,...,65,66,65,14,11,12,12,12,,LB
1,237867,D. Cancola,65,71,1000000.0,2000,1996-10-23,183,73,Ross County FC,...,65,61,58,10,13,7,6,11,,LDM
2,253472,E. Kahl,65,77,1600000.0,2000,2001-09-27,178,69,Aarhus GF,...,60,58,59,10,10,8,10,11,,LWB
3,223994,S. Mugoša,72,72,2300000.0,5000,1992-02-26,188,81,Incheon United FC,...,16,22,19,16,15,13,8,9,,LS
4,251635,A. Țigănașu,65,65,525000.0,3000,1990-06-12,179,74,FC Botoşani,...,64,61,58,12,5,11,12,15,,LB


In [4]:
df_train.position.value_counts()

position
RCB    631
GK     631
LCB    631
RB     463
LB     463
ST     428
RCM    423
LCM    423
RM     369
LM     369
CAM    263
RDM    201
LS     201
RS     201
LDM    201
CB     167
RW     166
LW     166
CDM    151
LWB    104
RWB    104
CM      75
LF      32
RF      32
Name: count, dtype: int64

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6895 entries, 0 to 6894
Data columns (total 70 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           6895 non-null   int64  
 1   short_name                   6895 non-null   object 
 2   overall                      6895 non-null   int64  
 3   potential                    6895 non-null   int64  
 4   value_eur                    6893 non-null   float64
 5   wage_eur                     6895 non-null   int64  
 6   birthday_date                6895 non-null   object 
 7   height_cm                    6895 non-null   int64  
 8   weight_kg                    6895 non-null   int64  
 9   club_name                    6895 non-null   object 
 10  league_name                  6895 non-null   object 
 11  league_level                 6895 non-null   int64  
 12  club_jersey_number           6895 non-null   int64  
 13  club_loaned_from  

## Categorical Data Preprocessing

In [6]:
df_train.select_dtypes(include=['object', 'category']).columns

Index(['short_name', 'birthday_date', 'club_name', 'league_name',
       'club_loaned_from', 'club_joined', 'nationality_name', 'preferred_foot',
       'work_rate', 'body_type', 'real_face', 'player_tags', 'player_traits',
       'position'],
      dtype='object')

In [7]:
duplicates = df_train[df_train.duplicated()]

In [8]:
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

In [9]:
df_train.club_joined.dtypes

dtype('O')

In [10]:
df_train.select_dtypes(include=['object', 'category']).columns

Index(['short_name', 'birthday_date', 'club_name', 'league_name',
       'club_loaned_from', 'club_joined', 'nationality_name', 'preferred_foot',
       'work_rate', 'body_type', 'real_face', 'player_tags', 'player_traits',
       'position'],
      dtype='object')

In [11]:
df_train['birthday_date'] = pd.to_datetime(df_train['birthday_date']).dt.date
df_test['birthday_date'] = pd.to_datetime(df_test['birthday_date']).dt.date

In [12]:
# df_train['club_joined'] = pd.to_datetime(df_train['club_joined']).dt.date

In [13]:
# df_train['club_contract_valid_until'] = pd.to_datetime(df_train['club_contract_valid_until'])
df_train['club_joined'] = pd.to_datetime(df_train['club_joined'])
df_test['club_joined'] = pd.to_datetime(df_test['club_joined'])

In [14]:
# df_train['club_contract_valid_until'].dt.year

In [15]:
# Calculate the playing experience (in days, years, or months)
df_train['playing_experience'] = df_train['club_contract_valid_until'] - df_train['club_joined'].dt.year
df_test['playing_experience'] = df_test['club_contract_valid_until'] - df_test['club_joined'].dt.year

In [16]:
df_train['playing_experience']

0       5.0
1       1.0
2       5.0
3       5.0
4       3.0
       ... 
6890    2.0
6891    3.0
6892    3.0
6893    7.0
6894    1.0
Name: playing_experience, Length: 6895, dtype: float64

In [17]:
# Calculate age
from datetime import datetime

today = pd.Timestamp(datetime.today())
df_train['Age'] = df_train['birthday_date'].apply(lambda dob: today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day)))
df_test['Age'] = df_test['birthday_date'].apply(lambda dob: today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day)))

In [18]:
# Split the 'work_rate' column into two new columns
df_train[['attacking_work_rate', 'defensive_work_rate']] = df_train['work_rate'].str.split('/', expand=True)
df_test[['attacking_work_rate', 'defensive_work_rate']] = df_test['work_rate'].str.split('/', expand=True)

# Now check the first few rows to verify
print(df_train[['attacking_work_rate', 'defensive_work_rate']].head())

  attacking_work_rate defensive_work_rate
0              Medium                High
1              Medium              Medium
2                High              Medium
3                High              Medium
4              Medium                High


In [19]:
work_rate_map = {'Low': 1, 'Medium': 2, 'High': 3}

# Replace the values in both 'attacking_work_rate' and 'defensive_work_rate' columns
df_train['attacking_work_rate'] = df_train['attacking_work_rate'].replace(work_rate_map)
df_train['defensive_work_rate'] = df_train['defensive_work_rate'].replace(work_rate_map)

# Replace the values in both 'attacking_work_rate' and 'defensive_work_rate' columns
df_test['attacking_work_rate'] = df_test['attacking_work_rate'].replace(work_rate_map)
df_test['defensive_work_rate'] = df_test['defensive_work_rate'].replace(work_rate_map)

  df_train['attacking_work_rate'] = df_train['attacking_work_rate'].replace(work_rate_map)
  df_train['defensive_work_rate'] = df_train['defensive_work_rate'].replace(work_rate_map)
  df_test['attacking_work_rate'] = df_test['attacking_work_rate'].replace(work_rate_map)
  df_test['defensive_work_rate'] = df_test['defensive_work_rate'].replace(work_rate_map)


In [20]:
print(df_train[['attacking_work_rate', 'defensive_work_rate']].head())

   attacking_work_rate  defensive_work_rate
0                    2                    3
1                    2                    2
2                    3                    2
3                    3                    2
4                    2                    3


In [21]:
df_train.drop(['goalkeeping_speed', 'player_tags', 'nation_jersey_number', 'club_loaned_from', 'club_contract_valid_until', 'birthday_date', 'club_joined',
              'work_rate','short_name', 'nationality_name', 'league_name', 'nation_jersey_number', 'player_traits', 'club_name' ], axis=1, inplace=True)
df_test.drop(['goalkeeping_speed', 'player_tags', 'nation_jersey_number', 'club_loaned_from', 'club_contract_valid_until', 'club_joined', 'birthday_date',
              'work_rate', 'short_name', 'nationality_name', 'league_name', 'nation_jersey_number', 'player_traits', 'club_name'
             ], axis=1, inplace=True)


In [22]:
df_train.columns

Index(['id', 'overall', 'potential', 'value_eur', 'wage_eur', 'height_cm',
       'weight_kg', 'league_level', 'club_jersey_number', 'preferred_foot',
       'weak_foot', 'skill_moves', 'international_reputation', 'body_type',
       'real_face', 'release_clause_eur', 'pace', 'shooting', 'passing',
       'dribbling', 'defending', 'physic', 'attacking_crossing',
       'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys', 'skill_dribbling',
       'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
       'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed',
       'movement_agility', 'movement_reactions', 'movement_balance',
       'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
       'power_long_shots', 'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awarene

In [23]:
categorical_columns = ['preferred_foot', 'body_type', 'real_face']
df_train = pd.get_dummies(df_train, columns=categorical_columns)
df_test = pd.get_dummies(df_test, columns=categorical_columns)

In [24]:
print(df_train.columns)


Index(['id', 'overall', 'potential', 'value_eur', 'wage_eur', 'height_cm',
       'weight_kg', 'league_level', 'club_jersey_number', 'weak_foot',
       'skill_moves', 'international_reputation', 'release_clause_eur', 'pace',
       'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'de

In [25]:
df_train.columns[df_train.isnull().any()]

Index(['value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing',
       'dribbling', 'defending', 'physic', 'playing_experience'],
      dtype='object')

In [26]:
# List of columns to check
columns_to_check = ['value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing',
                    'dribbling', 'defending', 'physic', 'playing_experience']

# Calculate the percentage of missing values for each specified column
missing_percentage = df_train[columns_to_check].isnull().mean() * 100

# Display the results
print(missing_percentage)

value_eur             0.029007
release_clause_eur    5.119652
pace                  9.151559
shooting              9.151559
passing               9.151559
dribbling             9.151559
defending             9.151559
physic                9.151559
playing_experience    5.090645
dtype: float64


In [27]:
# List of columns to check
columns_to_check = ['value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing',
                    'dribbling', 'defending', 'physic', 'playing_experience']

# Calculate the percentage of missing values for each specified column
missing_percentage = df_train[columns_to_check].describe()

# Display the results
print(missing_percentage)

          value_eur  release_clause_eur         pace     shooting  \
count  6.893000e+03        6.542000e+03  6264.000000  6264.000000   
mean   4.320388e+06        8.148991e+06    68.535121    54.300128   
std    1.052554e+07        2.056057e+07    11.777003    14.386234   
min    2.500000e+04        3.900000e+04    28.000000    18.000000   
25%    7.250000e+05        1.100000e+06    62.000000    43.000000   
50%    1.400000e+06        2.300000e+06    70.000000    57.000000   
75%    2.900000e+06        5.400000e+06    77.000000    65.000000   
max    1.940000e+08        3.735000e+08    97.000000    94.000000   

           passing    dribbling    defending       physic  playing_experience  
count  6264.000000  6264.000000  6264.000000  6264.000000         6544.000000  
mean     60.068487    64.570243    55.549968    68.293582            3.677262  
std       9.603023     9.380977    15.616228     8.548078            2.446817  
min      25.000000    29.000000    16.000000    31.000000 

In [28]:
columns_m = ['value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing',
                    'dribbling', 'defending', 'physic', 'playing_experience']

# Fill missing values in these columns with the mean of each column
for col in columns_m:
    df_train[col].fillna(df_train[col].mean(), inplace=True)
    df_test[col].fillna(df_test[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(df_test[col].mean(), inplace=True)


In [29]:
df_train.shape

(6895, 72)

In [30]:
# df_train.dropna( axis=0, inplace=True)
# df_test.dropna( axis=0, inplace=True)

In [31]:
df_test.shape

(767, 71)

In [32]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6895 entries, 0 to 6894
Data columns (total 72 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           6895 non-null   int64  
 1   overall                      6895 non-null   int64  
 2   potential                    6895 non-null   int64  
 3   value_eur                    6895 non-null   float64
 4   wage_eur                     6895 non-null   int64  
 5   height_cm                    6895 non-null   int64  
 6   weight_kg                    6895 non-null   int64  
 7   league_level                 6895 non-null   int64  
 8   club_jersey_number           6895 non-null   int64  
 9   weak_foot                    6895 non-null   int64  
 10  skill_moves                  6895 non-null   int64  
 11  international_reputation     6895 non-null   int64  
 12  release_clause_eur           6895 non-null   float64
 13  pace              

In [33]:
df_train.preferred_foot_Left

0       False
1       False
2        True
3       False
4        True
        ...  
6890    False
6891    False
6892     True
6893     True
6894    False
Name: preferred_foot_Left, Length: 6895, dtype: bool

In [34]:
X_train = df_train.drop('position', axis=1)  # Features
y_train = df_train['position']              # Target

In [35]:
X_train.drop(columns='id', inplace=True)
df_test1 = df_test.drop(columns='id')
X_train

Unnamed: 0,overall,potential,value_eur,wage_eur,height_cm,weight_kg,league_level,club_jersey_number,weak_foot,skill_moves,...,body_type_Lean (185+),body_type_Normal (170-),body_type_Normal (170-185),body_type_Normal (185+),body_type_Stocky (170-),body_type_Stocky (170-185),body_type_Stocky (185+),body_type_Unique,real_face_No,real_face_Yes
0,71,71,1400000.0,10000,176,73,1,29,5,3,...,False,False,True,False,False,False,False,False,True,False
1,65,71,1000000.0,2000,183,73,1,4,3,2,...,False,False,False,False,False,False,False,False,True,False
2,65,77,1600000.0,2000,178,69,1,19,3,3,...,False,False,True,False,False,False,False,False,True,False
3,72,72,2300000.0,5000,188,81,1,9,3,3,...,True,False,False,False,False,False,False,False,True,False
4,65,65,525000.0,3000,179,74,1,30,2,2,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6890,59,66,425000.0,2000,180,65,4,17,3,2,...,False,False,True,False,False,False,False,False,True,False
6891,83,83,28500000.0,61000,183,80,1,15,4,4,...,False,False,True,False,False,False,False,False,False,True
6892,70,80,3400000.0,6000,177,73,1,25,3,2,...,False,False,True,False,False,False,False,False,True,False
6893,71,71,1300000.0,7000,176,70,1,19,3,2,...,False,False,False,False,False,False,False,False,True,False


In [36]:
X1_test= df_test1

We are using SMOTE (Synthetic Minority Oversampling Technique) to deal with the data imbalance. It's used to address class imbalance in classification problems (works by creating synthetic samples for the minority class rather than duplicating them). 

In [37]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_smote.value_counts())


Before SMOTE: position
RCB    631
GK     631
LCB    631
RB     463
LB     463
ST     428
RCM    423
LCM    423
RM     369
LM     369
CAM    263
RDM    201
LS     201
RS     201
LDM    201
CB     167
RW     166
LW     166
CDM    151
LWB    104
RWB    104
CM      75
LF      32
RF      32
Name: count, dtype: int64
After SMOTE: position
LB     631
LDM    631
RF     631
LF     631
RWB    631
LM     631
RS     631
ST     631
CB     631
CAM    631
RDM    631
CDM    631
RCB    631
RB     631
LCB    631
LW     631
RCM    631
RM     631
LCM    631
GK     631
RW     631
LS     631
LWB    631
CM     631
Name: count, dtype: int64


In [38]:
X_train_smote.head()

Unnamed: 0,overall,potential,value_eur,wage_eur,height_cm,weight_kg,league_level,club_jersey_number,weak_foot,skill_moves,...,body_type_Lean (185+),body_type_Normal (170-),body_type_Normal (170-185),body_type_Normal (185+),body_type_Stocky (170-),body_type_Stocky (170-185),body_type_Stocky (185+),body_type_Unique,real_face_No,real_face_Yes
0,71,71,1400000.0,10000,176,73,1,29,5,3,...,False,False,True,False,False,False,False,False,True,False
1,65,71,1000000.0,2000,183,73,1,4,3,2,...,False,False,False,False,False,False,False,False,True,False
2,65,77,1600000.0,2000,178,69,1,19,3,3,...,False,False,True,False,False,False,False,False,True,False
3,72,72,2300000.0,5000,188,81,1,9,3,3,...,True,False,False,False,False,False,False,False,True,False
4,65,65,525000.0,3000,179,74,1,30,2,2,...,False,False,True,False,False,False,False,False,True,False


In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# Splitting the data into training and testing sets (80% training, 20% testing)
# X_train, X_test, y_train, y_test = train_test_split(X_train_smote, y_train_smote, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the model
rf_model.fit(X_train_smote, y_train_smote)



# # Calculate F1 score
# f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' handles multiclass imbalance
# print(f'F1 Score: {f1:.2f}')

In [39]:
y_pred = rf_model.predict(X1_test)

In [40]:
predictions_df = pd.DataFrame({
    'id': df_test['id'],  
    'position': y_pred        # 'y_pred' contains the model predictions
})

# Export the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

print("CSV file with id and predictions has been saved.")

CSV file with id and predictions has been saved.
