In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os

# Load the merged dataset
merged_data = pd.read_csv('../data/player_data/merged_player_data.csv')

# Feature engineering
# 1. Goals per 90 minutes (already in 'GOALS PER MATCH')
# 2. Assists per 90 minutes (already in 'ASSISTS PER MATCH')
# 3. Pass Completion Rate
merged_data['Pass_Completion_Rate'] = merged_data['Completed Pass'] / merged_data['Total Pass'] * 100

# 4. Goals per 90 minutes (derived from 'GOALS SCORED' and 'GAMES')
merged_data['Goals_Per_90'] = merged_data['GOALS SCORED'] / merged_data['GAMES'] * 90

# 5. Scaled assists (Standardization of 'ASSISTS PER MATCH')
scaler = StandardScaler()
merged_data['Assists_Scaled'] = scaler.fit_transform(merged_data[['ASSISTS PER MATCH']])

# Handle missing values: Using mean imputation for numeric columns
merged_data.fillna(merged_data.mean(numeric_only=True), inplace=True)

# Ensure the 'data' directory exists before saving the file
if not os.path.exists('data'):
    os.makedirs('data')

# Save the engineered data for modeling
merged_data.to_csv('data/merged_player_data_engineered.csv', index=False)

# Display the engineered features
print("Engineered Data Head:")
print(merged_data.head())


Engineered Data Head:
   Position                 NAME           TEAM  GOALS SCORED  GAMES  \
0       250  AINGERU OLABARRIETA  ATHLETIC CLUB             0      0   
1       145        AITOR PAREDES  ATHLETIC CLUB             1     28   
2        27       ALEX BERENGUER  ATHLETIC CLUB             6     27   
3       250        ANDER HERRERA  ATHLETIC CLUB             0     17   
4       145     ASIER VILLALIBRE  ATHLETIC CLUB             1     13   

   GOALS PER MATCH  Assists_Position  ASSISTS  ASSISTS PER MATCH  \
0             0.00               239        0               0.00   
1             0.04               239        0               0.00   
2             0.22                74        2               0.07   
3             0.00                74        2               0.12   
4             0.08               137        1               0.08   

   Pass_Position  Completed Pass  Total Pass  Pass_Completion_Rate  \
0            566               0           0             78.370373