In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Use regressor instead of classifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the pre-processed merged data
data = pd.read_csv('../data/player_data/merged_player_data.csv')

# Feature Engineering: Create missing features
data['Assists_Scaled'] = data['ASSISTS'] / data['GAMES']  # Example scaling
data['Assists_Per_90'] = data['ASSISTS'] / (data['GAMES'] * 90)  # Assists per 90 minutes

# Define features (X) and target variable (y)
X = data[['Assists_Scaled', 'Completed Pass', 'GOALS SCORED', 'Assists_Per_90']]  # Updated features
y = data['GOALS SCORED']  # Predict the exact number of goals scored

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply scaling on the training data to avoid data leakage
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train[['Assists_Scaled', 'Completed Pass']] = scaler.fit_transform(X_train[['Assists_Scaled', 'Completed Pass']])

# Transform the test data using the same scaler
X_test[['Assists_Scaled', 'Completed Pass']] = scaler.transform(X_test[['Assists_Scaled', 'Completed Pass']])

# Train a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")

# Save the trained model for future use
with open('../models/best_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


Mean Absolute Error: 0.00
Mean Squared Error: 0.00
