In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Load the dataset
haaland_shots_data = pd.read_csv('../data/raw/haaland_shot_map.csv')
print(haaland_shots_data.shape)

# Select relevant features and target
features = ['X', 'Y', 'shotType', 'lastAction', 'situation']
target = 'xG'
data = haaland_shots_data[features + [target]].dropna()

# Preprocess data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['X', 'Y']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['shotType', 'lastAction', 'situation'])
    ]
)
X = data[features]
y = data[target]
X_processed = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Train and evaluate traditional models
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name, "MSE": mse, "R²": r2})

# Add neural network model
input_dim = X_train.shape[1]

# Define Neural Network architecture
nn_model = Sequential([
    Dense(64, input_dim=input_dim, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear')
])

# Compile the neural network
nn_model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train the neural network for 50 epochs
history = nn_model.fit(
    X_train,
    y_train,
    epochs=50,  # Change to 60 for 60 epochs
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate the neural network
y_pred_nn = nn_model.predict(X_test, batch_size=32).flatten()
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

# Add neural network results
results.append({"Model": "Neural Network", "MSE": mse_nn, "R²": r2_nn})

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)
results_df.to_csv("../data/model/model_comparison_results.csv")
print(results_df)



(522, 20)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step - loss: 0.1578 - mse: 0.1578 - val_loss: 0.0576 - val_mse: 0.0576
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.1152 - mse: 0.1152 - val_loss: 0.0413 - val_mse: 0.0413
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0907 - mse: 0.0907 - val_loss: 0.0339 - val_mse: 0.0339
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.0650 - mse: 0.0650 - val_loss: 0.0280 - val_mse: 0.0280
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.0573 - mse: 0.0573 - val_loss: 0.0249 - val_mse: 0.0249
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0488 - mse: 0.0488 - val_loss: 0.0222 - val_mse: 0.0222
Epoch 7/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0