In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [16]:
# Load dataset
file_path = 'match_result.csv'
data = pd.read_csv(file_path)

In [17]:
# Data Preprocessing
categorical_features = ['round', 'day', 'venue', 'opponent', 'team']
data[categorical_features] = data[categorical_features].apply(lambda x: LabelEncoder().fit_transform(x))


In [18]:
# Target variable encoding
y = LabelEncoder().fit_transform(data['result'])
X = data.drop(columns=['result', 'date'])

In [19]:
# Feature Engineering for Recent Form
def add_recent_form_features(data):
    data = data.copy()
    data['recent_wins'] = data['gf'].rolling(window=5).apply(lambda x: sum(x > 0)).fillna(0)
    data['recent_losses'] = data['ga'].rolling(window=5).apply(lambda x: sum(x > 0)).fillna(0)
    return data


In [20]:
X = add_recent_form_features(X)

In [21]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [22]:
# Custom Hybrid Ensemble with Adaptive Boosting
class HybridEnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        # Base models
        self.rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        self.gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
        self.nn = MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, random_state=42)

    def fit(self, X, y):
        # Train each model
        self.rf.fit(X, y)
        self.gb.fit(X, y)
        self.nn.fit(X, y)

        # Adaptive weights based on model performance
        self.rf_weight = 0.5
        self.gb_weight = 0.3
        self.nn_weight = 0.2

        return self

    def predict(self, X):
        # Weighted sum of predictions
        rf_pred = self.rf.predict(X)
        gb_pred = self.gb.predict(X)
        nn_pred = self.nn.predict(X)

        final_pred = (self.rf_weight * rf_pred +
                      self.gb_weight * gb_pred +
                      self.nn_weight * nn_pred).round().astype(int)

        return final_pred

    def predict_proba(self, X):
        # Weighted sum of predicted probabilities
        rf_proba = self.rf.predict_proba(X)
        gb_proba = self.gb.predict_proba(X)
        nn_proba = self.nn.predict_proba(X)

        final_proba = (self.rf_weight * rf_proba +
                       self.gb_weight * gb_proba +
                       self.nn_weight * nn_proba)

        return final_proba

In [23]:
# Define pipeline
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
    ]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', HybridEnsembleClassifier())
])

In [24]:
# Train the model
pipeline.fit(X_train, y_train)



In [25]:
# Predict on test data
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

In [26]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
logloss = log_loss(y_test, y_pred_proba)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

In [27]:
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Log Loss: {logloss:.2f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)

Accuracy: 0.90
F1 Score: 0.90
Log Loss: 0.20

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.61      0.75        72
           1       0.80      0.98      0.88       119
           2       0.99      0.99      0.99       133

    accuracy                           0.90       324
   macro avg       0.92      0.86      0.88       324
weighted avg       0.92      0.90      0.90       324


Confusion Matrix:
 [[ 44  28   0]
 [  1 117   1]
 [  0   1 132]]
