Install Dependencies

In [None]:
%pip install -r requirements.txt

Import Libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, auc, classification_report, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier


Load Dataset

In [None]:
# Read the processed dataset from disk; the first CSV column becomes the row index
df = pd.read_csv(filepath_or_buffer="data/processed_dataset.csv", index_col=0)
print("Dataset loaded successfully from 'data/processed_dataset.csv'")

# Display the first rows as a quick sanity check of the load
df.head()

Train-test Split

In [None]:
# Separate the target column ("blueWin") from the predictor columns
y = df["blueWin"]
X = df.drop("blueWin", axis=1)

print("X shape:", X.shape)
print("y shape:", y.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Scaling

In [None]:
# Learn the standardisation statistics from the training split only, so that
# nothing about the test split influences the scaling
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transform to both splits (both names are rebound to
# the transformed arrays)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Feature Selection

In [None]:
def find_bestKfeatures(model):
  """Choose how many SelectKBest features to keep for ``model``.

  Odd values of k (1, 3, ..., up to 15 or the number of available features,
  whichever is smaller) are tried. For each k a ``SelectKBest -> model``
  pipeline is fitted on part of the training data and scored on a validation
  split carved out of the training data. The test split is deliberately NOT
  used here: picking k on the test set would leak test information into model
  selection and make the final test metrics optimistic.

  Reads the notebook-level globals ``X_train``, ``y_train`` (training split)
  and ``X`` (the original feature frame, used only to print column names).

  Args:
    model: estimator placed as the last pipeline step; it is fitted in place
      by each pipeline fit.

  Returns:
    list: column indices of the features chosen for the best-scoring k.
  """
  print("Performing feature selection...")

  # Validation split taken from the training data only (fixed seed, matching
  # the seed used for the notebook's train/test split)
  X_fit, X_val, y_fit, y_val = train_test_split(
      X_train, y_train, test_size=0.2, random_state=0)

  # Never ask SelectKBest for more features than exist
  max_k = min(15, X_train.shape[1])

  # Select best k features
  k = -1
  max_score = float("-inf")  # any real score beats this, so the first k is always recorded
  selected_features_indices = []
  for i in range(1, max_k + 1, 2):
    selector = SelectKBest(k=i)
    pipeline = Pipeline([('selector', selector), ('model', model)])
    pipeline.fit(X_fit, y_fit)
    score = pipeline.score(X_val, y_val)
    print("K: {}, score: {}".format(i, score))
    if score > max_score:
      k = i
      max_score = score
      selected_features_indices = selector.get_support(indices=True)
  print("Best K number: {}, score: {}".format(k, max_score))
  print("Selected features: ", list(X.columns[selected_features_indices]))
  print()
  # return feature index list
  return list(selected_features_indices)

def apply_feature_selection(X, selected_features_indices):
  """Return only the columns of ``X`` at the given positional indices.

  Args:
    X: 2-D feature container. Objects exposing ``.iloc`` (e.g. pandas
      DataFrames) are indexed positionally through it; anything else is
      indexed with ``X[:, indices]`` as a NumPy-style array.
    selected_features_indices: sequence of integer column positions to keep.

  Returns:
    The column subset, of the same container kind as ``X``.
  """
  # DataFrames do not support X[:, idx]; use positional indexing instead
  if hasattr(X, "iloc"):
    return X.iloc[:, selected_features_indices]
  return X[:, selected_features_indices]

Hyper-Parameter Tuning

In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) used by the randomized search below
cv = KFold(n_splits=5, shuffle=True, random_state=0)

def train_model_with_random_search(model, param_grid, X_train, y_train, n_iter=10, random_state=0):
    """Tune ``model`` with a randomized hyper-parameter search and return the winner.

    The search samples ``n_iter`` candidates from ``param_grid``, scores each
    by accuracy using the notebook-level ``cv`` splitter, and runs on all
    available cores (``n_jobs=-1``).

    Args:
        model: estimator to tune.
        param_grid: mapping of parameter names to candidate values, passed as
            ``param_distributions``.
        X_train: training features.
        y_train: training labels.
        n_iter: number of parameter settings sampled (default 10, the value
            previously hard-coded).
        random_state: seed for the candidate sampling, so repeated runs try
            the same settings (default 0, matching the seed used elsewhere in
            this notebook).

    Returns:
        tuple: ``(best_model, best_param)`` - the search's ``best_estimator_``
        and its ``best_params_``.
    """
    print("Starting training...")

    # Initialize RandomizedSearchCV (seeded so the sampled candidates are reproducible)
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=n_iter,
                                       scoring='accuracy', n_jobs=-1, cv=cv,
                                       random_state=random_state)

    # Fit RandomizedSearchCV
    random_search.fit(X_train, y_train)

    print("Training completed")
    print()

    # Get best estimator
    best_model = random_search.best_estimator_
    # best_score_ is the score of the best candidate as reported by the search
    best_score = random_search.best_score_
    print('Best Training Score: ', best_score)
    # Get best param
    best_param = random_search.best_params_
    print('Best Parameters: ', best_param)
    print()

    # Return model
    return best_model, best_param

Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_true=None):
    """Print test metrics for ``model``, plot its ROC curve and return the scores.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: features to evaluate on.
        y_true: true labels aligned with ``X_test``. When omitted, the
            notebook-level ``y_test`` is used, which is what this function
            always read before the parameter existed.

    Returns:
        tuple: ``(accuracy, f1, roc_auc)`` where ``f1`` is the weighted F1
        score and ``roc_auc`` is computed from the second column of
        ``predict_proba``.
    """
    # Fall back to the global test labels for backward compatibility
    if y_true is None:
        y_true = y_test

    print("Model Evaluation: ")

    # Predict on test set
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC computations
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Compute evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_true, y_pred_prob)

    print('Accuracy: ', accuracy)
    print('F1 Score: ', f1)
    print('AUC(ROC): ', roc_auc)
    print()
    print("Classification Report: ")
    print(classification_report(y_true, y_pred))

    # ROC AUC plot
    fper, tper, _ = roc_curve(y_true, y_pred_prob)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(fper, tper, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")

    plt.show()

    return accuracy, f1, roc_auc

Random Forest Classifier

In [None]:
# Candidate hyper-parameter values sampled by the randomized search
param_grid = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini'],
    'max_depth': [None, 3, 4, 5, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Seeded (same seed as the split and the CV folds) so that feature selection
# and the final metrics do not change from run to run
model = RandomForestClassifier(random_state=0)

# Perform feature selection
selected_features = find_bestKfeatures(model)
# Keep only the selected columns in both splits
X_train_rd = apply_feature_selection(X_train, selected_features)
X_test_rd = apply_feature_selection(X_test, selected_features)

# Train model with RandomizedSearchCV
trained_model, param_sample = train_model_with_random_search(model, param_grid, X_train_rd, y_train)

# Evaluate model
accuracy, f1, roc_auc = evaluate_model(trained_model, X_test_rd)