## Decision Tree Model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss, f1_score, roc_auc_score

def decision_tree():
    # Read the data
    df = pd.read_csv('pca_transformed_data.csv')
    
    # Split into X (features) and y (result)
    X = df.drop('result', axis=1)  
    y = df['result']
    
    # Train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
    
    # Make the model
    model = DecisionTreeClassifier(random_state=10)
    
    # Fit training data to model
    model.fit(X_train, y_train)
    
    # Make model predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    l_loss = log_loss(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    

    # Print results
    print("Model Performance:")
    print(f"Log Loss: {l_loss:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    
    # See what are the most important features
    feature_importances = model.feature_importances_

    importance = pd.DataFrame({
        'Component': X.columns,
        'Feature_Importance': feature_importances
    })
    importance = importance.sort_values('Feature_Importance', ascending=False)

    print("\nComponent Importance:")
    print(importance)


decision_tree()

Model Performance:
Log Loss: 2.8248
F1 Score: 0.9184
ROC AUC Score: 0.9214

Component Importance:
                               Component  Feature_Importance
0              Gold_Advantage_and_Towers            0.728783
3                       Grubs_and_Herald            0.145236
6   Laning_Phase_and_Early_Baron_Control            0.026301
2                      Teamfight_Metrics            0.022985
1                         Vision_Control            0.017847
8             Late_Game_Elder_Teamfights            0.017485
7  Early_to_Late_Game_Objective_Sequence            0.014904
4                         Herald_Control            0.013612
5              Monster_Objective_Control            0.012846


# Bagged Decision Tree with Grid Search

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, f1_score, roc_auc_score

def bagged_decision_tree():
    # Read the data
    df = pd.read_csv('pca_transformed_data.csv')

    # Split into X (features) and y (result)
    X = df.drop('result', axis=1)  
    y = df['result']

    # Train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

    # Make the model
    base_estimator = DecisionTreeClassifier(random_state=10)
    model = BaggingClassifier(estimator=base_estimator, random_state=10)
    # Fit training data to model
    model.fit(X_train, y_train)

    # Make model predictions
    y_pred = model.predict(X_test)
    
    # Parameter grid for grid search
    param_grid = {
        'n_estimators': [10, 50, 75, 100, 125, 150],
        'max_samples': [0.5, 0.7, 1.0],
        'max_features': [0.5, 0.7, 1.0]
    }
    
    # Grid search object
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='f1'
    )
    
    # Perform grid search
    grid_search.fit(X_train, y_train)
    
    # Print best parameters
    print("\nBest parameters:")
    print(grid_search.best_params_)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions with the best model
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    l_loss = log_loss(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    

    # Print results
    print("Model Performance:")
    print(f"Log Loss: {l_loss:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    
    # Feature importance using the average of all the trees
    n_features = X.shape[1]
    importances = np.zeros((len(best_model.estimators_), n_features))
    
    for i, tree in enumerate(best_model.estimators_):
        # Get the indices of features used in this tree
        feature_mask = best_model.estimators_features_[i]
        # Fill in the importances for the features that were used
        importances[i, feature_mask] = tree.feature_importances_
    
    # Average importance across all trees
    feature_importances = np.mean(importances, axis=0)
    
    # Look at which components matter most
    importance = pd.DataFrame({
        'Component': X.columns,
        'Feature_Importance': feature_importances
    })
    importance = importance.sort_values('Feature_Importance', ascending=False)
    
    print("\nComponent Importance (averaged across all trees):")
    print(importance)



bagged_decision_tree()


Best parameters:
{'max_features': 0.7, 'max_samples': 1.0, 'n_estimators': 75}
Model Performance:
Log Loss: 1.8976
F1 Score: 0.9456
ROC AUC Score: 0.9475

Component Importance (averaged across all trees):
                               Component  Feature_Importance
0              Gold_Advantage_and_Towers            0.489616
3                       Grubs_and_Herald            0.154653
4                         Herald_Control            0.084595
5              Monster_Objective_Control            0.059725
1                         Vision_Control            0.050559
2                      Teamfight_Metrics            0.048316
8             Late_Game_Elder_Teamfights            0.042991
6   Laning_Phase_and_Early_Baron_Control            0.039271
7  Early_to_Late_Game_Objective_Sequence            0.030275


# Random Forest

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, f1_score, roc_auc_score

def random_forest():
    # Read the data
    df = pd.read_csv('pca_transformed_data.csv')

    # Split into X (features) and y (result)
    X = df.drop('result', axis=1)  
    y = df['result']

    # Train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

    # Make the model
    model = RandomForestClassifier(random_state=10)
    # Fit training data to model
    model.fit(X_train, y_train)

    # Make model predictions
    y_pred = model.predict(X_test)
    
    # Parameter grid for grid search
    param_grid = {
        'n_estimators': [10, 50, 75, 100, 125, 150],
        'max_depth': [3, 5, 10, 20]
    }
    
    # Grid search object
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='f1'  # Use F1 score for evaluation
    )
    
    # Perform grid search
    grid_search.fit(X_train, y_train)
    
    # Print best parameters
    print("\nBest parameters:")
    print(grid_search.best_params_)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions with the best model
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    l_loss = log_loss(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    

    # Print results
    print("Model Performance:")
    print(f"Log Loss: {l_loss:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    
    # Feature importance using the average of all the trees
    feature_importances = np.mean([tree.feature_importances_ for tree in best_model.estimators_], axis=0)
    
    # Look at which components matter most
    importance = pd.DataFrame({
        'Component': X.columns,
        'Feature_Importance': feature_importances
    })
    importance = importance.sort_values('Feature_Importance', ascending=False)
    
    print("\nComponent Importance (averaged across all trees):")
    print(importance)



random_forest()


Best parameters:
{'max_depth': 10, 'n_estimators': 100}
Model Performance:
Log Loss: 1.9084
F1 Score: 0.9455
ROC AUC Score: 0.9473

Component Importance (averaged across all trees):
                               Component  Feature_Importance
0              Gold_Advantage_and_Towers            0.697592
3                       Grubs_and_Herald            0.148220
4                         Herald_Control            0.029497
1                         Vision_Control            0.029425
2                      Teamfight_Metrics            0.023879
5              Monster_Objective_Control            0.020990
6   Laning_Phase_and_Early_Baron_Control            0.019823
8             Late_Game_Elder_Teamfights            0.017205
7  Early_to_Late_Game_Objective_Sequence            0.013368
