In [1]:
import torch
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import LabelEncoder
from sklearn.base import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
dataset_path = "../data/jogosLoL2021.csv"
df = pd.read_csv(dataset_path, sep=',')
df.head()

Unnamed: 0,id,result,golddiffat15,xpdiffat15,csdiffat15,killsdiffat15,assistsdiffat15,golddiffat10,xpdiffat10,csdiffat10,...,OPP_EGR,OPP_MLR,OPP_FB%,OPP_FT%,OPP_F3T%,OPP_HLD%,OPP_DRG%,OPP_BN%,OPP_LNE%,OPP_JNG%
0,10,1,5018.0,4255.0,86.0,5.0,9.0,1793.0,2365.0,65.0,...,23.1,-23.1,0,0,33,50,27,0,49.2,43.7
1,22,0,573.0,-1879.0,-49.0,1.0,4.0,759.0,171.0,-8.0,...,77.2,22.8,100,100,100,58,70,89,50.4,53.3
2,34,0,-579.0,-1643.0,-40.0,-1.0,-5.0,73.0,-1.0,-24.0,...,77.2,22.8,100,100,100,58,70,89,50.4,53.3
3,106,1,3739.0,1118.0,53.0,1.0,0.0,1746.0,824.0,21.0,...,63.9,-3.9,67,67,67,48,60,48,51.6,50.3
4,118,0,-6390.0,-4569.0,-47.0,-10.0,-17.0,-3500.0,-1882.0,-18.0,...,25.8,-0.8,13,25,25,19,20,20,49.7,42.2


In [3]:
df.drop(columns="id", inplace=True)
df.dropna(inplace=True)

In [4]:
df.describe()

Unnamed: 0,result,golddiffat15,xpdiffat15,csdiffat15,killsdiffat15,assistsdiffat15,golddiffat10,xpdiffat10,csdiffat10,killsdiffat10,...,OPP_EGR,OPP_MLR,OPP_FB%,OPP_FT%,OPP_F3T%,OPP_HLD%,OPP_DRG%,OPP_BN%,OPP_LNE%,OPP_JNG%
count,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,...,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0,8152.0
mean,0.531894,276.489328,4.800417,-0.348135,0.173332,0.311089,102.689892,32.706452,0.719946,0.090898,...,44.814009,-6.222031,48.298945,43.971663,42.174681,45.288027,45.828754,41.54183,49.774583,48.389561
std,0.499012,3092.939045,2164.253849,41.919732,3.769752,7.180593,1520.04259,1138.075319,27.248651,2.371612,...,14.794485,16.023668,20.784422,22.695632,24.110996,18.265473,13.370805,21.333411,1.044415,4.023629
min,0.0,-13908.0,-10397.0,-181.0,-22.0,-34.0,-7632.0,-5829.0,-109.0,-15.0,...,3.1,-59.9,0.0,0.0,0.0,0.0,0.0,0.0,45.3,35.5
25%,0.0,-1685.25,-1344.25,-27.0,-2.0,-4.0,-865.0,-691.25,-17.0,-1.0,...,35.1,-15.2,33.0,30.0,25.0,33.0,39.0,29.0,49.2,45.9
50%,1.0,313.5,26.0,0.0,0.0,0.0,115.0,26.0,1.0,0.0,...,44.8,-5.9,50.0,43.0,43.0,48.0,46.0,43.0,49.7,48.7
75%,1.0,2247.0,1334.25,26.0,2.0,4.0,1037.5,754.0,19.0,1.0,...,53.4,4.1,61.0,59.0,58.0,58.0,54.0,56.0,50.3,50.8
max,1.0,13855.0,11914.0,183.0,21.0,32.0,7043.0,6464.0,137.0,13.0,...,98.1,49.4,100.0,100.0,100.0,100.0,94.0,100.0,55.3,64.8


---

Definição de seeds, variaveis, ...

In [5]:
np.random.seed(42)
TARGET = "result"
PRE_GAME_FEATURES = [
    'WR', 'KD', 'GPR', 'GSPD', 'EGR', 'MLR', 'FB%', 'FT%', 'F3T%', 
    'HLD%', 'DRG%', 'BN%', 'LNE%', 'JNG%',
    'OPP_WR', 'OPP_KD', 'OPP_GPR', 'OPP_GSPD', 'OPP_EGR', 'OPP_MLR',
    'OPP_FB%', 'OPP_FT%', 'OPP_F3T%', 'OPP_HLD%', 'OPP_DRG%', 
    'OPP_BN%', 'OPP_LNE%', 'OPP_JNG%'
]

AT_10M_FEATURES = [
    'golddiffat10',
    'xpdiffat10',
    'csdiffat10',   
    'killsdiffat10',
    'assistsdiffat10'
]

AT_15M_FEATURES = [
    'golddiffat15', 'xpdiffat15', 'csdiffat15', 
    'killsdiffat15', 'assistsdiffat15'
]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
# Example: Move a tensor to the selected device
# tensor = torch.tensor([1.0, 2.0, 3.0]).to(device)

Using device: cuda


In [6]:
# Se há colunas categóricas que precisam ser codificadas:
feature_columns = None   
le = LabelEncoder()

# Separar features e target
if feature_columns is None:
    X = df.drop(columns=[TARGET]).values
    feature_names = df.drop(columns=[TARGET]).columns.tolist()
else:
    X = df[feature_columns].values
    feature_names = feature_columns

y = df[TARGET].values

In [7]:
# Informações do dataset
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
class_distribution = np.bincount(y)

print(f"Dataset carregado: {n_samples} amostras, {n_features} features, {n_classes} classes")
print(f"Distribuição das classes: {dict(zip(range(n_classes), class_distribution))}")
print(f"Features: {feature_names[:5]}{'...' if len(feature_names) > 5 else ''}")

Dataset carregado: 8152 amostras, 38 features, 2 classes
Distribuição das classes: {0: np.int64(3816), 1: np.int64(4336)}
Features: ['golddiffat15', 'xpdiffat15', 'csdiffat15', 'killsdiffat15', 'assistsdiffat15']...


---

## Random Forest

In [None]:
def nested_cv_random_forest(
    df, 
    feature_columns, 
    target_column='result', 
    rf_hyperparameters=None, 
    n_classes=2, 
    n_outer_folds=10, 
    n_inner_folds=4, 
    n_rounds=3, 
    random_state=36854321
):
    if rf_hyperparameters is None:
        rf_hyperparameters = {'n_estimators': [5, 10, 15, 25], 'max_depth': [10, None]}
    print(f"Grid de hiperparâmetros: {np.prod([len(v) for v in rf_hyperparameters.values()])} combinações")

    X = df[feature_columns].values
    y = df[target_column].values

    resultados_completos = defaultdict(list)
    detalhes_rodadas = []

    for rodada in range(n_rounds):
        print(f"\nRODADA {rodada + 1}/{n_rounds}")
        print("-" * 40)

        indices = np.random.permutation(len(X))
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        outer_cv = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=random_state)
        acuracias_fold_externo = []
        detalhes_folds = []

        for fold_externo, (train_idx, test_idx) in enumerate(outer_cv.split(X_shuffled, y_shuffled)):
            print(f"Fold Externo {fold_externo + 1}/{n_outer_folds}", end=" - ")

            X_train_outer = X_shuffled[train_idx]
            X_test_outer = X_shuffled[test_idx]
            y_train_outer = y_shuffled[train_idx]
            y_test_outer = y_shuffled[test_idx]

            train_dist = np.bincount(y_train_outer, minlength=n_classes)
            test_dist = np.bincount(y_test_outer, minlength=n_classes)

            inner_cv = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=random_state)

            rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
            grid_search = GridSearchCV(
                rf,
                rf_hyperparameters,
                cv=inner_cv,
                scoring='accuracy',
                n_jobs=-1,
                verbose=0
            ) # Otimiza hiperparâmetros para cada fold externo e usa validação cruzada interna (4 folds) -> perforance melhorada
            
            grid_search.fit(X_train_outer, y_train_outer)
            y_pred = grid_search.predict(X_test_outer)
            acuracia = accuracy_score(y_test_outer, y_pred)
            acuracias_fold_externo.append(acuracia)

            detalhes_fold = {
                'fold': fold_externo + 1,
                'melhores_params': grid_search.best_params_,
                'acuracia_validacao_interna': grid_search.best_score_,
                'acuracia_teste_externo': acuracia,
                'tamanho_treino': len(X_train_outer),
                'tamanho_teste': len(X_test_outer),
                'distribuicao_treino': train_dist.tolist(),
                'distribuicao_teste': test_dist.tolist()
            }
            detalhes_folds.append(detalhes_fold)

            print(f"Acurácia: {acuracia:.4f} | Val.Interna: {grid_search.best_score_:.4f}")

        acuracia_media_rodada = np.mean(acuracias_fold_externo)
        desvio_rodada = np.std(acuracias_fold_externo)

        print(f"\n  Resultados da Rodada {rodada + 1}:")
        print(f"     Acurácia Média: {acuracia_media_rodada:.4f} ± {desvio_rodada:.4f}")
        print(f"     Min: {min(acuracias_fold_externo):.4f} | Max: {max(acuracias_fold_externo):.4f}")

        resultados_completos['rodada'].append(rodada + 1)
        resultados_completos['acuracia_media'].append(acuracia_media_rodada)
        resultados_completos['desvio_padrao'].append(desvio_rodada)
        resultados_completos['acuracias_folds'].append(acuracias_fold_externo)

        detalhes_rodadas.append({
            'rodada': rodada + 1,
            'acuracia_media': acuracia_media_rodada,
            'desvio_padrao': desvio_rodada,
            'detalhes_folds': detalhes_folds
        })

    print("\n" + "="*60)
    print("RESULTADO FINAL DAS 3 RODADAS")
    print("="*60)

    acuracias_finais = resultados_completos['acuracia_media']
    acuracia_final_media = np.mean(acuracias_finais)
    desvio_final = np.std(acuracias_finais)

    print(f"Acurácia Final do Random Forest: {acuracia_final_media:.4f} ± {desvio_final:.4f}")
    print(f"Intervalo de Confiança (~95%): [{acuracia_final_media - 2*desvio_final:.4f}, {acuracia_final_media + 2*desvio_final:.4f}]")

    return {
        'resultados_completos': resultados_completos,
        'detalhes_rodadas': detalhes_rodadas,
        'acuracia_final_media': acuracia_final_media,
        'desvio_final': desvio_final
    }


In [9]:
print("Iniciando a validação cruzada aninhada com Random Forest...")

print(f"Resultado usando {len(PRE_GAME_FEATURES)} features pré-jogo e {n_classes} classes.")
resultados_rf_pre_jogo = nested_cv_random_forest(
    df, 
    feature_columns=PRE_GAME_FEATURES, 
    target_column=TARGET, 
    n_classes=n_classes, 
    n_outer_folds=10, 
    n_inner_folds=4, 
    n_rounds=3
)

Iniciando a validação cruzada aninhada com Random Forest...
Resultado usando 28 features pré-jogo e 2 classes.
Grid de hiperparâmetros: 8 combinações

RODADA 1/3
----------------------------------------
Fold Externo 1/10 - Acurácia: 0.6164 | Val.Interna: 0.6043
Fold Externo 2/10 - Acurácia: 0.6042 | Val.Interna: 0.6140
Fold Externo 3/10 - Acurácia: 0.6123 | Val.Interna: 0.6054
Fold Externo 4/10 - Acurácia: 0.6380 | Val.Interna: 0.5938
Fold Externo 5/10 - Acurácia: 0.5816 | Val.Interna: 0.6171
Fold Externo 6/10 - Acurácia: 0.6221 | Val.Interna: 0.6116
Fold Externo 7/10 - Acurácia: 0.6331 | Val.Interna: 0.6060
Fold Externo 8/10 - Acurácia: 0.6049 | Val.Interna: 0.6136
Fold Externo 9/10 - Acurácia: 0.6147 | Val.Interna: 0.6120
Fold Externo 10/10 - Acurácia: 0.6025 | Val.Interna: 0.6092

  Resultados da Rodada 1:
     Acurácia Média: 0.6130 ± 0.0154
     Min: 0.5816 | Max: 0.6380

RODADA 2/3
----------------------------------------
Fold Externo 1/10 - Acurácia: 0.6127 | Val.Interna: 0.6080