### Atividade de Classificadores
database: https://www.kaggle.com/datasets/bobbyscience/league-of-legends-diamond-ranked-games-10-min

In [1]:
# 1. Importar bibliotecas essenciais
import pandas as pd
import numpy as np

# Modelos e ferramentas para classificação e validação
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Métricas
from sklearn.metrics import accuracy_score

# Ignore warnings para deixar o output limpo
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 2. Carregar os dados
# Supondo que o arquivo esteja no formato CSV chamado 'lol_matches.csv'
df = pd.read_csv('../../datasets/high_diamond_ranked_10min.csv')

# Vamos dar uma olhada nas primeiras linhas
print(df.head())

       gameId  blueWins  blueWardsPlaced  blueWardsDestroyed  blueFirstBlood  \
0  4519157822         0               28                   2               1   
1  4523371949         0               12                   1               0   
2  4521474530         0               15                   0               0   
3  4524384067         0               43                   1               0   
4  4436033771         0               75                   4               0   

   blueKills  blueDeaths  blueAssists  blueEliteMonsters  blueDragons  ...  \
0          9           6           11                  0            0  ...   
1          5           5            5                  0            0  ...   
2          7          11            4                  1            1  ...   
3          4           5            5                  1            0  ...   
4          6           6            6                  0            0  ...   

   redTowersDestroyed  redTotalGold  redAvgLevel  

In [3]:
# 3. Entendendo os dados

# Mostrar informações básicas e checar dados faltantes
print(df.info())

# Visualizar a distribuição da variável target (a ser prevista)
print(df['blueWins'].value_counts())

# Aqui, 'blueWins' é a variável target:
# 1 se o time azul venceu, 0 se perdeu

# Vamos definir features e target:
X = df.drop(columns=['gameId', 'blueWins'])  # Remove 'gameId' (não útil para predição) e target
y = df['blueWins']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gameId                        9879 non-null   int64  
 1   blueWins                      9879 non-null   int64  
 2   blueWardsPlaced               9879 non-null   int64  
 3   blueWardsDestroyed            9879 non-null   int64  
 4   blueFirstBlood                9879 non-null   int64  
 5   blueKills                     9879 non-null   int64  
 6   blueDeaths                    9879 non-null   int64  
 7   blueAssists                   9879 non-null   int64  
 8   blueEliteMonsters             9879 non-null   int64  
 9   blueDragons                   9879 non-null   int64  
 10  blueHeralds                   9879 non-null   int64  
 11  blueTowersDestroyed           9879 non-null   int64  
 12  blueTotalGold                 9879 non-null   int64  
 13  blu

In [4]:
# 4. Escolher os classificadores

# Vamos usar:
# - Logistic Regression
# - Random Forest
# - SVM (Support Vector Machine)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

In [5]:
# 5. Avaliar os modelos com 5-fold cross-validation

from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"{name}: Média de acurácia: {scores.mean():.4f} ± {scores.std():.4f}")

Logistic Regression: Média de acurácia: 0.7312 ± 0.0134
Random Forest: Média de acurácia: 0.7171 ± 0.0141
SVM: Média de acurácia: 0.7270 ± 0.0127


In [6]:
# 6. Ajustar hiperparâmetros (tuning) para 2 classificadores

# 6.1 Random Forest
param_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_rf, cv=cv, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X, y)
print("Melhores parâmetros Random Forest:", grid_rf.best_params_)
print(f"Melhor acurácia Random Forest: {grid_rf.best_score_:.4f}")

# 6.2 SVM
param_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(SVC(random_state=42), param_svm, cv=cv, scoring='accuracy', n_jobs=-1)
grid_svm.fit(X, y)
print("Melhores parâmetros SVM:", grid_svm.best_params_)
print(f"Melhor acurácia SVM: {grid_svm.best_score_:.4f}")

Melhores parâmetros Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Melhor acurácia Random Forest: 0.7269
Melhores parâmetros SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Melhor acurácia SVM: 0.7331


In [7]:
# 7. Comparar resultados finais

print(f"Logistic Regression (padrão): {cross_val_score(models['Logistic Regression'], X, y, cv=cv, scoring='accuracy').mean():.4f}")
print(f"Random Forest (tuned): {grid_rf.best_score_:.4f}")
print(f"SVM (tuned): {grid_svm.best_score_:.4f}")

# Identificando o melhor modelo
best_model_name = None
best_score = 0

if grid_rf.best_score_ > best_score:
    best_score = grid_rf.best_score_
    best_model_name = 'Random Forest (tuned)'
if grid_svm.best_score_ > best_score:
    best_score = grid_svm.best_score_
    best_model_name = 'SVM (tuned)'
if cross_val_score(models['Logistic Regression'], X, y, cv=cv, scoring='accuracy').mean() > best_score:
    best_score = cross_val_score(models['Logistic Regression'], X, y, cv=cv, scoring='accuracy').mean()
    best_model_name = 'Logistic Regression (default)'

print(f"\nMelhor modelo: {best_model_name} com acurácia de {best_score:.4f}")

Logistic Regression (padrão): 0.7312
Random Forest (tuned): 0.7269
SVM (tuned): 0.7331

Melhor modelo: SVM (tuned) com acurácia de 0.7331
