# Notebook 1 - Aprendizagem Supervisionada

In [1]:
print("Hello World!")

%pip install --upgrade pip
%pip install numpy
%pip install matplotlib
%pip install pandas
%pip install mlxtend

Hello World!
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Importar o Dataset e preparar os dados

In [2]:
import pandas as pd


male_players = pd.read_csv("Data/EA_FC/male_players.csv",low_memory=False)
female_players = pd.read_csv("Data/EA_FC/female_players.csv")


In [3]:
male_players["gender"] = "M"
female_players["gender"] = "F"

players = pd.concat([male_players.assign(gender="M"),
                     female_players.assign(gender="F")],
                    ignore_index=True)

players24 = players[players["fifa_version"] == 24].copy()


# Selecionar os dados relevantes

### Atribui os grupos de posições baseado na principal posição do jogador. Junta numa coluna nova 'position_group'.


In [4]:
def map_position_group(positions_str: str) -> str:
    if pd.isna(positions_str):
        return "OTHER"
    main_pos = positions_str.split(",")[0].strip()
    defenders = {"CB", "LB", "RB", "LWB", "RWB"}
    mids = {"CDM", "CM", "CAM", "LM", "RM"}
    attackers = {"ST", "CF", "LW", "RW"}

    if main_pos == "GK":
        return "GK"
    elif main_pos in defenders:
        return "DEF"
    elif main_pos in mids:
        return "MID"
    elif main_pos in attackers:
        return "ATT"
    else:
        return "OTHER"

players24["position_group"] = players24["player_positions"].apply(map_position_group)


In [5]:
feature_cols = [
    "pace", "shooting", "passing", "dribbling",
    "defending", "physic",
    "height_cm", "weight_kg", "age",
    "movement_acceleration", "movement_sprint_speed",
    "movement_agility", "movement_balance",
    "power_strength", "power_stamina"
]


data = players24.dropna(subset=feature_cols + ["position_group"]).copy()

X = data[feature_cols]
y = data["position_group"]


## Dividir os dados em treino e teste, e escalar as features para melhorar a performance dos modelos

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7213, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Treinar e avaliar vários modelos de classificação

Foram utilizados três modelos de classificação: Regressão Logística, Random Forest e K-Nearest Neighbors (KNN). Cada modelo foi treinado com os dados de treino escalados e avaliado com os dados de teste escalados. As métricas de avaliação incluem precisão, recall e F1-score para cada classe

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

models = {
    "LogisticRegression": LogisticRegression(max_iter=100000),
    "RandomForest": RandomForestClassifier(random_state=6452),
    "KNN": KNeighborsClassifier()
}

for name, clf in models.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)


    print("====", name, "====")
    print(classification_report(y_test, y_pred))


==== LogisticRegression ====
              precision    recall  f1-score   support

         ATT       0.84      0.77      0.80       772
         DEF       0.87      0.88      0.87      1353
         MID       0.78      0.80      0.79      1461

    accuracy                           0.82      3586
   macro avg       0.83      0.82      0.82      3586
weighted avg       0.82      0.82      0.82      3586

==== RandomForest ====
              precision    recall  f1-score   support

         ATT       0.85      0.75      0.80       772
         DEF       0.88      0.89      0.88      1353
         MID       0.78      0.82      0.80      1461

    accuracy                           0.83      3586
   macro avg       0.84      0.82      0.83      3586
weighted avg       0.83      0.83      0.83      3586

==== KNN ====
              precision    recall  f1-score   support

         ATT       0.78      0.72      0.75       772
         DEF       0.85      0.86      0.85      1353
         

# Determinar o melhor modelo e otimizar seus hiperparâmetros com GridSearchCV

 Com base nos resultados iniciais, o melhor modelo será selecionado e seus hiperparâmetros serão otimizados usando GridSearchCV para melhorar ainda mais o desempenho.

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# 1. Avaliar e selecionar o melhor modelo inicial
model_scores = {}
for name, clf in models.items():
    # O modelo já foi treinado na célula anterior
    y_pred = clf.predict(X_test_scaled)
    score = f1_score(y_test, y_pred, average="macro")
    model_scores[name] = score
    print(f"F1-Score (macro) para {name}: {score:.4f}")

best_model_name = max(model_scores, key=model_scores.get)
print(f"\nMelhor modelo inicial: {best_model_name} com F1-Score de {model_scores[best_model_name]:.4f}")


# 2. Definir grelhas de parâmetros para cada modelo
param_grids = {
    "LogisticRegression": {
        "C": [0.1, 1.0, 10.0],
        "solver": ["liblinear", "saga"]
    },
    "RandomForest": {
        "n_estimators": [100, 200],
        "max_depth": [10, 20, None],
        "min_samples_split": [2, 5],
    },
    "KNN": {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    }
}

# 3. Otimizar o melhor modelo com GridSearchCV
best_model_base = models[best_model_name]
grid_to_use = param_grids[best_model_name]

print(f"\nIniciando GridSearchCV para o modelo: {best_model_name}...")
grid = GridSearchCV(
    estimator=best_model_base,
    param_grid=grid_to_use,
    scoring="f1_macro",
    n_jobs=-1,
    cv=5
)
grid.fit(X_train_scaled, y_train)

# 4. Avaliar o modelo otimizado
print("\nMelhores parâmetros encontrados:", grid.best_params_)
best_model_tuned = grid.best_estimator_
y_pred_tuned = best_model_tuned.predict(X_test_scaled)

print("\nRelatório de classificação para o modelo otimizado:")
print(classification_report(y_test, y_pred_tuned))

print(f"F1-Score  do modelo original: {f1_score(y_test, models[best_model_name].predict(X_test_scaled), average='macro'):.4f}")
print(f"F1-Score (macro) do modelo otimizado: {f1_score(y_test, y_pred_tuned, average='macro'):.4f}")


F1-Score (macro) para LogisticRegression: 0.8221
F1-Score (macro) para RandomForest: 0.8264
F1-Score (macro) para KNN: 0.7832

Melhor modelo inicial: RandomForest com F1-Score de 0.8264

Iniciando GridSearchCV para o modelo: RandomForest...

Melhores parâmetros encontrados: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}

Relatório de classificação para o modelo otimizado:
              precision    recall  f1-score   support

         ATT       0.85      0.75      0.80       772
         DEF       0.88      0.89      0.88      1353
         MID       0.78      0.82      0.80      1461

    accuracy                           0.83      3586
   macro avg       0.84      0.82      0.83      3586
weighted avg       0.83      0.83      0.83      3586

F1-Score  do modelo original: 0.8264
F1-Score (macro) do modelo otimizado: 0.8264
