# Notebook 1 - Aprendizagem Supervisionada

In [1]:
print("Hello World!")

%pip install --upgrade pip
%pip install numpy
%pip install matplotlib
%pip install pandas
%pip install mlxtend

Hello World!
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Importar o Dataset e preparar os dados

In [2]:
import pandas as pd


# Load the raw player tables; paths are relative to the notebook's working
# directory. low_memory=False makes pandas infer each column's dtype from the
# whole file instead of chunk by chunk. It is applied to both files so their
# columns are typed consistently before the two frames are concatenated.
male_players = pd.read_csv("Data/EA_FC/male_players.csv", low_memory=False)
female_players = pd.read_csv("Data/EA_FC/female_players.csv", low_memory=False)


In [3]:
# Tag every row with the table it came from. The label is written onto the
# source frames themselves, so they keep a `gender` column after this cell.
male_players["gender"] = "M"
female_players["gender"] = "F"

# Stack both tables into one frame with a fresh 0..n-1 index. The frames
# already carry `gender`, so no extra `.assign(...)` copies are needed.
players = pd.concat([male_players, female_players], ignore_index=True)

# Keep only the rows whose `fifa_version` is 24. `.copy()` detaches the result
# from `players`, so columns added later do not touch the combined frame.
players24 = players[players["fifa_version"] == 24].copy()


## Selecionar os dados relevantes

### Atribuir os grupos de posições com base na posição principal do jogador

In [4]:
# Lookup table from a single position code to its coarse position group.
_POSITION_TO_GROUP = {
    "GK": "GK",
    **dict.fromkeys(("CB", "LB", "RB", "LWB", "RWB"), "DEF"),
    **dict.fromkeys(("CDM", "CM", "CAM", "LM", "RM"), "MID"),
    **dict.fromkeys(("ST", "CF", "LW", "RW"), "ATT"),
}


def map_position_group(positions_str: str) -> str:
    """Map a comma-separated position list to a coarse position group.

    Only the first entry of the list is considered; surrounding whitespace
    is stripped before the lookup.

    Args:
        positions_str: Comma-separated position codes, e.g. ``"ST, CF"``.
            Missing values (``None`` / ``NaN``) are accepted.

    Returns:
        ``"GK"``, ``"DEF"``, ``"MID"`` or ``"ATT"`` for a recognised first
        position; ``"OTHER"`` for a missing value or an unrecognised code.
    """
    if pd.isna(positions_str):
        return "OTHER"

    # Everything before the first comma is the first listed position.
    primary, _, _ = positions_str.partition(",")
    return _POSITION_TO_GROUP.get(primary.strip(), "OTHER")

# Derive the target label: one position group per player, computed
# element-wise from the raw `player_positions` string.
players24["position_group"] = players24["player_positions"].map(map_position_group)


In [5]:
# Numeric attributes used as model inputs.
feature_cols = [
    # aggregate ratings
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    # body and age
    "height_cm",
    "weight_kg",
    "age",
    # movement
    "movement_acceleration",
    "movement_sprint_speed",
    "movement_agility",
    "movement_balance",
    # power
    "power_strength",
    "power_stamina",
]

# Drop every player with a missing value in any feature or in the label.
# NOTE(review): the classification reports further down list only ATT / DEF /
# MID, so no GK or OTHER rows survive this filter. Presumably those rows lack
# some of the ratings above; confirm this is intended.
data = players24.dropna(subset=[*feature_cols, "position_group"]).copy()

# Feature matrix and target vector.
X = data[feature_cols]
y = data["position_group"]


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of `y` in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=7213,
)

# Learn the scaling statistics from the training partition only, then apply
# that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)





In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline classifiers, keyed by the label printed above each report.
models = {
    "LogisticRegression": LogisticRegression(max_iter=100000),
    "RandomForest": RandomForestClassifier(random_state=6452),
    "KNN": KNeighborsClassifier(),
}

# Train each model on the scaled training data and report per-class
# precision / recall / F1 on the held-out test partition.
for name in models:
    clf = models[name]
    # fit() returns the fitted estimator, so training and prediction chain.
    y_pred = clf.fit(X_train_scaled, y_train).predict(X_test_scaled)

    print("====", name, "====")
    print(classification_report(y_test, y_pred))


==== LogisticRegression ====
              precision    recall  f1-score   support

         ATT       0.84      0.77      0.80       772
         DEF       0.87      0.88      0.87      1353
         MID       0.78      0.80      0.79      1461

    accuracy                           0.82      3586
   macro avg       0.83      0.82      0.82      3586
weighted avg       0.82      0.82      0.82      3586

==== RandomForest ====
              precision    recall  f1-score   support

         ATT       0.85      0.75      0.80       772
         DEF       0.88      0.89      0.88      1353
         MID       0.78      0.82      0.80      1461

    accuracy                           0.83      3586
   macro avg       0.84      0.82      0.83      3586
weighted avg       0.83      0.83      0.83      3586

==== KNN ====
              precision    recall  f1-score   support

         ATT       0.78      0.72      0.75       772
         DEF       0.85      0.86      0.85      1353
         

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 * 3 * 2 = 12 combinations in total.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

# Exhaustive search over the grid, ranked by macro-averaged F1.
# n_jobs=-1 asks for all available cores; the cross-validation scheme is
# left at the library default.
rf = RandomForestClassifier(random_state=221)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="f1_macro",
    n_jobs=-1,
)
grid.fit(X_train_scaled, y_train)

# Evaluate the best configuration found on the held-out test partition.
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test_scaled)

print("Best params:", grid.best_params_)
print(classification_report(y_test, y_pred))


Best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

         ATT       0.86      0.74      0.80       772
         DEF       0.89      0.88      0.88      1353
         MID       0.77      0.83      0.80      1461

    accuracy                           0.83      3586
   macro avg       0.84      0.82      0.83      3586
weighted avg       0.83      0.83      0.83      3586

