# Atividade Prática VII

O Sonar Dataset (UCI) é um conjunto de dados amplamente utilizado para avaliar algoritmos de classificação supervisionada. Ele contém 208 amostras, cada uma composta por 60 atributos numéricos contínuos que representam a intensidade do eco de um pulso de sonar refletido por um objeto submerso, classificado como mina metálica (M) ou rocha natural (R). Trata-se de um problema binário, não linear e de alta dimensionalidade, que demanda técnicas de normalização e regularização devido ao número reduzido de exemplos e à semelhança entre as classes. Nesse contexto, a atividade proposta consiste em testar os algoritmos Multilayer Perceptron (MLP) e Support Vector Machine (SVM) para classificar os dados dessa base, analisando o desempenho e o comportamento de cada modelo diante do desafio de separabilidade dos padrões.

* Bibliotecas importadas:

In [1]:
import os
import urllib.request
import zipfile
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")
RANDOM_STATE = 42

* Download do arquivo da base de dados

In [2]:
# Local folder that holds the downloaded archive and the extracted files.
DATA_DIR = Path("data")
# Archive published by the UCI repository for the Sonar (mines vs. rocks) data.
ZIP_URL = "https://archive.ics.uci.edu/static/public/151/connectionist+bench+sonar+mines+vs+rocks.zip"
# Where the archive is stored locally.
ZIP_PATH = DATA_DIR.joinpath("sonar.zip")
# Name of the archive member that contains every sample.
CSV_FILENAME = "sonar.all-data"

In [3]:
# Download the dataset archive once; later runs reuse the local copy.
DATA_DIR.mkdir(parents=True, exist_ok=True)

if ZIP_PATH.exists():
    print("Arquivo zip já existe em:", ZIP_PATH)
else:
    try:
        urllib.request.urlretrieve(ZIP_URL, ZIP_PATH)
    except Exception as e:
        # Best-effort download at the notebook boundary: report and carry on.
        # Remove any partially written archive first, otherwise the next run
        # would treat the truncated file as a finished download.
        ZIP_PATH.unlink(missing_ok=True)
        print("Falha ao baixar automaticamente. Erro:", e)
        print(ZIP_URL)
    else:
        print("Download concluído:", ZIP_PATH)

Download concluído: data/sonar.zip


In [4]:
# List the archive contents and extract the dataset file into DATA_DIR.
if not ZIP_PATH.exists():
    print("Zip não encontrado. Rode a célula de download ou baixe manualmente.")
else:
    with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
        member_names = zf.namelist()

        print("Conteúdo do zip:")
        for member in member_names:
            print(" -", member)

        # First member whose name mentions "sonar" and is either the
        # "all-data" file or a .csv file (case-insensitive); None if absent.
        target = next(
            (
                member
                for member in member_names
                if "sonar" in member.lower()
                and ("all-data" in member.lower() or member.lower().endswith(".csv"))
            ),
            None,
        )

        if target is not None:
            print(f"Extraindo {target} para {DATA_DIR}")
            zf.extract(member=target, path=DATA_DIR)
        else:
            print("Arquivo alvo não identificado automaticamente. Extraindo todos os arquivos.")
            zf.extractall(DATA_DIR)

Conteúdo do zip:
 - sonar.all-data
 - sonar.mines
 - sonar.rocks
 - Index
 - sonar.names
Extraindo sonar.all-data para data


In [5]:
# Locate the extracted raw dataset and load it. The raw file has no header
# row, so it is read with header=None.
#
# Prefer the exact raw file name: the broader patterns below would otherwise
# also match the CSV this notebook writes later (which HAS a header row) and
# reading that with header=None turns the header into a bogus data row.
possible_files = sorted(p for p in DATA_DIR.rglob(CSV_FILENAME) if p.is_file())

if not possible_files:
    derived_csv_name = "sonar_all_data.csv"  # notebook's own export (with header)
    patterns = ("**/*sonar*.data", "**/*sonar*.csv", "**/*all-data*")
    candidates = [p for pattern in patterns for p in sorted(DATA_DIR.glob(pattern))]
    # dict.fromkeys drops duplicates while keeping the pattern priority order.
    possible_files = [
        p for p in dict.fromkeys(candidates)
        if p.is_file() and p.name != derived_csv_name
    ]

if not possible_files:
    print("Arquivos na pasta data:", list(DATA_DIR.iterdir()))
else:
    csv_path = possible_files[0]
    print("Usando arquivo:", csv_path)
    df = pd.read_csv(csv_path, header=None)
    print("Shape:", df.shape)
    display(df.head())

Usando arquivo: data/sonar.all-data
Shape: (208, 61)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [6]:

# Name the columns: every column but the last is a feature (f0, f1, ...),
# the last one is the class label.
n_cols = df.shape[1]
feature_cols = [f"f{i}" for i in range(n_cols - 1)]
cols = feature_cols + ["label"]
df.columns = cols

# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() only rendered a stray "None".
df.info()
display(df.describe().T)

print("Distribuição das classes (M = mina, R = rocha):")
print(df['label'].value_counts())
print("\nProporção:")
print(df['label'].value_counts(normalize=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f0      208 non-null    float64
 1   f1      208 non-null    float64
 2   f2      208 non-null    float64
 3   f3      208 non-null    float64
 4   f4      208 non-null    float64
 5   f5      208 non-null    float64
 6   f6      208 non-null    float64
 7   f7      208 non-null    float64
 8   f8      208 non-null    float64
 9   f9      208 non-null    float64
 10  f10     208 non-null    float64
 11  f11     208 non-null    float64
 12  f12     208 non-null    float64
 13  f13     208 non-null    float64
 14  f14     208 non-null    float64
 15  f15     208 non-null    float64
 16  f16     208 non-null    float64
 17  f17     208 non-null    float64
 18  f18     208 non-null    float64
 19  f19     208 non-null    float64
 20  f20     208 non-null    float64
 21  f21     208 non-null    float64
 22  f2

None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
f0,208.0,0.029164,0.022991,0.0015,0.01335,0.0228,0.03555,0.1371
f1,208.0,0.038437,0.03296,0.0006,0.01645,0.0308,0.04795,0.2339
f2,208.0,0.043832,0.038428,0.0015,0.01895,0.0343,0.05795,0.3059
f3,208.0,0.053892,0.046528,0.0058,0.024375,0.04405,0.0645,0.4264
f4,208.0,0.075202,0.055552,0.0067,0.03805,0.0625,0.100275,0.401
f5,208.0,0.10457,0.059105,0.0102,0.067025,0.09215,0.134125,0.3823
f6,208.0,0.121747,0.061788,0.0033,0.0809,0.10695,0.154,0.3729
f7,208.0,0.134799,0.085152,0.0055,0.080425,0.1121,0.1696,0.459
f8,208.0,0.178003,0.118387,0.0075,0.097025,0.15225,0.233425,0.6828
f9,208.0,0.208259,0.134416,0.0113,0.111275,0.1824,0.2687,0.7106


Distribuição das classes (M = mina, R = rocha):
label
M    111
R     97
Name: count, dtype: int64

Proporção:
label
M    0.533654
R    0.466346
Name: proportion, dtype: float64


In [7]:
# Persist the DataFrame with its named columns (header row, no index column).
OUT_CSV = DATA_DIR.joinpath("sonar_all_data.csv")
df.to_csv(path_or_buf=OUT_CSV, index=False)
print("Salvo em:", OUT_CSV)


Salvo em: data/sonar_all_data.csv


In [8]:

from sklearn.preprocessing import LabelEncoder

# Reload the dataset from disk and build the feature matrix / label vector.
DATA_DIR = Path("data")
CSV_PATH = DATA_DIR / "sonar_all_data.csv"

if CSV_PATH.exists():
    # CSV with a header row (named feature columns plus "label").
    df = pd.read_csv(CSV_PATH)
else:
    # Fall back to the raw extracted file. Picking the first "*sonar*.*" match
    # could select the zip archive or another non-data file, and the raw file
    # has no header row, so it is located explicitly and read with header=None.
    raw_files = sorted(p for p in DATA_DIR.glob("**/*all-data*") if p.is_file())
    if not raw_files:
        raise FileNotFoundError(f"Não encontrou o CSV em {CSV_PATH}. Coloque o arquivo extraído em data/ ou ajuste o caminho.")
    CSV_PATH = raw_files[0]
    df = pd.read_csv(CSV_PATH, header=None)
    df.columns = [f"f{i}" for i in range(df.shape[1] - 1)] + ["label"]

print("Shape:", df.shape)
display(df.head())

if 'label' in df.columns:
    X = df.drop(columns=['label']).values
    y = df['label'].values
else:
    # No named label column: assume the last column holds the class.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

# Encode the string labels as integers (optional: many estimators accept
# strings). LabelEncoder assigns codes in sorted class order.
le = LabelEncoder()
y_enc = le.fit_transform(y)
print("Classes:", le.classes_)


Shape: (208, 61)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f51,f52,f53,f54,f55,f56,f57,f58,f59,label
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


Classes: ['M' 'R']


* Definições de modelos e grades de hiperparâmetros

In [14]:

# Multilayer perceptron: ReLU activation and Adam solver are fixed; the grid
# varies the hidden-layer layout, the L2 penalty and the initial step size.
mlp = MLPClassifier(
    activation='relu',
    solver='adam',
    max_iter=3000,
    random_state=RANDOM_STATE,
)
mlp_param_grid = dict(
    hidden_layer_sizes=[(60,), (60, 30), (30, 30), (100,), (50, 25)],  # units per hidden layer
    alpha=[1e-4, 1e-3, 1e-2],  # L2 regularization strength
    learning_rate_init=[1e-3, 1e-4],
)

# Support vector machine with an RBF kernel; only the penalty C is searched.
svm = SVC(
    kernel='rbf',
    gamma='scale',
    probability=False,
    random_state=RANDOM_STATE,
)
svm_param_grid = dict(C=[0.1, 1.0, 10.0, 100.0])

# Model name -> (estimator, hyper-parameter grid); iterated in insertion order.
models_and_grids = dict(
    MLP=(mlp, mlp_param_grid),
    SVM=(svm, svm_param_grid),
)


* Função para executar experimentos por cenário (com/sem PCA)

In [15]:

def run_experiments(X, y, models_and_grids, use_pca=False, n_components=10, n_splits=5):
    """Evaluate every model with an outer stratified K-fold and an inner grid search.

    For each outer fold the features are standardized (and optionally reduced
    with PCA) using statistics fitted on the training split only. Each model
    is then tuned with ``GridSearchCV`` (3-fold, ``f1_macro``) on the training
    split and scored on the held-out split.

    Parameters
    ----------
    X : array
        Feature matrix; it is indexed with integer index arrays, so it must
        support NumPy-style row indexing.
    y : array
        Class labels, indexed the same way as ``X``.
    models_and_grids : dict
        Maps a model name to an ``(estimator, param_grid)`` pair.
    use_pca : bool
        When true, apply PCA after scaling.
    n_components : int or None
        Passed to ``PCA``; only used when ``use_pca`` is true.
    n_splits : int
        Number of outer folds.

    Returns
    -------
    pandas.DataFrame
        One row per (fold, model) with the selected hyper-parameters,
        accuracy, macro precision/recall/F1 and the confusion matrix.
        No aggregation is performed here.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    # Fix the label order so every fold yields a confusion matrix of the same
    # shape, even if a class were missing from one test split's y/y_pred.
    class_labels = np.unique(y)
    results = []  # one entry per (model, fold)

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Always scale with parameters estimated on the training split.
        scaler = StandardScaler().fit(X_train)
        X_train_proc = scaler.transform(X_train)
        X_test_proc = scaler.transform(X_test)

        # Optional PCA, fitted on the training split only.
        if use_pca:
            pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
            X_train_proc = pca.fit_transform(X_train_proc)
            X_test_proc = pca.transform(X_test_proc)

        # NOTE(review): scaler/PCA are fitted on the whole outer-training split
        # before the inner grid search, so the inner validation folds have
        # contributed to the preprocessing statistics. The outer test scores
        # are unaffected; wrapping preprocessing + estimator in a Pipeline
        # would remove this effect from the hyper-parameter selection.
        for model_name, (estimator, param_grid) in models_and_grids.items():
            # Inner CV (cv=3) selects hyper-parameters by macro F1.
            gs = GridSearchCV(estimator=estimator, param_grid=param_grid,
                              scoring='f1_macro', cv=3, n_jobs=-1, verbose=0)
            gs.fit(X_train_proc, y_train)
            best = gs.best_estimator_

            # The best estimator was already fitted by the grid search on the
            # training split; predict on the held-out split.
            y_pred = best.predict(X_test_proc)

            # Metrics on the held-out split.
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
            rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
            cm = confusion_matrix(y_test, y_pred, labels=class_labels)

            results.append({
                'scenario_pca': use_pca,
                'n_components': n_components if use_pca else None,
                'fold': fold_idx,
                'model': model_name,
                'best_params': gs.best_params_,
                'accuracy': acc,
                'precision_macro': prec,
                'recall_macro': rec,
                'f1_macro': f1,
                'confusion_matrix': cm
            })

            print(f"Fold {fold_idx} | {'PCA' if use_pca else 'NoPCA'} | {model_name} | best: {gs.best_params_} | f1_macro: {f1:.4f}")

    return pd.DataFrame(results)


Rodar ambos os cenários (sem PCA / com PCA)

In [16]:

# Experiment settings shared by both scenarios.
N_COMPONENTS = 10
N_SPLITS = 5

# Scenario 1: scaled features only.
print("Rodando cenário: sem PCA")
res_no_pca = run_experiments(
    X,
    y_enc,
    models_and_grids,
    use_pca=False,
    n_components=None,
    n_splits=N_SPLITS,
)

# Scenario 2: scaled features reduced with PCA.
print(f"\nRodando cenário: com PCA (n_components={N_COMPONENTS})")
res_pca = run_experiments(
    X,
    y_enc,
    models_and_grids,
    use_pca=True,
    n_components=N_COMPONENTS,
    n_splits=N_SPLITS,
)


Rodando cenário: sem PCA
Fold 1 | NoPCA | MLP | best: {'alpha': 0.0001, 'hidden_layer_sizes': (60,), 'learning_rate_init': 0.0001} | f1_macro: 0.8078
Fold 1 | NoPCA | SVM | best: {'C': 1.0} | f1_macro: 0.8558
Fold 2 | NoPCA | MLP | best: {'alpha': 0.01, 'hidden_layer_sizes': (30, 30), 'learning_rate_init': 0.0001} | f1_macro: 0.8568
Fold 2 | NoPCA | SVM | best: {'C': 10.0} | f1_macro: 0.8803
Fold 3 | NoPCA | MLP | best: {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.0001} | f1_macro: 0.8332
Fold 3 | NoPCA | SVM | best: {'C': 10.0} | f1_macro: 0.8568
Fold 4 | NoPCA | MLP | best: {'alpha': 0.0001, 'hidden_layer_sizes': (50, 25), 'learning_rate_init': 0.0001} | f1_macro: 0.8995
Fold 4 | NoPCA | SVM | best: {'C': 10.0} | f1_macro: 0.9261
Fold 5 | NoPCA | MLP | best: {'alpha': 0.0001, 'hidden_layer_sizes': (60,), 'learning_rate_init': 0.001} | f1_macro: 0.8276
Fold 5 | NoPCA | SVM | best: {'C': 10.0} | f1_macro: 0.8529

Rodando cenário: com PCA (n_components=10)
Fol

  Agregar resultados e mostrar resumo (médias e desvios)

In [17]:

# Stack the per-fold rows of both scenarios into a single table.
all_res = pd.concat([res_no_pca, res_pca], ignore_index=True)

# Build the named-aggregation spec: fold count, then mean/std of each metric.
metric_columns = {
    'acc': 'accuracy',
    'prec': 'precision_macro',
    'rec': 'recall_macro',
    'f1': 'f1_macro',
}
agg_spec = {'folds': ('fold', 'count')}
for short_name, column in metric_columns.items():
    agg_spec[f'{short_name}_mean'] = (column, 'mean')
    agg_spec[f'{short_name}_std'] = (column, 'std')

grouped = all_res.groupby(['scenario_pca', 'model'])
summary = grouped.agg(**agg_spec).reset_index()

# Human-readable scenario label derived from the boolean PCA flag.
scenario_names = {False: 'No PCA', True: f'PCA ({N_COMPONENTS})'}
summary['scenario'] = summary['scenario_pca'].map(scenario_names)

display(summary[['scenario', 'model', *agg_spec]])


Unnamed: 0,scenario,model,folds,acc_mean,acc_std,prec_mean,prec_std,rec_mean,rec_std,f1_mean,f1_std
0,No PCA,MLP,5,0.846341,0.035637,0.851694,0.042943,0.84472,0.033358,0.844995,0.03513
1,No PCA,SVM,5,0.875145,0.030882,0.876404,0.03135,0.874403,0.030017,0.874398,0.030936
2,PCA (10),MLP,5,0.836818,0.070765,0.840875,0.071789,0.835827,0.070263,0.835425,0.07162
3,PCA (10),SVM,5,0.870383,0.055119,0.870972,0.055502,0.871293,0.055261,0.870027,0.055186


Salvar resultados e imprimir as matrizes de confusão acumuladas (soma dos folds)

In [18]:

# Write the detailed per-fold table and the aggregated summary to disk.
OUT_DIR = Path("results")
OUT_DIR.mkdir(exist_ok=True)

all_res.to_pickle(OUT_DIR / "detailed_results.pkl")
summary.to_csv(OUT_DIR / "summary_results.csv", index=False)
print("Resultados salvos em:", OUT_DIR)

# Element-wise sum of the per-fold confusion matrices for each
# (scenario, model) pair.
cm_summary = {}
for key, fold_rows in all_res.groupby(['scenario_pca', 'model']):
    scen, model = key
    stacked = np.asarray(fold_rows['confusion_matrix'].tolist())
    cm_sum = stacked.sum(axis=0)
    cm_summary[(scen, model)] = cm_sum
    scenario_tag = 'PCA' if scen else 'NoPCA'
    print(f"\nMatriz de confusão - {scenario_tag} - {model} (soma dos folds):\n", cm_sum)


Resultados salvos em: results

Matriz de confusão - NoPCA - MLP (soma dos folds):
 [[97 14]
 [18 79]]

Matriz de confusão - NoPCA - SVM (soma dos folds):
 [[99 12]
 [14 83]]

Matriz de confusão - PCA - MLP (soma dos folds):
 [[95 16]
 [18 79]]

Matriz de confusão - PCA - SVM (soma dos folds):
 [[96 15]
 [12 85]]
