In [19]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Column labels for the Adult dataset; the file is read with header=None,
# so these names are supplied explicitly.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Load the dataset directly from the URL below.
# If you downloaded the file, read the local copy instead:
# df = pd.read_csv('adult.data', header=None, names=column_names, na_values=' ?')
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(
    url,
    header=None,
    names=column_names,
    na_values=' ?',  # the marker includes the leading space; it is read as NaN
)


In [20]:
# Standardize the categorical (text) fields

# Strip surrounding whitespace and apply consistent Title Case in a single
# pass over the text columns. The vectorized .str accessors replace
# DataFrame.applymap, which recent pandas versions deprecate (in favour of
# DataFrame.map) and which triggered a warning when this cell was run.
# Rebind df to a copy so the frame produced by the loading cell is not mutated.
# NOTE(review): assumes the object columns hold only strings and NaN, as
# read_csv produces here; .str methods leave NaN entries as NaN.
df = df.copy()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.strip().str.title()

# Missing values were already turned into NaN at load time (na_values=' ?'),
# so this only reports the per-column NaN counts.
df.isna().sum()


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,0
age,0
workclass,1836
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,1843
relationship,0
race,0
sex,0


In [21]:
# Normalize category labels so hyphenated and spaced spellings collapse into
# one form, e.g. "United-States" vs "United States".
# NOTE(review): 'education' and 'sex' are also text columns but are not in
# this list — confirm whether leaving them out is intentional.
hyphenated_cols = [
    'native-country', 'workclass', 'marital-status', 'relationship',
    'race', 'income', 'occupation',
]
for col in hyphenated_cols:
    df[col] = df[col].str.replace('-', ' ')


In [22]:
# Isolate the numeric columns that will be fed to the scalers.
# The .copy() gives an independent frame, so later work on df_num cannot
# write back into df.
df_num = df.select_dtypes(include=['int64', 'float64']).copy()
numerical_cols = df_num.columns

df_num.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [23]:
# Apply Min-Max scaling (MinMaxScaler with its default settings)

min_max_scaler = MinMaxScaler()
minmax_values = min_max_scaler.fit_transform(df_num)
# Rebuild a labelled frame from the scaler output. Passing index=df_num.index
# keeps the scaled rows aligned with df_num even if df_num does not carry a
# default 0..n-1 index (e.g. after rows were filtered or reordered upstream).
df_minmax = pd.DataFrame(minmax_values, columns=numerical_cols, index=df_num.index)

df_minmax.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [24]:
# Apply Z-score standardization (StandardScaler with its default settings)

z_scaler = StandardScaler()
zscore_values = z_scaler.fit_transform(df_num)
# Rebuild a labelled frame from the scaler output. Passing index=df_num.index
# keeps the scaled rows aligned with df_num even if df_num does not carry a
# default 0..n-1 index (e.g. after rows were filtered or reordered upstream).
df_zscore = pd.DataFrame(zscore_values, columns=numerical_cols, index=df_num.index)

df_zscore.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [25]:
# Apply Robust scaling (RobustScaler with its default settings)
#
# NOTE(review): in the recorded output, capital-gain and capital-loss come
# back unchanged (same values, mean and std as the original columns). That is
# consistent with both the median and the IQR of those columns being 0, in
# which case RobustScaler leaves the feature unscaled — confirm, and consider
# a wider quantile_range if these columns are meant to be rescaled.

robust_scaler = RobustScaler()
robust_values = robust_scaler.fit_transform(df_num)
# Rebuild a labelled frame from the scaler output. Passing index=df_num.index
# keeps the scaled rows aligned with df_num even if df_num does not carry a
# default 0..n-1 index (e.g. after rows were filtered or reordered upstream).
df_robust = pd.DataFrame(robust_values, columns=numerical_cols, index=df_num.index)

df_robust.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.1,-0.845803,1.0,2174.0,0.0,0.0
1,0.65,-0.797197,1.0,0.0,0.0,-5.4
2,0.05,0.312773,-0.333333,0.0,0.0,0.0
3,0.8,0.472766,-1.0,0.0,0.0,0.0
4,-0.45,1.342456,1.0,0.0,0.0,0.0


In [26]:
# Compare the techniques: per-column mean of the original data and of each
# scaled version, side by side.

frames_for_mean = {
    'Original': df_num,
    'MinMax': df_minmax,
    'ZScore': df_zscore,
    'Robust': df_robust,
}
comparison = pd.DataFrame({
    f'{label}_mean': frame.mean()
    for label, frame in frames_for_mean.items()
})

comparison


Unnamed: 0,Original_mean,MinMax_mean,ZScore_mean,Robust_mean
age,38.581647,0.295639,-2.705915e-17,0.079082
fnlwgt,189778.366512,0.120545,-1.001625e-16,0.095806
education-num,10.080679,0.605379,1.471887e-16,0.026893
capital-gain,1077.648844,0.010777,1.309314e-17,1077.648844
capital-loss,87.30383,0.020042,1.0169e-16,87.30383
hours-per-week,40.437456,0.402423,-1.5493550000000002e-17,0.087491


In [27]:
# Show the standard deviations as well, one column per technique.
# NOTE(review): ZScore_std prints as 1.000015 rather than exactly 1; this is
# presumably because DataFrame.std() defaults to the sample estimate (ddof=1)
# while StandardScaler normalizes with the population estimate — confirm.

frames_for_std = {
    'Original': df_num,
    'MinMax': df_minmax,
    'ZScore': df_zscore,
    'Robust': df_robust,
}
comparison_std = pd.DataFrame({
    f'{label}_std': frame.std()
    for label, frame in frames_for_std.items()
})

comparison_std


Unnamed: 0,Original_std,MinMax_std,ZScore_std,Robust_std
age,13.640433,0.186855,1.000015,0.682022
fnlwgt,105549.977697,0.071685,1.000015,0.885308
education-num,2.57272,0.171515,1.000015,0.857573
capital-gain,7385.292085,0.073854,1.000015,7385.292085
capital-loss,402.960219,0.092507,1.000015,402.960219
hours-per-week,12.347429,0.125994,1.000015,2.469486
