In [None]:
!pip install aequitas

In [None]:
# Importações de bibliotecas necessárias para o tratamento de dados
import math
import pandas as pd
import numpy as np

# Importações de bibliotecas para manipulação de dados e modelos
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle # Embaralhar os dados

# Normalização de dados
from sklearn.preprocessing import OneHotEncoder # Converte categorias em colunas binárias (0 ou 1)
from sklearn.preprocessing import LabelEncoder   # Converte categorias em números
from sklearn.preprocessing import StandardScaler # Normalização usando Z-score

# Importações para estatísticas e testes estatísticos
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import norm

# Métricas de avaliação de modelos
from sklearn.metrics import accuracy_score, classification_report,accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve


# Modelos de machine learning do scikit-learn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Importações de bibliotecas para redes neurais usando TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Reshape, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Bibliotecas para gráficos
import matplotlib.pyplot as plt
import seaborn as sns

from aequitas import Audit
from aequitas.plotting import Plot

# Estrutura padrão para modelos e análise
# Aqui, você pode incluir funções e classes de processamento de dados,
# criação de modelos e avaliação, mantendo o código modular e organizado.


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Locations of the Adult dataset files inside the runtime.
train_url = '/content/adult.data'
test_url = '/content/adult.test'
link_informacoes = '/content/adult.names'
link_index = '/content/Index'

# Column labels, in file order, applied when the CSV files are read.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [None]:
# Read the test split. header=0 combined with `names` discards the file's first
# line and applies our own labels (presumably that line is a non-data banner in
# adult.test -- confirm against the file).
#
# The separator is a regex (raw string, so `\s` is not an invalid Python
# escape) that swallows the whitespace character following each comma. A
# missing-value marker written as ", ?" therefore reaches the parser as "?"
# with no leading blank, so both spellings are listed to make sure it becomes
# NaN and can be removed by a later dropna().
test_amostra = pd.read_csv(
    test_url,
    header=0,
    names=column_names,
    na_values=["?", " ?"],
    sep=r',\s',
    engine='python',
)

# Fixed random_state so the 10,000-row subset is the same on every run.
test_data_adult = test_amostra.sample(n=10000, random_state=42)

test_data_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
13633,29,Private,189346,HS-grad,9,Never-married,Transport-moving,Unmarried,White,Male,0,0,40,United-States,<=50K.
1921,31,Private,137076,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K.
12140,52,Federal-gov,35546,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K.
9933,54,Local-gov,116428,10th,6,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K.
7745,41,Private,177054,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,White,Male,0,0,40,United-States,<=50K.


In [None]:
# Read the training split; the file has no header row, so `names` supplies the
# column labels.
#
# The separator is a regex (raw string, so `\s` is not an invalid Python
# escape) that swallows the whitespace character following each comma. A
# missing-value marker written as ", ?" therefore reaches the parser as "?"
# with no leading blank, so both spellings are listed to make sure it becomes
# NaN and can be removed by a later dropna().
train_amostra = pd.read_csv(
    train_url,
    header=None,
    names=column_names,
    na_values=["?", " ?"],
    sep=r',\s',
    engine='python',
)

# Fixed random_state so the 20,000-row subset is the same on every run.
train_data_adult = train_amostra.sample(n=20000, random_state=42)
train_data_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States,<=50K
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States,>50K
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
7827,29,Self-emp-not-inc,189346,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,2202,0,50,United-States,<=50K


In [None]:
# Remove rows that contain any NaN and binarise the target in one non-mutating
# chain: dropna() and assign() both return new frames, so the sampled frames
# are never modified in place.
# The positive label is '>50K' in the training file and '>50K.' (trailing
# period) in the test file; every other value maps to 0.
# NOTE: this cell is not re-runnable on its own output -- once 'income' holds
# 0/1, running it again would map every row to 0.
train_data_adult = train_data_adult.dropna().assign(
    income=lambda frame: frame['income'].eq('>50K').astype(int)
)
test_data_adult = test_data_adult.dropna().assign(
    income=lambda frame: frame['income'].eq('>50K.').astype(int)
)

# One-hot encode the categorical columns. The encoder is fitted on the training
# sample only and then reused for the test sample.
# handle_unknown='ignore' keeps transform() from raising when the test sample
# contains a category that the (randomly sampled) training set never saw; such
# a value is encoded as all zeros for that feature, which with drop='first' is
# the same encoding as the dropped reference category.
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_train_features = encoder.fit_transform(train_data_adult[categorical_cols])
encoded_test_features = encoder.transform(test_data_adult[categorical_cols])


In [None]:
# Numeric columns that are carried over unchanged next to the one-hot block.
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Side-by-side concatenation: encoded categoricals first, numeric columns
# second. The numeric slice gets a fresh 0..n-1 index so that it lines up with
# the positional index of the DataFrame built from the encoded array.
train_blocks = [
    pd.DataFrame(encoded_train_features),
    train_data_adult[numeric_cols].reset_index(drop=True),
]
test_blocks = [
    pd.DataFrame(encoded_test_features),
    test_data_adult[numeric_cols].reset_index(drop=True),
]
X_train = pd.concat(train_blocks, axis=1)
X_test = pd.concat(test_blocks, axis=1)

# The encoded block has integer column labels and the numeric block has string
# labels; make every label a string.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Binary target vectors.
y_train = train_data_adult['income']
y_test = test_data_adult['income']

# Standardise using statistics learned from the training matrix only. Note that
# the scaler is applied to every column, the one-hot columns included, and that
# X_train / X_test are the scaler's array output from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

# Tuned hyper-parameters shared by the model builders below.
best_params = {
    'learning_rate': 0.0002,
    'generator_units': 32,
    'discriminator_units': 128,
}

# Global sizes: length of the generator's noise vector, and the number of
# feature columns in the real data (width of X_train).
z_dim = 100
data_dim = X_train.shape[1]

# Discriminator loss
def loss_discriminador(y_true, y_pred):
    """Binary cross-entropy with one-sided label smoothing.

    Every target equal to 1 is replaced by 0.9 before the loss is computed;
    all other targets are passed through unchanged.

    Args:
        y_true: target labels.
        y_pred: discriminator outputs.

    Returns:
        The result of `tf.keras.losses.binary_crossentropy` on the smoothed
        targets and `y_pred`.
    """
    is_real = tf.equal(y_true, 1)
    smoothed_targets = tf.where(is_real, 0.9, y_true)
    return tf.keras.losses.binary_crossentropy(smoothed_targets, y_pred)

# Gradient penalty
def penalizacao_gradiente(discriminador, real_samples, fake_samples):
    """Penalise the discriminator's input-gradient norm for deviating from 1.

    The gradient of the discriminator output with respect to `real_samples`
    is taken, its L2 norm is computed along axis 1, and the mean of
    (norm - 1)^2 is returned.

    Args:
        discriminador: callable model, invoked with `training=True`.
        real_samples: tensor of real inputs; it is explicitly watched by the
            tape, so it must be a tensor rather than a NumPy array.
        fake_samples: accepted for interface compatibility but not used;
            the penalty is evaluated on the real samples only.

    Returns:
        Scalar tensor holding the mean squared deviation of the gradient
        norm from 1.
    """
    with tf.GradientTape() as input_tape:
        input_tape.watch(real_samples)
        scores = discriminador(real_samples, training=True)
    input_grads = input_tape.gradient(scores, real_samples)
    squared_sum = tf.reduce_sum(tf.square(input_grads), axis=1)
    deviation = tf.sqrt(squared_sum) - 1.0
    return tf.reduce_mean(tf.square(deviation))

# Generator factory
def build_generator(generator_units):
    """Build the generator network (uncompiled).

    Architecture: Input(z_dim) -> Dense(generator_units) -> LeakyReLU(0.2)
    -> Dense(data_dim, tanh). `z_dim` and `data_dim` are read from module
    scope.

    The input shape is declared with an explicit `Input` object rather than
    the `input_dim=` keyword on the first Dense layer, which current Keras
    flags as deprecated for Sequential models.

    Args:
        generator_units: width of the hidden Dense layer.

    Returns:
        A `Sequential` model mapping a (batch, z_dim) noise tensor to a
        (batch, data_dim) output in the tanh range [-1, 1].
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=(z_dim,)))
    model.add(Dense(generator_units))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(Dense(data_dim, activation='tanh'))
    return model

# Discriminator factory
def build_discriminator(discriminator_units):
    """Build and compile the discriminator network.

    Architecture: Input(data_dim) -> Dense(discriminator_units)
    -> BatchNormalization -> LeakyReLU(0.2) -> Dropout(0.3)
    -> Dense(1, sigmoid). `data_dim` and `best_params` are read from module
    scope.

    The input shape is declared with an explicit `Input` object rather than
    the `input_dim=` keyword on the first Dense layer, which current Keras
    flags as deprecated for Sequential models.

    Args:
        discriminator_units: width of the hidden Dense layer.

    Returns:
        A `Sequential` model compiled with `loss_discriminador`, an Adam
        optimizer at `best_params['learning_rate']`, and an accuracy metric.
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=(data_dim,)))
    model.add(Dense(discriminator_units))
    model.add(BatchNormalization())
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=best_params['learning_rate'])
    model.compile(loss=loss_discriminador, optimizer=optimizer, metrics=['accuracy'])
    return model

# Combined GAN model
def build_gan(generator, discriminator):
    """Stack generator and discriminator into one compiled model.

    Side effect: `discriminator.trainable` is set to False on the object that
    was passed in, and it stays that way after this function returns.

    Args:
        generator: model producing synthetic samples from noise.
        discriminator: model scoring samples; frozen by this call.

    Returns:
        A `Sequential` model (generator followed by discriminator) compiled
        with binary cross-entropy and an Adam optimizer at
        `best_params['learning_rate']` (read from module scope).
    """
    discriminator.trainable = False
    combined = Sequential()
    for stage in (generator, discriminator):
        combined.add(stage)
    gan_optimizer = Adam(learning_rate=best_params['learning_rate'])
    combined.compile(optimizer=gan_optimizer, loss='binary_crossentropy')
    return combined

# GAN training loop
def train_gan(generator, discriminator, gan, epochs=2000, batch_size=128, lambda_gp=10):
    """Alternate discriminator and generator updates for `epochs` iterations.

    Each iteration:
      1. draws `batch_size` random rows (with replacement) from the
         module-level `X_train` and an equally sized batch of N(0, 1) noise of
         width `z_dim` (module-level);
      2. trains the discriminator on the real batch (label 1) and on the
         generated batch (label 0) via `train_on_batch`;
      3. takes one extra discriminator optimizer step on
         `lambda_gp * penalizacao_gradiente(...)`;
      4. trains the generator through `gan` with label 1 for the noise batch.

    Progress is printed every 20 iterations.

    Args:
        generator: model mapping noise to synthetic samples.
        discriminator: compiled model returning [loss, accuracy] from
            `train_on_batch`.
        gan: compiled stacked model (generator followed by discriminator).
        epochs: number of training iterations (one batch each).
        batch_size: rows per batch.
        lambda_gp: weight of the gradient-penalty term.

    Returns:
        None.
    """
    # Label arrays never change, so build them once.
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    for epoch in range(epochs):
        # Real and synthetic batches
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_data = tf.convert_to_tensor(X_train[idx], dtype=tf.float32)
        noise = tf.convert_to_tensor(
            np.random.normal(0, 1, (batch_size, z_dim)), dtype=tf.float32
        )
        # Call the model directly instead of predict(): predict() is meant for
        # large batched inference and prints a progress bar on every call,
        # which floods the cell output when used once per training step.
        fake_data = generator(noise, training=False)

        # ---- Discriminator update ----
        # build_gan() leaves discriminator.trainable = False. Keras versions
        # that resolve trainable weights when the train step first runs (not
        # at compile time) would then update nothing here, so re-enable
        # training for the discriminator's own steps.
        discriminator.trainable = True
        # train_on_batch returns metric values, not differentiable tensors, so
        # these calls do not belong inside a GradientTape.
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * (d_loss_real[0] + d_loss_fake[0])

        # Only the gradient penalty carries gradient information; the tape
        # must wrap the penalty computation so its inner gradient is recorded.
        with tf.GradientTape() as tape:
            grad_penalty = penalizacao_gradiente(discriminator, real_data, fake_data)
            penalty_term = lambda_gp * grad_penalty
        disc_vars = discriminator.trainable_variables
        grads = tape.gradient(penalty_term, disc_vars)
        # Skip variables the penalty does not depend on (gradient is None).
        grads_and_vars = [(g, v) for g, v in zip(grads, disc_vars) if g is not None]
        if grads_and_vars:
            discriminator.optimizer.apply_gradients(grads_and_vars)

        # Reported discriminator loss = classification loss + weighted penalty.
        d_loss_total = float(d_loss) + float(penalty_term)

        # ---- Generator update ----
        # Freeze the discriminator again so only the generator learns here.
        discriminator.trainable = False
        g_loss = gan.train_on_batch(noise, real_labels)

        # Report losses and accuracy every 20 iterations
        if epoch % 20 == 0:
            d_acc_real = d_loss_real[1]  # discriminator accuracy on real data
            d_acc_fake = d_loss_fake[1]  # discriminator accuracy on fake data
            d_acc = 0.5 * (d_acc_real + d_acc_fake)
            # gan.train_on_batch may return a scalar or a list of values;
            # the first entry is the loss.
            g_loss_value = float(np.ravel(g_loss)[0])
            print(f"Epoch {epoch}: [Discriminator loss: {d_loss_total:.4f} | Accuracy: {100*d_acc:.2f}%] [Generator loss: {g_loss_value:.4f}]")

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# batch sampling and noise draws repeat on every Restart & Run All. It must run
# before the models are built, because layer weights are drawn at build time.
tf.keras.utils.set_random_seed(42)

# Build the three models from the tuned hyper-parameters.
generator = build_generator(best_params['generator_units'])
discriminator = build_discriminator(best_params['discriminator_units'])
gan = build_gan(generator, discriminator)

# Train the GAN.
train_gan(generator, discriminator, gan, epochs=1000, batch_size=128)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 0: [Discriminator loss: 6.3813 | Accuracy: 64.84%] [Generator loss: [array(0.68566287, dtype=float32), array(0.68566287, dtype=float32), array(0.59375, dtype=float32)]]
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━