In [3]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers 
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest


In [4]:
# Carregar dataset Wine Quality
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [6]:
# Preencher valores nulos com média das colunas
df.fillna(df.mean(), inplace=True)

# Criar uma coluna derivada
df['total_phenols_and_flavonoids'] = df['total_phenols'] + df['flavanoids']

# Exibir as primeiras linhas do df
print(df.head())

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  target  total_phenol

In [8]:
# Padronizar os dados
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Remover duplicatas
df_scaled.drop_duplicates(inplace=True)

# Exibir as primeiras linhas do df padronizado
print(df_scaled.head())

    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  1.518613   -0.562250  0.232053          -1.169593   1.913905   
1  0.246290   -0.499413 -0.827996          -2.490847   0.018145   
2  0.196879    0.021231  1.109334          -0.268738   0.088358   
3  1.691550   -0.346811  0.487926          -0.809251   0.930918   
4  0.295700    0.227694  1.840403           0.451946   1.281985   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884   
1       0.568648    0.733629             -0.820719        -0.544721   
2       0.808997    1.215533             -0.498407         2.135968   
3       2.491446    1.466525             -0.981875         1.032155   
4       0.808997    0.663351              0.226796         0.401404   

   color_intensity       hue  od280/od315_of_diluted_wines   proline  \
0         0.251717  0.362177                      1.847920  1.013009   
1        -0.293321  0.4060

In [10]:
# Anonimizar a coluna de alvo
df_scaled['target'] = df_scaled['target'].astype('category').cat.codes

# Detectar anomalias
clf = IsolationForest(contamination=0.05)
df_scaled['anomaly'] = clf.fit_predict(df_scaled)

# Exibir as linhas detectadas com anomalias
print(df_scaled[df_scaled['anomaly'] == -1])

      alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
3    1.691550   -0.346811  0.487926          -0.809251   0.930918   
13   2.160950   -0.544297  0.085839          -2.430790  -0.613775   
59  -0.778980   -1.253450 -3.679162          -2.671018  -0.824415   
69  -0.976623   -1.029035 -2.253579          -0.809251   3.599025   
73  -0.013116   -0.598156  0.853460           3.154511   2.756465   
110 -1.903071    1.260006 -1.997705           0.001518   0.509638   
121 -1.779545   -0.257044  3.156325           2.704083   1.352198   
146  1.086270    2.426968 -0.499016           0.151661  -1.386122   
158  1.654492   -0.589180  1.218995           1.653086  -0.122282   

     total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
3         2.491446    1.466525             -0.981875         1.032155   
13        1.289697    1.667318              0.549108         2.135968   
59       -0.504914   -1.465058             -0.659563        -2.051513   
69       -0.71321

In [11]:
# Definir o gerador da GAN
def build_generator(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation='relu', input_dim=input_dim))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(input_dim, activation='sigmoid'))
    return model 

# Definir o discriminador da GAN
def build_discriminator(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(256, activation='relu', input_dim=input_dim))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model 

In [13]:
input_dim = 13

# Compilar e treinar a GAN
generator = build_generator(input_dim)
discriminator = build_discriminator(input_dim)
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

discriminator.trainable = False
gan_input = layers.Input(shape=(input_dim,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

In [14]:
# Função para treinar a GAN e gerar uma amostra de dados
def train_gan(generator, discriminator, gan, df_scaled, epochs=1000, batch_size=128, sample_size=10):
    for epoch in range(epochs):
        noise = np.random.normal(0, 1, (batch_size, input_dim))
        generated_data = generator.predict(noise)

        # Selecionar apenas as primeiras 13 colunas de df_scaled
        real_data = df_scaled.sample(batch_size).iloc[:, :input_dim].values

        # Garantir que real_data tenha a mesma dimensão que generated_data
        if real_data.shape[1] != input_dim:
            raise ValueError(f"Dimensão de real_data ({real_data.shape[1]}) não coincide com input_dim ({input_dim})")

        combined_data = np.concatenate([real_data, generated_data])
        labels = np.concatenate([np.ones((batch_size, 1)), np.zeros((batch_size, 1))])

        d_loss = discriminator.train_on_batch(combined_data, labels)
        noise = np.random.normal(0, 1, (batch_size, input_dim))
        misleading_targets = np.ones((batch_size, 1))
        a_loss = gan.train_on_batch(noise, misleading_targets)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Discriminator Loss: {d_loss}, Adversarial Loss: {a_loss}")

    # Gerar uma amostra final de dados
    noise = np.random.normal(0, 1, (sample_size, input_dim))
    generated_sample = generator.predict(noise)
    generated_df = pd.DataFrame(generated_sample, columns=df_scaled.columns[:input_dim])

    return generated_df


# Exemplo de uso

# Treinar a GAN e obter uma amostra dos dados gerados
generated_sample_df = train_gan(generator, discriminator, gan, df_scaled)

# Exibir a amostra gerada
print(generated_sample_df)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 




Epoch 0, Discriminator Loss: [array(0.7043004, dtype=float32), array(0.27734375, dtype=float32)], Adversarial Loss: 0.6670132875442505
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[