In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ForestDiffusion import ForestDiffusionModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# Step 1: Load and preprocess data
# The last column of the CSV is taken as the target; every column before it
# is a feature.
data = pd.read_csv('oil_dataset.csv')  # Replace with your dataset path
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Record which feature columns hold integers so that synthetic values can be
# rounded back to integers after generation.
integer_features = X.select_dtypes(include=['int'])
integer_columns = integer_features.columns.tolist()

# Keep only the rows whose target equals 1 (the class being oversampled).
X_minority = X.loc[y == 1]

# Standardize data
# The scaler is fitted on the minority rows only and is reused later to map
# decoded samples back to the original feature scale.
scaler = StandardScaler()
X_minority_scaled = scaler.fit(X_minority).transform(X_minority)

# Step 2: Build Autoencoder with adjusted layers and learning rate
input_dim = X_minority.shape[1]
latent_dim = 3

inputs = Input(shape=(input_dim,))

# Encoder: two Dense+BatchNormalization stages followed by the bottleneck.
encoded = inputs
for units in (128, 64):
    encoded = Dense(units, activation='relu')(encoded)
    encoded = BatchNormalization()(encoded)
latent = Dense(latent_dim, activation='relu')(encoded)

# Decoder: two Dense stages and a linear output layer of width input_dim.
decoded = latent
for units in (64, 128):
    decoded = Dense(units, activation='relu')(decoded)
outputs = Dense(input_dim)(decoded)

autoencoder = Model(inputs, outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
autoencoder.compile(optimizer=optimizer, loss='mse')

# Early stopping to prevent overfitting
# Monitors the training loss (no validation split is used) and restores the
# best weights seen once 10 epochs pass without improvement.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)
autoencoder.fit(
    X_minority_scaled,
    X_minority_scaled,
    epochs=2000,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

# The encoder shares the trained layers of the autoencoder up to the bottleneck.
encoder = Model(inputs, latent)
X_minority_reduced = encoder.predict(X_minority_scaled)

# Step 3: Generate synthetic data in latent space with Forest Diffusion
# The model is given the latent codes of the minority rows, with no labels.
forest_options = {
    'label_y': None,
    'n_t': 50,
    'duplicate_K': 200,
    'diffusion_type': 'flow',
    'n_jobs': -1,
}
forest_model = ForestDiffusionModel(X_minority_reduced, **forest_options)

# Request as many synthetic rows as there are minority rows.
num_samples_to_generate = X_minority.shape[0]
generated_samples_reduced = forest_model.generate(
    batch_size=num_samples_to_generate
)

# Step 4: Decode the generated samples and reverse scaling
# Build the standalone decoder from the *trained* autoencoder layers.
# Instantiating fresh Dense layers here would yield a randomly initialised
# network, so the decoded samples would be unrelated to the latent space the
# autoencoder actually learned.
decoder_input = Input(shape=(latent_dim,))
decoded_layer = decoder_input
# The last three layers of `autoencoder` form its decoder stack
# (Dense(64) -> Dense(128) -> Dense(input_dim)); calling them on a new input
# tensor reuses their trained weights.
for trained_layer in autoencoder.layers[-3:]:
    decoded_layer = trained_layer(decoded_layer)
outputs = decoded_layer
decoder = Model(decoder_input, outputs)

# Map latent samples to standardized feature space, then undo the scaling so
# the samples are expressed in the original feature units.
generated_samples = decoder.predict(generated_samples_reduced)
generated_samples = scaler.inverse_transform(generated_samples)

# Step 5: Clip extremes, convert integer columns
def clip_extremes(X, X_min, X_max):
    """Bound every entry of ``X`` to the interval ``[X_min, X_max]``.

    Args:
        X: Array-like of values to limit.
        X_min: Lower bound(s); passed to ``np.clip`` as the minimum.
        X_max: Upper bound(s); passed to ``np.clip`` as the maximum.

    Returns:
        The result of ``np.clip(X, X_min, X_max)``.
    """
    bounded = np.clip(X, X_min, X_max)
    return bounded

# Per-feature bounds observed in the real minority rows; synthetic values are
# limited to this range.
X_min = X_minority.min(axis=0)
X_max = X_minority.max(axis=0)
adjusted_samples = clip_extremes(generated_samples, X_min, X_max)

# Wrap the clipped samples in a frame with the original feature names, then
# round the integer-typed features and cast them back to int.
adjusted_samples_df = pd.DataFrame(data=adjusted_samples, columns=X.columns)
rounded_integer_part = adjusted_samples_df[integer_columns].round()
adjusted_samples_df[integer_columns] = rounded_integer_part.astype(int)

# Step 6: Combine data and save
# Append the post-processed synthetic rows (adjusted_samples_df, whose integer
# columns were rounded in Step 5) rather than the raw `adjusted_samples`
# array; stacking the raw array would discard that rounding. Concatenating
# frames instead of np.vstack also keeps each column's dtype, so integer
# features are not written out as floats.
X_final_df = pd.concat([X, adjusted_samples_df], ignore_index=True)
X_final = X_final_df.values

# Every synthetic row belongs to the class with target value 1.
y_final = np.concatenate((y, np.ones(len(adjusted_samples_df))))
y_final_df = pd.Series(y_final, name='target')

# Both parts carry a fresh 0..n-1 index, so the column-wise concat lines up
# features and targets row by row.
final_data = pd.concat([X_final_df, y_final_df], axis=1)
final_data.to_csv('generated_oil_spill_data.csv', index=False)


Epoch 1/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.9846
Epoch 2/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.9715
Epoch 3/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9393
Epoch 4/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9121
Epoch 5/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.9487 
Epoch 6/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.9296
Epoch 7/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.9671
Epoch 8/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.9595   
Epoch 9/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.9735 
Epoch 10/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9566  
Ep